Add prebuilt llvm-rs-cc for older versions

Test: make ENABLE_RSTESTFORWARD RSTestForward
All the RSTestForward_#_#_#.apk files build successfully using these
prebuilts.

Change-Id: Ie554a3c03d74b4118743014181ce42c1370cb8d3
diff --git a/21.1.2/bin/llvm-rs-cc b/21.1.2/bin/llvm-rs-cc
new file mode 100755
index 0000000..1d47804
--- /dev/null
+++ b/21.1.2/bin/llvm-rs-cc
Binary files differ
diff --git a/21.1.2/clang-include/CMakeLists.txt b/21.1.2/clang-include/CMakeLists.txt
new file mode 100644
index 0000000..edee7d7
--- /dev/null
+++ b/21.1.2/clang-include/CMakeLists.txt
@@ -0,0 +1,81 @@
+set(files
+  altivec.h
+  ammintrin.h
+  arm_acle.h
+  avxintrin.h
+  avx2intrin.h
+  bmiintrin.h
+  bmi2intrin.h
+  emmintrin.h
+  f16cintrin.h
+  float.h
+  fma4intrin.h
+  fmaintrin.h
+  ia32intrin.h
+  immintrin.h
+  iso646.h
+  Intrin.h
+  limits.h
+  lzcntintrin.h
+  mm3dnow.h
+  mmintrin.h
+  mm_malloc.h
+  nmmintrin.h
+  pmmintrin.h
+  popcntintrin.h
+  prfchwintrin.h
+  rdseedintrin.h
+  rtmintrin.h
+  shaintrin.h
+  smmintrin.h
+  stdalign.h
+  stdarg.h
+  stdbool.h
+  stddef.h
+  stdint.h
+  stdnoreturn.h
+  tbmintrin.h
+  tgmath.h
+  tmmintrin.h
+  varargs.h
+  wmmintrin.h
+  __wmmintrin_aes.h
+  __wmmintrin_pclmul.h
+  x86intrin.h
+  xmmintrin.h
+  xopintrin.h
+  cpuid.h
+  unwind.h
+  module.modulemap
+  )
+
+set(output_dir ${LLVM_LIBRARY_OUTPUT_INTDIR}/clang/${CLANG_VERSION}/include)
+
+# Generate arm_neon.h
+clang_tablegen(arm_neon.h -gen-arm-neon
+  SOURCE ${CLANG_SOURCE_DIR}/include/clang/Basic/arm_neon.td)
+
+set(out_files)
+foreach( f ${files} )
+  set( src ${CMAKE_CURRENT_SOURCE_DIR}/${f} )
+  set( dst ${output_dir}/${f} )
+  add_custom_command(OUTPUT ${dst}
+    DEPENDS ${src}
+    COMMAND ${CMAKE_COMMAND} -E copy_if_different ${src} ${dst}
+    COMMENT "Copying clang's ${f}...")
+  list(APPEND out_files ${dst})
+endforeach( f )
+
+add_custom_command(OUTPUT ${output_dir}/arm_neon.h 
+  DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/arm_neon.h
+  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_BINARY_DIR}/arm_neon.h ${output_dir}/arm_neon.h
+  COMMENT "Copying clang's arm_neon.h...")
+list(APPEND out_files ${output_dir}/arm_neon.h)
+
+add_custom_target(clang-headers ALL DEPENDS ${out_files})
+set_target_properties(clang-headers PROPERTIES FOLDER "Misc")
+
+install(
+  FILES ${files} ${CMAKE_CURRENT_BINARY_DIR}/arm_neon.h
+  PERMISSIONS OWNER_READ OWNER_WRITE GROUP_READ WORLD_READ
+  DESTINATION lib${LLVM_LIBDIR_SUFFIX}/clang/${CLANG_VERSION}/include)
diff --git a/21.1.2/clang-include/Intrin.h b/21.1.2/clang-include/Intrin.h
new file mode 100644
index 0000000..13e105e
--- /dev/null
+++ b/21.1.2/clang-include/Intrin.h
@@ -0,0 +1,1014 @@
+/* ===-------- Intrin.h ---------------------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/* Only include this if we're compiling for the windows platform. */
+#ifndef _MSC_VER
+#include_next <Intrin.h>
+#else
+
+#ifndef __INTRIN_H
+#define __INTRIN_H
+
+/* First include the standard intrinsics. */
+#if defined(__i386__) || defined(__x86_64__)
+#include <x86intrin.h>
+#endif
+
+/* For the definition of jmp_buf. */
+#if __STDC_HOSTED__
+#include <setjmp.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(__MMX__)
+/* And the random ones that aren't in those files. */
+__m64 _m_from_float(float);
+__m64 _m_from_int(int _l);
+void _m_prefetch(void *);
+float _m_to_float(__m64);
+int _m_to_int(__m64 _M);
+#endif
+
+/* Other assorted instruction intrinsics. */
+void __addfsbyte(unsigned long, unsigned char);
+void __addfsdword(unsigned long, unsigned long);
+void __addfsword(unsigned long, unsigned short);
+void __code_seg(const char *);
+static __inline__
+void __cpuid(int[4], int);
+static __inline__
+void __cpuidex(int[4], int, int);
+void __debugbreak(void);
+__int64 __emul(int, int);
+unsigned __int64 __emulu(unsigned int, unsigned int);
+void __cdecl __fastfail(unsigned int);
+unsigned int __getcallerseflags(void);
+static __inline__
+void __halt(void);
+unsigned char __inbyte(unsigned short);
+void __inbytestring(unsigned short, unsigned char *, unsigned long);
+void __incfsbyte(unsigned long);
+void __incfsdword(unsigned long);
+void __incfsword(unsigned long);
+unsigned long __indword(unsigned short);
+void __indwordstring(unsigned short, unsigned long *, unsigned long);
+void __int2c(void);
+void __invlpg(void *);
+unsigned short __inword(unsigned short);
+void __inwordstring(unsigned short, unsigned short *, unsigned long);
+void __lidt(void *);
+unsigned __int64 __ll_lshift(unsigned __int64, int);
+__int64 __ll_rshift(__int64, int);
+void __llwpcb(void *);
+unsigned char __lwpins32(unsigned int, unsigned int, unsigned int);
+void __lwpval32(unsigned int, unsigned int, unsigned int);
+unsigned int __lzcnt(unsigned int);
+unsigned short __lzcnt16(unsigned short);
+static __inline__
+void __movsb(unsigned char *, unsigned char const *, size_t);
+static __inline__
+void __movsd(unsigned long *, unsigned long const *, size_t);
+static __inline__
+void __movsw(unsigned short *, unsigned short const *, size_t);
+void __nop(void);
+void __nvreg_restore_fence(void);
+void __nvreg_save_fence(void);
+void __outbyte(unsigned short, unsigned char);
+void __outbytestring(unsigned short, unsigned char *, unsigned long);
+void __outdword(unsigned short, unsigned long);
+void __outdwordstring(unsigned short, unsigned long *, unsigned long);
+void __outword(unsigned short, unsigned short);
+void __outwordstring(unsigned short, unsigned short *, unsigned long);
+static __inline__
+unsigned int __popcnt(unsigned int);
+static __inline__
+unsigned short __popcnt16(unsigned short);
+unsigned long __readcr0(void);
+unsigned long __readcr2(void);
+static __inline__
+unsigned long __readcr3(void);
+unsigned long __readcr4(void);
+unsigned long __readcr8(void);
+unsigned int __readdr(unsigned int);
+#ifdef __i386__
+static __inline__
+unsigned char __readfsbyte(unsigned long);
+static __inline__
+unsigned long __readfsdword(unsigned long);
+static __inline__
+unsigned __int64 __readfsqword(unsigned long);
+static __inline__
+unsigned short __readfsword(unsigned long);
+#endif
+static __inline__
+unsigned __int64 __readmsr(unsigned long);
+unsigned __int64 __readpmc(unsigned long);
+unsigned long __segmentlimit(unsigned long);
+void __sidt(void *);
+void *__slwpcb(void);
+static __inline__
+void __stosb(unsigned char *, unsigned char, size_t);
+static __inline__
+void __stosd(unsigned long *, unsigned long, size_t);
+static __inline__
+void __stosw(unsigned short *, unsigned short, size_t);
+void __svm_clgi(void);
+void __svm_invlpga(void *, int);
+void __svm_skinit(int);
+void __svm_stgi(void);
+void __svm_vmload(size_t);
+void __svm_vmrun(size_t);
+void __svm_vmsave(size_t);
+void __ud2(void);
+unsigned __int64 __ull_rshift(unsigned __int64, int);
+void __vmx_off(void);
+void __vmx_vmptrst(unsigned __int64 *);
+void __wbinvd(void);
+void __writecr0(unsigned int);
+static __inline__
+void __writecr3(unsigned int);
+void __writecr4(unsigned int);
+void __writecr8(unsigned int);
+void __writedr(unsigned int, unsigned int);
+void __writefsbyte(unsigned long, unsigned char);
+void __writefsdword(unsigned long, unsigned long);
+void __writefsqword(unsigned long, unsigned __int64);
+void __writefsword(unsigned long, unsigned short);
+void __writemsr(unsigned long, unsigned __int64);
+static __inline__
+void *_AddressOfReturnAddress(void);
+unsigned int _andn_u32(unsigned int, unsigned int);
+unsigned int _bextr_u32(unsigned int, unsigned int, unsigned int);
+unsigned int _bextri_u32(unsigned int, unsigned int);
+static __inline__
+unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask);
+static __inline__
+unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask);
+static __inline__
+unsigned char _bittest(long const *, long);
+static __inline__
+unsigned char _bittestandcomplement(long *, long);
+static __inline__
+unsigned char _bittestandreset(long *, long);
+static __inline__
+unsigned char _bittestandset(long *, long);
+unsigned int _blcfill_u32(unsigned int);
+unsigned int _blci_u32(unsigned int);
+unsigned int _blcic_u32(unsigned int);
+unsigned int _blcmsk_u32(unsigned int);
+unsigned int _blcs_u32(unsigned int);
+unsigned int _blsfill_u32(unsigned int);
+unsigned int _blsi_u32(unsigned int);
+unsigned int _blsic_u32(unsigned int);
+unsigned int _blsmsk_u32(unsigned int);
+unsigned int _blsr_u32(unsigned int);
+unsigned __int64 __cdecl _byteswap_uint64(unsigned __int64);
+unsigned long __cdecl _byteswap_ulong(unsigned long);
+unsigned short __cdecl _byteswap_ushort(unsigned short);
+unsigned _bzhi_u32(unsigned int, unsigned int);
+void __cdecl _disable(void);
+void __cdecl _enable(void);
+void __cdecl _fxrstor(void const *);
+void __cdecl _fxsave(void *);
+long _InterlockedAddLargeStatistic(__int64 volatile *_Addend, long _Value);
+static __inline__
+long _InterlockedAnd(long volatile *_Value, long _Mask);
+static __inline__
+short _InterlockedAnd16(short volatile *_Value, short _Mask);
+static __inline__
+char _InterlockedAnd8(char volatile *_Value, char _Mask);
+unsigned char _interlockedbittestandreset(long volatile *, long);
+static __inline__
+unsigned char _interlockedbittestandset(long volatile *, long);
+static __inline__
+long __cdecl _InterlockedCompareExchange(long volatile *_Destination,
+                                         long _Exchange, long _Comparand);
+long _InterlockedCompareExchange_HLEAcquire(long volatile *, long, long);
+long _InterlockedCompareExchange_HLERelease(long volatile *, long, long);
+static __inline__
+short _InterlockedCompareExchange16(short volatile *_Destination,
+                                    short _Exchange, short _Comparand);
+static __inline__
+__int64 _InterlockedCompareExchange64(__int64 volatile *_Destination,
+                                      __int64 _Exchange, __int64 _Comparand);
+__int64 _InterlockedCompareExchange64_HLEAcquire(__int64 volatile *, __int64,
+                                                 __int64);
+__int64 _InterlockedCompareExchange64_HLERelease(__int64 volatile *, __int64,
+                                                 __int64);
+static __inline__
+char _InterlockedCompareExchange8(char volatile *_Destination, char _Exchange,
+                                  char _Comparand);
+void *_InterlockedCompareExchangePointer_HLEAcquire(void *volatile *, void *,
+                                                    void *);
+void *_InterlockedCompareExchangePointer_HLERelease(void *volatile *, void *,
+                                                    void *);
+static __inline__
+long __cdecl _InterlockedDecrement(long volatile *_Addend);
+static __inline__
+short _InterlockedDecrement16(short volatile *_Addend);
+long _InterlockedExchange(long volatile *_Target, long _Value);
+static __inline__
+short _InterlockedExchange16(short volatile *_Target, short _Value);
+static __inline__
+char _InterlockedExchange8(char volatile *_Target, char _Value);
+static __inline__
+long __cdecl _InterlockedExchangeAdd(long volatile *_Addend, long _Value);
+long _InterlockedExchangeAdd_HLEAcquire(long volatile *, long);
+long _InterlockedExchangeAdd_HLERelease(long volatile *, long);
+static __inline__
+short _InterlockedExchangeAdd16(short volatile *_Addend, short _Value);
+__int64 _InterlockedExchangeAdd64_HLEAcquire(__int64 volatile *, __int64);
+__int64 _InterlockedExchangeAdd64_HLERelease(__int64 volatile *, __int64);
+static __inline__
+char _InterlockedExchangeAdd8(char volatile *_Addend, char _Value);
+static __inline__
+long __cdecl _InterlockedIncrement(long volatile *_Addend);
+static __inline__
+short _InterlockedIncrement16(short volatile *_Addend);
+static __inline__
+long _InterlockedOr(long volatile *_Value, long _Mask);
+static __inline__
+short _InterlockedOr16(short volatile *_Value, short _Mask);
+static __inline__
+char _InterlockedOr8(char volatile *_Value, char _Mask);
+static __inline__
+long _InterlockedXor(long volatile *_Value, long _Mask);
+static __inline__
+short _InterlockedXor16(short volatile *_Value, short _Mask);
+static __inline__
+char _InterlockedXor8(char volatile *_Value, char _Mask);
+void __cdecl _invpcid(unsigned int, void *);
+static __inline__
+unsigned long __cdecl _lrotl(unsigned long, int);
+static __inline__
+unsigned long __cdecl _lrotr(unsigned long, int);
+static __inline__
+unsigned int _lzcnt_u32(unsigned int);
+static __inline__
+void _ReadBarrier(void);
+static __inline__
+void _ReadWriteBarrier(void);
+static __inline__
+void *_ReturnAddress(void);
+unsigned int _rorx_u32(unsigned int, const unsigned int);
+int __cdecl _rdrand16_step(unsigned short *);
+int __cdecl _rdrand32_step(unsigned int *);
+static __inline__
+unsigned int __cdecl _rotl(unsigned int _Value, int _Shift);
+static __inline__
+unsigned short _rotl16(unsigned short _Value, unsigned char _Shift);
+static __inline__
+unsigned __int64 __cdecl _rotl64(unsigned __int64 _Value, int _Shift);
+static __inline__
+unsigned char _rotl8(unsigned char _Value, unsigned char _Shift);
+static __inline__
+unsigned int __cdecl _rotr(unsigned int _Value, int _Shift);
+static __inline__
+unsigned short _rotr16(unsigned short _Value, unsigned char _Shift);
+static __inline__
+unsigned __int64 __cdecl _rotr64(unsigned __int64 _Value, int _Shift);
+static __inline__
+unsigned char _rotr8(unsigned char _Value, unsigned char _Shift);
+int _sarx_i32(int, unsigned int);
+#if __STDC_HOSTED__
+int __cdecl _setjmp(jmp_buf);
+#endif
+unsigned int _shlx_u32(unsigned int, unsigned int);
+unsigned int _shrx_u32(unsigned int, unsigned int);
+void _Store_HLERelease(long volatile *, long);
+void _Store64_HLERelease(__int64 volatile *, __int64);
+void _StorePointer_HLERelease(void *volatile *, void *);
+unsigned int _t1mskc_u32(unsigned int);
+unsigned int _tzcnt_u32(unsigned int);
+unsigned int _tzmsk_u32(unsigned int);
+static __inline__
+void _WriteBarrier(void);
+void _xabort(const unsigned int imm);
+unsigned __int32 _xbegin(void);
+void _xend(void);
+static __inline__
+unsigned __int64 __cdecl _xgetbv(unsigned int);
+void __cdecl _xrstor(void const *, unsigned __int64);
+void __cdecl _xsave(void *, unsigned __int64);
+void __cdecl _xsaveopt(void *, unsigned __int64);
+void __cdecl _xsetbv(unsigned int, unsigned __int64);
+unsigned char _xtest(void);
+
+/* These additional intrinsics are turned on in x64/amd64/x86_64 mode. */
+#ifdef __x86_64__
+void __addgsbyte(unsigned long, unsigned char);
+void __addgsdword(unsigned long, unsigned long);
+void __addgsqword(unsigned long, unsigned __int64);
+void __addgsword(unsigned long, unsigned short);
+static __inline__
+void __faststorefence(void);
+void __incgsbyte(unsigned long);
+void __incgsdword(unsigned long);
+void __incgsqword(unsigned long);
+void __incgsword(unsigned long);
+unsigned char __lwpins64(unsigned __int64, unsigned int, unsigned int);
+void __lwpval64(unsigned __int64, unsigned int, unsigned int);
+unsigned __int64 __lzcnt64(unsigned __int64);
+static __inline__
+void __movsq(unsigned long long *, unsigned long long const *, size_t);
+__int64 __mulh(__int64, __int64);
+static __inline__
+unsigned __int64 __popcnt64(unsigned __int64);
+static __inline__
+unsigned char __readgsbyte(unsigned long);
+static __inline__
+unsigned long __readgsdword(unsigned long);
+static __inline__
+unsigned __int64 __readgsqword(unsigned long);
+unsigned short __readgsword(unsigned long);
+unsigned __int64 __shiftleft128(unsigned __int64 _LowPart,
+                                unsigned __int64 _HighPart,
+                                unsigned char _Shift);
+unsigned __int64 __shiftright128(unsigned __int64 _LowPart,
+                                 unsigned __int64 _HighPart,
+                                 unsigned char _Shift);
+static __inline__
+void __stosq(unsigned __int64 *, unsigned __int64, size_t);
+unsigned __int64 __umulh(unsigned __int64, unsigned __int64);
+unsigned char __vmx_on(unsigned __int64 *);
+unsigned char __vmx_vmclear(unsigned __int64 *);
+unsigned char __vmx_vmlaunch(void);
+unsigned char __vmx_vmptrld(unsigned __int64 *);
+unsigned char __vmx_vmread(size_t, size_t *);
+unsigned char __vmx_vmresume(void);
+unsigned char __vmx_vmwrite(size_t, size_t);
+void __writegsbyte(unsigned long, unsigned char);
+void __writegsdword(unsigned long, unsigned long);
+void __writegsqword(unsigned long, unsigned __int64);
+void __writegsword(unsigned long, unsigned short);
+unsigned __int64 _andn_u64(unsigned __int64, unsigned __int64);
+unsigned __int64 _bextr_u64(unsigned __int64, unsigned int, unsigned int);
+unsigned __int64 _bextri_u64(unsigned __int64, unsigned int);
+static __inline__
+unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask);
+static __inline__
+unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask);
+static __inline__
+unsigned char _bittest64(__int64 const *, __int64);
+static __inline__
+unsigned char _bittestandcomplement64(__int64 *, __int64);
+static __inline__
+unsigned char _bittestandreset64(__int64 *, __int64);
+static __inline__
+unsigned char _bittestandset64(__int64 *, __int64);
+unsigned __int64 _blcfill_u64(unsigned __int64);
+unsigned __int64 _blci_u64(unsigned __int64);
+unsigned __int64 _blcic_u64(unsigned __int64);
+unsigned __int64 _blcmsk_u64(unsigned __int64);
+unsigned __int64 _blcs_u64(unsigned __int64);
+unsigned __int64 _blsfill_u64(unsigned __int64);
+unsigned __int64 _blsi_u64(unsigned __int64);
+unsigned __int64 _blsic_u64(unsigned __int64);
+unsigned __int64 _blsmsk_u64(unsigned __int64);
+unsigned __int64 _blsr_u64(unsigned __int64);
+unsigned __int64 __cdecl _byteswap_uint64(unsigned __int64);
+unsigned __int64 _bzhi_u64(unsigned __int64, unsigned int);
+void __cdecl _fxrstor64(void const *);
+void __cdecl _fxsave64(void *);
+long _InterlockedAnd_np(long volatile *_Value, long _Mask);
+short _InterlockedAnd16_np(short volatile *_Value, short _Mask);
+__int64 _InterlockedAnd64_np(__int64 volatile *_Value, __int64 _Mask);
+char _InterlockedAnd8_np(char volatile *_Value, char _Mask);
+unsigned char _interlockedbittestandreset64(__int64 volatile *, __int64);
+static __inline__
+unsigned char _interlockedbittestandset64(__int64 volatile *, __int64);
+long _InterlockedCompareExchange_np(long volatile *_Destination, long _Exchange,
+                                    long _Comparand);
+unsigned char _InterlockedCompareExchange128(__int64 volatile *_Destination,
+                                             __int64 _ExchangeHigh,
+                                             __int64 _ExchangeLow,
+                                             __int64 *_ComparandResult);
+unsigned char _InterlockedCompareExchange128_np(__int64 volatile *_Destination,
+                                                __int64 _ExchangeHigh,
+                                                __int64 _ExchangeLow,
+                                                __int64 *_ComparandResult);
+short _InterlockedCompareExchange16_np(short volatile *_Destination,
+                                       short _Exchange, short _Comparand);
+__int64 _InterlockedCompareExchange64_HLEAcquire(__int64 volatile *, __int64,
+                                                 __int64);
+__int64 _InterlockedCompareExchange64_HLERelease(__int64 volatile *, __int64,
+                                                 __int64);
+__int64 _InterlockedCompareExchange64_np(__int64 volatile *_Destination,
+                                         __int64 _Exchange, __int64 _Comparand);
+void *_InterlockedCompareExchangePointer(void *volatile *_Destination,
+                                         void *_Exchange, void *_Comparand);
+void *_InterlockedCompareExchangePointer_np(void *volatile *_Destination,
+                                            void *_Exchange, void *_Comparand);
+static __inline__
+__int64 _InterlockedDecrement64(__int64 volatile *_Addend);
+static __inline__
+__int64 _InterlockedExchange64(__int64 volatile *_Target, __int64 _Value);
+static __inline__
+__int64 _InterlockedExchangeAdd64(__int64 volatile *_Addend, __int64 _Value);
+void *_InterlockedExchangePointer(void *volatile *_Target, void *_Value);
+static __inline__
+__int64 _InterlockedIncrement64(__int64 volatile *_Addend);
+long _InterlockedOr_np(long volatile *_Value, long _Mask);
+short _InterlockedOr16_np(short volatile *_Value, short _Mask);
+static __inline__
+__int64 _InterlockedOr64(__int64 volatile *_Value, __int64 _Mask);
+__int64 _InterlockedOr64_np(__int64 volatile *_Value, __int64 _Mask);
+char _InterlockedOr8_np(char volatile *_Value, char _Mask);
+long _InterlockedXor_np(long volatile *_Value, long _Mask);
+short _InterlockedXor16_np(short volatile *_Value, short _Mask);
+static __inline__
+__int64 _InterlockedXor64(__int64 volatile *_Value, __int64 _Mask);
+__int64 _InterlockedXor64_np(__int64 volatile *_Value, __int64 _Mask);
+char _InterlockedXor8_np(char volatile *_Value, char _Mask);
+static __inline__
+unsigned __int64 _lzcnt_u64(unsigned __int64);
+__int64 _mul128(__int64 _Multiplier, __int64 _Multiplicand,
+                __int64 *_HighProduct);
+unsigned int __cdecl _readfsbase_u32(void);
+unsigned __int64 __cdecl _readfsbase_u64(void);
+unsigned int __cdecl _readgsbase_u32(void);
+unsigned __int64 __cdecl _readgsbase_u64(void);
+unsigned __int64 _rorx_u64(unsigned __int64, const unsigned int);
+__int64 _sarx_i64(__int64, unsigned int);
+#if __STDC_HOSTED__
+int __cdecl _setjmpex(jmp_buf);
+#endif
+unsigned __int64 _shlx_u64(unsigned __int64, unsigned int);
+unsigned __int64 _shrx_u64(unsigned __int64, unsigned int);
+unsigned __int64 _tzcnt_u64(unsigned __int64);
+unsigned __int64 _tzmsk_u64(unsigned __int64);
+unsigned __int64 _umul128(unsigned __int64 _Multiplier,
+                          unsigned __int64 _Multiplicand,
+                          unsigned __int64 *_HighProduct);
+void __cdecl _writefsbase_u32(unsigned int);
+void __cdecl _writefsbase_u64(unsigned __int64);
+void __cdecl _writegsbase_u32(unsigned int);
+void __cdecl _writegsbase_u64(unsigned __int64);
+void __cdecl _xrstor64(void const *, unsigned __int64);
+void __cdecl _xsave64(void *, unsigned __int64);
+void __cdecl _xsaveopt64(void *, unsigned __int64);
+
+#endif /* __x86_64__ */
+
+/*----------------------------------------------------------------------------*\
+|* Bit Twiddling
+\*----------------------------------------------------------------------------*/
+static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
+_rotl8(unsigned char _Value, unsigned char _Shift) {
+  _Shift &= 0x7;
+  return _Shift ? (_Value << _Shift) | (_Value >> (8 - _Shift)) : _Value;
+}
+static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
+_rotr8(unsigned char _Value, unsigned char _Shift) {
+  _Shift &= 0x7;
+  return _Shift ? (_Value >> _Shift) | (_Value << (8 - _Shift)) : _Value;
+}
+static __inline__ unsigned short __attribute__((__always_inline__, __nodebug__))
+_rotl16(unsigned short _Value, unsigned char _Shift) {
+  _Shift &= 0xf;
+  return _Shift ? (_Value << _Shift) | (_Value >> (16 - _Shift)) : _Value;
+}
+static __inline__ unsigned short __attribute__((__always_inline__, __nodebug__))
+_rotr16(unsigned short _Value, unsigned char _Shift) {
+  _Shift &= 0xf;
+  return _Shift ? (_Value >> _Shift) | (_Value << (16 - _Shift)) : _Value;
+}
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+_rotl(unsigned int _Value, int _Shift) {
+  _Shift &= 0x1f;
+  return _Shift ? (_Value << _Shift) | (_Value >> (32 - _Shift)) : _Value;
+}
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+_rotr(unsigned int _Value, int _Shift) {
+  _Shift &= 0x1f;
+  return _Shift ? (_Value >> _Shift) | (_Value << (32 - _Shift)) : _Value;
+}
+static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
+_lrotl(unsigned long _Value, int _Shift) {
+  _Shift &= 0x1f;
+  return _Shift ? (_Value << _Shift) | (_Value >> (32 - _Shift)) : _Value;
+}
+static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
+_lrotr(unsigned long _Value, int _Shift) {
+  _Shift &= 0x1f;
+  return _Shift ? (_Value >> _Shift) | (_Value << (32 - _Shift)) : _Value;
+}
+static
+__inline__ unsigned __int64 __attribute__((__always_inline__, __nodebug__))
+_rotl64(unsigned __int64 _Value, int _Shift) {
+  _Shift &= 0x3f;
+  return _Shift ? (_Value << _Shift) | (_Value >> (64 - _Shift)) : _Value;
+}
+static
+__inline__ unsigned __int64 __attribute__((__always_inline__, __nodebug__))
+_rotr64(unsigned __int64 _Value, int _Shift) {
+  _Shift &= 0x3f;
+  return _Shift ? (_Value >> _Shift) | (_Value << (64 - _Shift)) : _Value;
+}
+/*----------------------------------------------------------------------------*\
+|* Bit Counting and Testing
+\*----------------------------------------------------------------------------*/
+static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
+_BitScanForward(unsigned long *_Index, unsigned long _Mask) {
+  if (!_Mask)
+    return 0;
+  *_Index = __builtin_ctzl(_Mask);
+  return 1;
+}
+static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
+_BitScanReverse(unsigned long *_Index, unsigned long _Mask) {
+  if (!_Mask)
+    return 0;
+  *_Index = 31 - __builtin_clzl(_Mask);
+  return 1;
+}
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+_lzcnt_u32(unsigned int a) {
+  if (!a)
+    return 32;
+  return __builtin_clzl(a);
+}
+static __inline__ unsigned short __attribute__((__always_inline__, __nodebug__))
+__popcnt16(unsigned short value) {
+  return __builtin_popcount((int)value);
+}
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+__popcnt(unsigned int value) {
+  return __builtin_popcount(value);
+}
+static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
+_bittest(long const *a, long b) {
+  return (*a >> b) & 1;
+}
+static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
+_bittestandcomplement(long *a, long b) {
+  unsigned char x = (*a >> b) & 1;
+  *a = *a ^ (1 << b);
+  return x;
+}
+static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
+_bittestandreset(long *a, long b) {
+  unsigned char x = (*a >> b) & 1;
+  *a = *a & ~(1 << b);
+  return x;
+}
+static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
+_bittestandset(long *a, long b) {
+  unsigned char x = (*a >> b) & 1;
+  *a = *a | (1 << b);
+  return x;
+}
+#if defined(__i386__) || defined(__x86_64__)
+static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
+_interlockedbittestandset(long volatile *__BitBase, long __BitPos) {
+  unsigned char __Res;
+  __asm__ ("xor %0, %0\n"
+           "lock bts %2, %1\n"
+           "setc %0\n"
+           : "=r" (__Res), "+m"(*__BitBase)
+           : "Ir"(__BitPos));
+  return __Res;
+}
+#endif
+#ifdef __x86_64__
+static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
+_BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask) {
+  if (!_Mask)
+    return 0;
+  *_Index = __builtin_ctzll(_Mask);
+  return 1;
+}
+static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
+_BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask) {
+  if (!_Mask)
+    return 0;
+  *_Index = 63 - __builtin_clzll(_Mask);
+  return 1;
+}
+static
+__inline__ unsigned __int64 __attribute__((__always_inline__, __nodebug__))
+_lzcnt_u64(unsigned __int64 a) {
+  if (!a)
+    return 64;
+  return __builtin_clzll(a);
+}
+static __inline__
+unsigned __int64 __attribute__((__always_inline__, __nodebug__))
+__popcnt64(unsigned __int64 value) {
+  return __builtin_popcountll(value);
+}
+static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
+_bittest64(__int64 const *a, __int64 b) {
+  return (*a >> b) & 1;
+}
+static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
+_bittestandcomplement64(__int64 *a, __int64 b) {
+  unsigned char x = (*a >> b) & 1;
+  *a = *a ^ (1ll << b);
+  return x;
+}
+static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
+_bittestandreset64(__int64 *a, __int64 b) {
+  unsigned char x = (*a >> b) & 1;
+  *a = *a & ~(1ll << b);
+  return x;
+}
+static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
+_bittestandset64(__int64 *a, __int64 b) {
+  unsigned char x = (*a >> b) & 1;
+  *a = *a | (1ll << b);
+  return x;
+}
+static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
+_interlockedbittestandset64(__int64 volatile *__BitBase, __int64 __BitPos) {
+  unsigned char __Res;
+  __asm__ ("xor %0, %0\n"
+           "lock bts %2, %1\n"
+           "setc %0\n"
+           : "=r" (__Res), "+m"(*__BitBase)
+           : "Ir"(__BitPos));
+  return __Res;
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Exchange Add
+\*----------------------------------------------------------------------------*/
+static __inline__ char __attribute__((__always_inline__, __nodebug__))
+_InterlockedExchangeAdd8(char volatile *_Addend, char _Value) {
+  return __atomic_add_fetch(_Addend, _Value, 0) - _Value;
+}
+static __inline__ short __attribute__((__always_inline__, __nodebug__))
+_InterlockedExchangeAdd16(short volatile *_Addend, short _Value) {
+  return __atomic_add_fetch(_Addend, _Value, 0) - _Value;
+}
+#ifdef __x86_64__
+static __inline__ __int64 __attribute__((__always_inline__, __nodebug__))
+_InterlockedExchangeAdd64(__int64 volatile *_Addend, __int64 _Value) {
+  return __atomic_add_fetch(_Addend, _Value, 0) - _Value;
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Exchange Sub
+\*----------------------------------------------------------------------------*/
+static __inline__ char __attribute__((__always_inline__, __nodebug__))
+_InterlockedExchangeSub8(char volatile *_Subend, char _Value) {
+  return __atomic_sub_fetch(_Subend, _Value, 0) + _Value;
+}
+static __inline__ short __attribute__((__always_inline__, __nodebug__))
+_InterlockedExchangeSub16(short volatile *_Subend, short _Value) {
+  return __atomic_sub_fetch(_Subend, _Value, 0) + _Value;
+}
+static __inline__ long __attribute__((__always_inline__, __nodebug__))
+_InterlockedExchangeSub(long volatile *_Subend, long _Value) {
+  return __atomic_sub_fetch(_Subend, _Value, 0) + _Value;
+}
+#ifdef __x86_64__
+static __inline__ __int64 __attribute__((__always_inline__, __nodebug__))
+_InterlockedExchangeSub64(__int64 volatile *_Subend, __int64 _Value) {
+  return __atomic_sub_fetch(_Subend, _Value, 0) + _Value;
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Increment
+\*----------------------------------------------------------------------------*/
+static __inline__ short __attribute__((__always_inline__, __nodebug__))
+_InterlockedIncrement16(short volatile *_Value) {
+  return __atomic_add_fetch(_Value, 1, 0);
+}
+#ifdef __x86_64__
+static __inline__ __int64 __attribute__((__always_inline__, __nodebug__))
+_InterlockedIncrement64(__int64 volatile *_Value) {
+  return __atomic_add_fetch(_Value, 1, 0);
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Decrement
+\*----------------------------------------------------------------------------*/
+static __inline__ short __attribute__((__always_inline__, __nodebug__))
+_InterlockedDecrement16(short volatile *_Value) {
+  return __atomic_sub_fetch(_Value, 1, 0);
+}
+#ifdef __x86_64__
+static __inline__ __int64 __attribute__((__always_inline__, __nodebug__))
+_InterlockedDecrement64(__int64 volatile *_Value) {
+  return __atomic_sub_fetch(_Value, 1, 0);
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked And
+\*----------------------------------------------------------------------------*/
+static __inline__ char __attribute__((__always_inline__, __nodebug__))
+_InterlockedAnd8(char volatile *_Value, char _Mask) {
+  return __atomic_and_fetch(_Value, _Mask, 0);
+}
+static __inline__ short __attribute__((__always_inline__, __nodebug__))
+_InterlockedAnd16(short volatile *_Value, short _Mask) {
+  return __atomic_and_fetch(_Value, _Mask, 0);
+}
+static __inline__ long __attribute__((__always_inline__, __nodebug__))
+_InterlockedAnd(long volatile *_Value, long _Mask) {
+  return __atomic_and_fetch(_Value, _Mask, 0);
+}
+#ifdef __x86_64__
+static __inline__ __int64 __attribute__((__always_inline__, __nodebug__))
+_InterlockedAnd64(__int64 volatile *_Value, __int64 _Mask) {
+  return __atomic_and_fetch(_Value, _Mask, 0);
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Or
+\*----------------------------------------------------------------------------*/
+static __inline__ char __attribute__((__always_inline__, __nodebug__))
+_InterlockedOr8(char volatile *_Value, char _Mask) {
+  return __atomic_or_fetch(_Value, _Mask, 0);
+}
+static __inline__ short __attribute__((__always_inline__, __nodebug__))
+_InterlockedOr16(short volatile *_Value, short _Mask) {
+  return __atomic_or_fetch(_Value, _Mask, 0);
+}
+static __inline__ long __attribute__((__always_inline__, __nodebug__))
+_InterlockedOr(long volatile *_Value, long _Mask) {
+  return __atomic_or_fetch(_Value, _Mask, 0);
+}
+#ifdef __x86_64__
+static __inline__ __int64 __attribute__((__always_inline__, __nodebug__))
+_InterlockedOr64(__int64 volatile *_Value, __int64 _Mask) {
+  return __atomic_or_fetch(_Value, _Mask, 0);
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Xor
+\*----------------------------------------------------------------------------*/
+static __inline__ char __attribute__((__always_inline__, __nodebug__))
+_InterlockedXor8(char volatile *_Value, char _Mask) {
+  return __atomic_xor_fetch(_Value, _Mask, 0);
+}
+static __inline__ short __attribute__((__always_inline__, __nodebug__))
+_InterlockedXor16(short volatile *_Value, short _Mask) {
+  return __atomic_xor_fetch(_Value, _Mask, 0);
+}
+static __inline__ long __attribute__((__always_inline__, __nodebug__))
+_InterlockedXor(long volatile *_Value, long _Mask) {
+  return __atomic_xor_fetch(_Value, _Mask, 0);
+}
+#ifdef __x86_64__
+static __inline__ __int64 __attribute__((__always_inline__, __nodebug__))
+_InterlockedXor64(__int64 volatile *_Value, __int64 _Mask) {
+  return __atomic_xor_fetch(_Value, _Mask, 0);
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Exchange
+\*----------------------------------------------------------------------------*/
+static __inline__ char __attribute__((__always_inline__, __nodebug__))
+_InterlockedExchange8(char volatile *_Target, char _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, 0);
+  return _Value;
+}
+static __inline__ short __attribute__((__always_inline__, __nodebug__))
+_InterlockedExchange16(short volatile *_Target, short _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, 0);
+  return _Value;
+}
+#ifdef __x86_64__
+static __inline__ __int64 __attribute__((__always_inline__, __nodebug__))
+_InterlockedExchange64(__int64 volatile *_Target, __int64 _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, 0);
+  return _Value;
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Compare Exchange
+\*----------------------------------------------------------------------------*/
+static __inline__ char __attribute__((__always_inline__, __nodebug__))
+_InterlockedCompareExchange8(char volatile *_Destination,
+                             char _Exchange, char _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0, 0, 0);
+  return _Comparand;
+}
+static __inline__ short __attribute__((__always_inline__, __nodebug__))
+_InterlockedCompareExchange16(short volatile *_Destination,
+                              short _Exchange, short _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0, 0, 0);
+  return _Comparand;
+}
+static __inline__ __int64 __attribute__((__always_inline__, __nodebug__))
+_InterlockedCompareExchange64(__int64 volatile *_Destination,
+                              __int64 _Exchange, __int64 _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0, 0, 0);
+  return _Comparand;
+}
+/*----------------------------------------------------------------------------*\
+|* Barriers
+\*----------------------------------------------------------------------------*/
+#if defined(__i386__) || defined(__x86_64__)
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+__attribute__((deprecated("use other intrinsics or C++11 atomics instead")))
+_ReadWriteBarrier(void) {
+  __asm__ volatile ("" : : : "memory");
+}
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+__attribute__((deprecated("use other intrinsics or C++11 atomics instead")))
+_ReadBarrier(void) {
+  __asm__ volatile ("" : : : "memory");
+}
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+__attribute__((deprecated("use other intrinsics or C++11 atomics instead")))
+_WriteBarrier(void) {
+  __asm__ volatile ("" : : : "memory");
+}
+#endif
+#ifdef __x86_64__
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+__faststorefence(void) {
+  __asm__ volatile("lock orq $0, (%%rsp)" : : : "memory");
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* readfs, readgs
+|* (Pointers in address space #256 and #257 are relative to the GS and FS
+|* segment registers, respectively.)
+\*----------------------------------------------------------------------------*/
+#define __ptr_to_addr_space(__addr_space_nbr, __type, __offset)              \
+    ((volatile __type __attribute__((__address_space__(__addr_space_nbr)))*) \
+    (__offset))
+
+#ifdef __i386__
+static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
+__readfsbyte(unsigned long __offset) {
+  return *__ptr_to_addr_space(257, unsigned char, __offset);
+}
+static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
+__readfsdword(unsigned long __offset) {
+  return *__ptr_to_addr_space(257, unsigned long, __offset);
+}
+static __inline__ unsigned __int64 __attribute__((__always_inline__, __nodebug__))
+__readfsqword(unsigned long __offset) {
+  return *__ptr_to_addr_space(257, unsigned __int64, __offset);
+}
+static __inline__ unsigned short __attribute__((__always_inline__, __nodebug__))
+__readfsword(unsigned long __offset) {
+  return *__ptr_to_addr_space(257, unsigned short, __offset);
+}
+#endif
+#ifdef __x86_64__
+static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
+__readgsbyte(unsigned long __offset) {
+  return *__ptr_to_addr_space(256, unsigned char, __offset);
+}
+static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
+__readgsdword(unsigned long __offset) {
+  return *__ptr_to_addr_space(256, unsigned long, __offset);
+}
+static __inline__ unsigned __int64 __attribute__((__always_inline__, __nodebug__))
+__readgsqword(unsigned long __offset) {
+  return *__ptr_to_addr_space(256, unsigned __int64, __offset);
+}
+static __inline__ unsigned short __attribute__((__always_inline__, __nodebug__))
+__readgsword(unsigned long __offset) {
+  return *__ptr_to_addr_space(256, unsigned short, __offset);
+}
+#endif
+#undef __ptr_to_addr_space
+/*----------------------------------------------------------------------------*\
+|* movs, stos
+\*----------------------------------------------------------------------------*/
+#if defined(__i386__) || defined(__x86_64__)
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+__movsb(unsigned char *__dst, unsigned char const *__src, size_t __n) {
+  __asm__("rep movsb" : : "D"(__dst), "S"(__src), "c"(__n)
+                        : "%edi", "%esi", "%ecx");
+}
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+__movsd(unsigned long *__dst, unsigned long const *__src, size_t __n) {
+  __asm__("rep movsl" : : "D"(__dst), "S"(__src), "c"(__n)
+                        : "%edi", "%esi", "%ecx");
+}
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+__movsw(unsigned short *__dst, unsigned short const *__src, size_t __n) {
+  __asm__("rep movsw" : : "D"(__dst), "S"(__src), "c"(__n)
+                        : "%edi", "%esi", "%ecx");
+}
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+__stosb(unsigned char *__dst, unsigned char __x, size_t __n) {
+  __asm__("rep stosb" : : "D"(__dst), "a"(__x), "c"(__n)
+                        : "%edi", "%ecx");
+}
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+__stosd(unsigned long *__dst, unsigned long __x, size_t __n) {
+  __asm__("rep stosl" : : "D"(__dst), "a"(__x), "c"(__n)
+                        : "%edi", "%ecx");
+}
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+__stosw(unsigned short *__dst, unsigned short __x, size_t __n) {
+  __asm__("rep stosw" : : "D"(__dst), "a"(__x), "c"(__n)
+                        : "%edi", "%ecx");
+}
+#endif
+#ifdef __x86_64__
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+__movsq(unsigned long long *__dst, unsigned long long const *__src, size_t __n) {
+  __asm__("rep movsq" : : "D"(__dst), "S"(__src), "c"(__n)
+                        : "%edi", "%esi", "%ecx");
+}
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+__stosq(unsigned __int64 *__dst, unsigned __int64 __x, size_t __n) {
+  __asm__("rep stosq" : : "D"(__dst), "a"(__x), "c"(__n)
+                        : "%edi", "%ecx");
+}
+#endif
+
+/*----------------------------------------------------------------------------*\
+|* Misc
+\*----------------------------------------------------------------------------*/
+static __inline__ void * __attribute__((__always_inline__, __nodebug__))
+_AddressOfReturnAddress(void) {
+  return (void*)((char*)__builtin_frame_address(0) + sizeof(void*));
+}
+static __inline__ void * __attribute__((__always_inline__, __nodebug__))
+_ReturnAddress(void) {
+  return __builtin_return_address(0);
+}
+#if defined(__i386__) || defined(__x86_64__)
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+__cpuid(int __info[4], int __level) {
+  __asm__ ("cpuid" : "=a"(__info[0]), "=b" (__info[1]), "=c"(__info[2]), "=d"(__info[3])
+                   : "a"(__level));
+}
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+__cpuidex(int __info[4], int __level, int __ecx) {
+  __asm__ ("cpuid" : "=a"(__info[0]), "=b" (__info[1]), "=c"(__info[2]), "=d"(__info[3])
+                   : "a"(__level), "c"(__ecx));
+}
+static __inline__ unsigned __int64 __cdecl __attribute__((__always_inline__, __nodebug__))
+_xgetbv(unsigned int __xcr_no) {
+  unsigned int __eax, __edx;
+  __asm__ ("xgetbv" : "=a" (__eax), "=d" (__edx) : "c" (__xcr_no));
+  return ((unsigned __int64)__edx << 32) | __eax;
+}
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+__halt(void) {
+  __asm__ volatile ("hlt");
+}
+#endif
+
+/*----------------------------------------------------------------------------*\
+|* Privileged intrinsics
+\*----------------------------------------------------------------------------*/
+#if defined(__i386__) || defined(__x86_64__)
+static __inline__ unsigned __int64 __attribute__((__always_inline__, __nodebug__))
+__readmsr(unsigned long __register) {
+  // Loads the contents of a 64-bit model specific register (MSR) specified in
+  // the ECX register into registers EDX:EAX. The EDX register is loaded with
+  // the high-order 32 bits of the MSR and the EAX register is loaded with the
+  // low-order 32 bits. If less than 64 bits are implemented in the MSR being
+  // read, the values returned to EDX:EAX in unimplemented bit locations are
+  // undefined.
+  unsigned long __edx;
+  unsigned long __eax;
+  __asm__ ("rdmsr" : "=d"(__edx), "=a"(__eax) : "c"(__register));
+  return (((unsigned __int64)__edx) << 32) | (unsigned __int64)__eax;
+}
+
+static __inline__ unsigned long __attribute__((always_inline, __nodebug__))
+__readcr3(void) {
+  unsigned long __cr3_val;
+  __asm__ __volatile__ ("mov %%cr3, %0" : "=q"(__cr3_val) : : "memory");
+  return __cr3_val;
+}
+
+static __inline__ void __attribute__((always_inline, __nodebug__))
+__writecr3(unsigned int __cr3_val) {
+  __asm__ ("mov %0, %%cr3" : : "q"(__cr3_val) : "memory");
+}
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __INTRIN_H */
+#endif /* _MSC_VER */
diff --git a/21.1.2/clang-include/LICENSE.TXT b/21.1.2/clang-include/LICENSE.TXT
new file mode 100644
index 0000000..3b1153d
--- /dev/null
+++ b/21.1.2/clang-include/LICENSE.TXT
@@ -0,0 +1,63 @@
+==============================================================================
+LLVM Release License
+==============================================================================
+University of Illinois/NCSA
+Open Source License
+
+Copyright (c) 2007-2014 University of Illinois at Urbana-Champaign.
+All rights reserved.
+
+Developed by:
+
+    LLVM Team
+
+    University of Illinois at Urbana-Champaign
+
+    http://llvm.org
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal with
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimers.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+      this list of conditions and the following disclaimers in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the names of the LLVM Team, University of Illinois at
+      Urbana-Champaign, nor the names of its contributors may be used to
+      endorse or promote products derived from this Software without specific
+      prior written permission.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+SOFTWARE.
+
+==============================================================================
+The LLVM software contains code written by third parties.  Such software will
+have its own individual LICENSE.TXT file in the directory in which it appears.
+This file will describe the copyrights, license, and restrictions which apply
+to that code.
+
+The disclaimer of warranty in the University of Illinois Open Source License
+applies to all code in the LLVM Distribution, and nothing in any of the
+other licenses gives permission to use the names of the LLVM Team or the
+University of Illinois to endorse or promote products derived from this
+Software.
+
+The following pieces of software have additional or alternate copyrights,
+licenses, and/or restrictions:
+
+Program             Directory
+-------             ---------
+<none yet>
+
diff --git a/21.1.2/clang-include/altivec.h b/21.1.2/clang-include/altivec.h
new file mode 100644
index 0000000..7a4a774
--- /dev/null
+++ b/21.1.2/clang-include/altivec.h
@@ -0,0 +1,12389 @@
+/*===---- altivec.h - Standard header for AltiVec intrinsics --------------===*\
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+\*===----------------------------------------------------------------------===*/
+
+#ifndef __ALTIVEC_H
+#define __ALTIVEC_H
+
+#ifndef __ALTIVEC__
+#error "AltiVec support not enabled"
+#endif
+
+/* constants for mapping CR6 bits to predicate result. */
+
+#define __CR6_EQ     0
+#define __CR6_EQ_REV 1
+#define __CR6_LT     2
+#define __CR6_LT_REV 3
+
+#define __ATTRS_o_ai __attribute__((__overloadable__, __always_inline__))
+
+static vector signed char __ATTRS_o_ai
+vec_perm(vector signed char __a, vector signed char __b, vector unsigned char __c);
+
+static vector unsigned char __ATTRS_o_ai
+vec_perm(vector unsigned char __a,
+         vector unsigned char __b,
+         vector unsigned char __c);
+
+static vector bool char __ATTRS_o_ai
+vec_perm(vector bool char __a, vector bool char __b, vector unsigned char __c);
+
+static vector short __ATTRS_o_ai
+vec_perm(vector short __a, vector short __b, vector unsigned char __c);
+
+static vector unsigned short __ATTRS_o_ai
+vec_perm(vector unsigned short __a,
+         vector unsigned short __b,
+         vector unsigned char __c);
+
+static vector bool short __ATTRS_o_ai
+vec_perm(vector bool short __a, vector bool short __b, vector unsigned char __c);
+
+static vector pixel __ATTRS_o_ai
+vec_perm(vector pixel __a, vector pixel __b, vector unsigned char __c);
+
+static vector int __ATTRS_o_ai
+vec_perm(vector int __a, vector int __b, vector unsigned char __c);
+
+static vector unsigned int __ATTRS_o_ai
+vec_perm(vector unsigned int __a, vector unsigned int __b, vector unsigned char __c);
+
+static vector bool int __ATTRS_o_ai
+vec_perm(vector bool int __a, vector bool int __b, vector unsigned char __c);
+
+static vector float __ATTRS_o_ai
+vec_perm(vector float __a, vector float __b, vector unsigned char __c);
+
+static vector unsigned char __ATTRS_o_ai
+vec_xor(vector unsigned char __a, vector unsigned char __b);
+
+/* vec_abs */
+
+#define __builtin_altivec_abs_v16qi vec_abs
+#define __builtin_altivec_abs_v8hi  vec_abs
+#define __builtin_altivec_abs_v4si  vec_abs
+
+static vector signed char __ATTRS_o_ai
+vec_abs(vector signed char __a)
+{
+  return __builtin_altivec_vmaxsb(__a, -__a);
+}
+
+static vector signed short __ATTRS_o_ai
+vec_abs(vector signed short __a)
+{
+  return __builtin_altivec_vmaxsh(__a, -__a);
+}
+
+static vector signed int __ATTRS_o_ai
+vec_abs(vector signed int __a)
+{
+  return __builtin_altivec_vmaxsw(__a, -__a);
+}
+
+static vector float __ATTRS_o_ai
+vec_abs(vector float __a)
+{
+  vector unsigned int __res = (vector unsigned int)__a
+                            & (vector unsigned int)(0x7FFFFFFF);
+  return (vector float)__res;
+}
+
+/* vec_abss */
+
+#define __builtin_altivec_abss_v16qi vec_abss
+#define __builtin_altivec_abss_v8hi  vec_abss
+#define __builtin_altivec_abss_v4si  vec_abss
+
+static vector signed char __ATTRS_o_ai
+vec_abss(vector signed char __a)
+{
+  return __builtin_altivec_vmaxsb
+           (__a, __builtin_altivec_vsubsbs((vector signed char)(0), __a));
+}
+
+static vector signed short __ATTRS_o_ai
+vec_abss(vector signed short __a)
+{
+  return __builtin_altivec_vmaxsh
+           (__a, __builtin_altivec_vsubshs((vector signed short)(0), __a));
+}
+
+static vector signed int __ATTRS_o_ai
+vec_abss(vector signed int __a)
+{
+  return __builtin_altivec_vmaxsw
+           (__a, __builtin_altivec_vsubsws((vector signed int)(0), __a));
+}
+
+/* vec_add */
+
+static vector signed char __ATTRS_o_ai
+vec_add(vector signed char __a, vector signed char __b)
+{
+  return __a + __b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_add(vector bool char __a, vector signed char __b)
+{
+  return (vector signed char)__a + __b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_add(vector signed char __a, vector bool char __b)
+{
+  return __a + (vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_add(vector unsigned char __a, vector unsigned char __b)
+{
+  return __a + __b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_add(vector bool char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)__a + __b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_add(vector unsigned char __a, vector bool char __b)
+{
+  return __a + (vector unsigned char)__b;
+}
+
+static vector short __ATTRS_o_ai
+vec_add(vector short __a, vector short __b)
+{
+  return __a + __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_add(vector bool short __a, vector short __b)
+{
+  return (vector short)__a + __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_add(vector short __a, vector bool short __b)
+{
+  return __a + (vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_add(vector unsigned short __a, vector unsigned short __b)
+{
+  return __a + __b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_add(vector bool short __a, vector unsigned short __b)
+{
+  return (vector unsigned short)__a + __b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_add(vector unsigned short __a, vector bool short __b)
+{
+  return __a + (vector unsigned short)__b;
+}
+
+static vector int __ATTRS_o_ai
+vec_add(vector int __a, vector int __b)
+{
+  return __a + __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_add(vector bool int __a, vector int __b)
+{
+  return (vector int)__a + __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_add(vector int __a, vector bool int __b)
+{
+  return __a + (vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_add(vector unsigned int __a, vector unsigned int __b)
+{
+  return __a + __b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_add(vector bool int __a, vector unsigned int __b)
+{
+  return (vector unsigned int)__a + __b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_add(vector unsigned int __a, vector bool int __b)
+{
+  return __a + (vector unsigned int)__b;
+}
+
+static vector float __ATTRS_o_ai
+vec_add(vector float __a, vector float __b)
+{
+  return __a + __b;
+}
+
+/* vec_vaddubm */
+
+#define __builtin_altivec_vaddubm vec_vaddubm
+
+static vector signed char __ATTRS_o_ai
+vec_vaddubm(vector signed char __a, vector signed char __b)
+{
+  return __a + __b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vaddubm(vector bool char __a, vector signed char __b)
+{
+  return (vector signed char)__a + __b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vaddubm(vector signed char __a, vector bool char __b)
+{
+  return __a + (vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vaddubm(vector unsigned char __a, vector unsigned char __b)
+{
+  return __a + __b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vaddubm(vector bool char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)__a + __b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vaddubm(vector unsigned char __a, vector bool char __b)
+{
+  return __a + (vector unsigned char)__b;
+}
+
+/* vec_vadduhm */
+
+#define __builtin_altivec_vadduhm vec_vadduhm
+
+static vector short __ATTRS_o_ai
+vec_vadduhm(vector short __a, vector short __b)
+{
+  return __a + __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_vadduhm(vector bool short __a, vector short __b)
+{
+  return (vector short)__a + __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_vadduhm(vector short __a, vector bool short __b)
+{
+  return __a + (vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vadduhm(vector unsigned short __a, vector unsigned short __b)
+{
+  return __a + __b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vadduhm(vector bool short __a, vector unsigned short __b)
+{
+  return (vector unsigned short)__a + __b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vadduhm(vector unsigned short __a, vector bool short __b)
+{
+  return __a + (vector unsigned short)__b;
+}
+
+/* vec_vadduwm */
+
+#define __builtin_altivec_vadduwm vec_vadduwm
+
+static vector int __ATTRS_o_ai
+vec_vadduwm(vector int __a, vector int __b)
+{
+  return __a + __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_vadduwm(vector bool int __a, vector int __b)
+{
+  return (vector int)__a + __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_vadduwm(vector int __a, vector bool int __b)
+{
+  return __a + (vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vadduwm(vector unsigned int __a, vector unsigned int __b)
+{
+  return __a + __b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vadduwm(vector bool int __a, vector unsigned int __b)
+{
+  return (vector unsigned int)__a + __b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vadduwm(vector unsigned int __a, vector bool int __b)
+{
+  return __a + (vector unsigned int)__b;
+}
+
+/* vec_vaddfp */
+
+#define __builtin_altivec_vaddfp  vec_vaddfp
+
+static vector float __attribute__((__always_inline__))
+vec_vaddfp(vector float __a, vector float __b)
+{
+  return __a + __b;
+}
+
+/* vec_addc */
+
+static vector unsigned int __attribute__((__always_inline__))
+vec_addc(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vaddcuw(__a, __b);
+}
+
+/* vec_vaddcuw */
+
+static vector unsigned int __attribute__((__always_inline__))
+vec_vaddcuw(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vaddcuw(__a, __b);
+}
+
+/* vec_adds */
+
+static vector signed char __ATTRS_o_ai
+vec_adds(vector signed char __a, vector signed char __b)
+{
+  return __builtin_altivec_vaddsbs(__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_adds(vector bool char __a, vector signed char __b)
+{
+  return __builtin_altivec_vaddsbs((vector signed char)__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_adds(vector signed char __a, vector bool char __b)
+{
+  return __builtin_altivec_vaddsbs(__a, (vector signed char)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_adds(vector unsigned char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vaddubs(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_adds(vector bool char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vaddubs((vector unsigned char)__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_adds(vector unsigned char __a, vector bool char __b)
+{
+  return __builtin_altivec_vaddubs(__a, (vector unsigned char)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_adds(vector short __a, vector short __b)
+{
+  return __builtin_altivec_vaddshs(__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_adds(vector bool short __a, vector short __b)
+{
+  return __builtin_altivec_vaddshs((vector short)__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_adds(vector short __a, vector bool short __b)
+{
+  return __builtin_altivec_vaddshs(__a, (vector short)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_adds(vector unsigned short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vadduhs(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_adds(vector bool short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vadduhs((vector unsigned short)__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_adds(vector unsigned short __a, vector bool short __b)
+{
+  return __builtin_altivec_vadduhs(__a, (vector unsigned short)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_adds(vector int __a, vector int __b)
+{
+  return __builtin_altivec_vaddsws(__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_adds(vector bool int __a, vector int __b)
+{
+  return __builtin_altivec_vaddsws((vector int)__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_adds(vector int __a, vector bool int __b)
+{
+  return __builtin_altivec_vaddsws(__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_adds(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vadduws(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_adds(vector bool int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vadduws((vector unsigned int)__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_adds(vector unsigned int __a, vector bool int __b)
+{
+  return __builtin_altivec_vadduws(__a, (vector unsigned int)__b);
+}
+
+/* vec_vaddsbs */
+
+static vector signed char __ATTRS_o_ai
+vec_vaddsbs(vector signed char __a, vector signed char __b)
+{
+  return __builtin_altivec_vaddsbs(__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vaddsbs(vector bool char __a, vector signed char __b)
+{
+  return __builtin_altivec_vaddsbs((vector signed char)__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vaddsbs(vector signed char __a, vector bool char __b)
+{
+  return __builtin_altivec_vaddsbs(__a, (vector signed char)__b);
+}
+
+/* vec_vaddubs */
+
+static vector unsigned char __ATTRS_o_ai
+vec_vaddubs(vector unsigned char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vaddubs(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vaddubs(vector bool char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vaddubs((vector unsigned char)__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vaddubs(vector unsigned char __a, vector bool char __b)
+{
+  return __builtin_altivec_vaddubs(__a, (vector unsigned char)__b);
+}
+
+/* vec_vaddshs */
+
+static vector short __ATTRS_o_ai
+vec_vaddshs(vector short __a, vector short __b)
+{
+  return __builtin_altivec_vaddshs(__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_vaddshs(vector bool short __a, vector short __b)
+{
+  return __builtin_altivec_vaddshs((vector short)__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_vaddshs(vector short __a, vector bool short __b)
+{
+  return __builtin_altivec_vaddshs(__a, (vector short)__b);
+}
+
+/* vec_vadduhs */
+
+static vector unsigned short __ATTRS_o_ai
+vec_vadduhs(vector unsigned short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vadduhs(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vadduhs(vector bool short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vadduhs((vector unsigned short)__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vadduhs(vector unsigned short __a, vector bool short __b)
+{
+  return __builtin_altivec_vadduhs(__a, (vector unsigned short)__b);
+}
+
+/* vec_vaddsws */
+
+static vector int __ATTRS_o_ai
+vec_vaddsws(vector int __a, vector int __b)
+{
+  return __builtin_altivec_vaddsws(__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_vaddsws(vector bool int __a, vector int __b)
+{
+  return __builtin_altivec_vaddsws((vector int)__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_vaddsws(vector int __a, vector bool int __b)
+{
+  return __builtin_altivec_vaddsws(__a, (vector int)__b);
+}
+
+/* vec_vadduws */
+
+static vector unsigned int __ATTRS_o_ai
+vec_vadduws(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vadduws(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vadduws(vector bool int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vadduws((vector unsigned int)__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vadduws(vector unsigned int __a, vector bool int __b)
+{
+  return __builtin_altivec_vadduws(__a, (vector unsigned int)__b);
+}
+
+/* vec_and */
+
+#define __builtin_altivec_vand vec_and
+
+static vector signed char __ATTRS_o_ai
+vec_and(vector signed char __a, vector signed char __b)
+{
+  return __a & __b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_and(vector bool char __a, vector signed char __b)
+{
+  return (vector signed char)__a & __b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_and(vector signed char __a, vector bool char __b)
+{
+  return __a & (vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_and(vector unsigned char __a, vector unsigned char __b)
+{
+  return __a & __b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_and(vector bool char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)__a & __b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_and(vector unsigned char __a, vector bool char __b)
+{
+  return __a & (vector unsigned char)__b;
+}
+
+static vector bool char __ATTRS_o_ai
+vec_and(vector bool char __a, vector bool char __b)
+{
+  return __a & __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_and(vector short __a, vector short __b)
+{
+  return __a & __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_and(vector bool short __a, vector short __b)
+{
+  return (vector short)__a & __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_and(vector short __a, vector bool short __b)
+{
+  return __a & (vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_and(vector unsigned short __a, vector unsigned short __b)
+{
+  return __a & __b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_and(vector bool short __a, vector unsigned short __b)
+{
+  return (vector unsigned short)__a & __b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_and(vector unsigned short __a, vector bool short __b)
+{
+  return __a & (vector unsigned short)__b;
+}
+
+static vector bool short __ATTRS_o_ai
+vec_and(vector bool short __a, vector bool short __b)
+{
+  return __a & __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_and(vector int __a, vector int __b)
+{
+  return __a & __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_and(vector bool int __a, vector int __b)
+{
+  return (vector int)__a & __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_and(vector int __a, vector bool int __b)
+{
+  return __a & (vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_and(vector unsigned int __a, vector unsigned int __b)
+{
+  return __a & __b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_and(vector bool int __a, vector unsigned int __b)
+{
+  return (vector unsigned int)__a & __b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_and(vector unsigned int __a, vector bool int __b)
+{
+  return __a & (vector unsigned int)__b;
+}
+
+static vector bool int __ATTRS_o_ai
+vec_and(vector bool int __a, vector bool int __b)
+{
+  return __a & __b;
+}
+
+static vector float __ATTRS_o_ai
+vec_and(vector float __a, vector float __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a & (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai
+vec_and(vector bool int __a, vector float __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a & (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai
+vec_and(vector float __a, vector bool int __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a & (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+/* vec_vand */
+
+static vector signed char __ATTRS_o_ai
+vec_vand(vector signed char __a, vector signed char __b)
+{
+  return __a & __b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vand(vector bool char __a, vector signed char __b)
+{
+  return (vector signed char)__a & __b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vand(vector signed char __a, vector bool char __b)
+{
+  return __a & (vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vand(vector unsigned char __a, vector unsigned char __b)
+{
+  return __a & __b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vand(vector bool char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)__a & __b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vand(vector unsigned char __a, vector bool char __b)
+{
+  return __a & (vector unsigned char)__b;
+}
+
+static vector bool char __ATTRS_o_ai
+vec_vand(vector bool char __a, vector bool char __b)
+{
+  return __a & __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_vand(vector short __a, vector short __b)
+{
+  return __a & __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_vand(vector bool short __a, vector short __b)
+{
+  return (vector short)__a & __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_vand(vector short __a, vector bool short __b)
+{
+  return __a & (vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vand(vector unsigned short __a, vector unsigned short __b)
+{
+  return __a & __b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vand(vector bool short __a, vector unsigned short __b)
+{
+  return (vector unsigned short)__a & __b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vand(vector unsigned short __a, vector bool short __b)
+{
+  return __a & (vector unsigned short)__b;
+}
+
+static vector bool short __ATTRS_o_ai
+vec_vand(vector bool short __a, vector bool short __b)
+{
+  return __a & __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_vand(vector int __a, vector int __b)
+{
+  return __a & __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_vand(vector bool int __a, vector int __b)
+{
+  return (vector int)__a & __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_vand(vector int __a, vector bool int __b)
+{
+  return __a & (vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vand(vector unsigned int __a, vector unsigned int __b)
+{
+  return __a & __b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vand(vector bool int __a, vector unsigned int __b)
+{
+  return (vector unsigned int)__a & __b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vand(vector unsigned int __a, vector bool int __b)
+{
+  return __a & (vector unsigned int)__b;
+}
+
+static vector bool int __ATTRS_o_ai
+vec_vand(vector bool int __a, vector bool int __b)
+{
+  return __a & __b;
+}
+
+static vector float __ATTRS_o_ai
+vec_vand(vector float __a, vector float __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a & (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai
+vec_vand(vector bool int __a, vector float __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a & (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai
+vec_vand(vector float __a, vector bool int __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a & (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+/* vec_andc */
+
+#define __builtin_altivec_vandc vec_andc
+
+static vector signed char __ATTRS_o_ai
+vec_andc(vector signed char __a, vector signed char __b)
+{
+  return __a & ~__b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_andc(vector bool char __a, vector signed char __b)
+{
+  return (vector signed char)__a & ~__b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_andc(vector signed char __a, vector bool char __b)
+{
+  return __a & ~(vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_andc(vector unsigned char __a, vector unsigned char __b)
+{
+  return __a & ~__b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_andc(vector bool char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)__a & ~__b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_andc(vector unsigned char __a, vector bool char __b)
+{
+  return __a & ~(vector unsigned char)__b;
+}
+
+static vector bool char __ATTRS_o_ai
+vec_andc(vector bool char __a, vector bool char __b)
+{
+  return __a & ~__b;
+}
+
+static vector short __ATTRS_o_ai
+vec_andc(vector short __a, vector short __b)
+{
+  return __a & ~__b;
+}
+
+static vector short __ATTRS_o_ai
+vec_andc(vector bool short __a, vector short __b)
+{
+  return (vector short)__a & ~__b;
+}
+
+static vector short __ATTRS_o_ai
+vec_andc(vector short __a, vector bool short __b)
+{
+  return __a & ~(vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_andc(vector unsigned short __a, vector unsigned short __b)
+{
+  return __a & ~__b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_andc(vector bool short __a, vector unsigned short __b)
+{
+  return (vector unsigned short)__a & ~__b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_andc(vector unsigned short __a, vector bool short __b)
+{
+  return __a & ~(vector unsigned short)__b;
+}
+
+static vector bool short __ATTRS_o_ai
+vec_andc(vector bool short __a, vector bool short __b)
+{
+  return __a & ~__b;
+}
+
+static vector int __ATTRS_o_ai
+vec_andc(vector int __a, vector int __b)
+{
+  return __a & ~__b;
+}
+
+static vector int __ATTRS_o_ai
+vec_andc(vector bool int __a, vector int __b)
+{
+  return (vector int)__a & ~__b;
+}
+
+static vector int __ATTRS_o_ai
+vec_andc(vector int __a, vector bool int __b)
+{
+  return __a & ~(vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_andc(vector unsigned int __a, vector unsigned int __b)
+{
+  return __a & ~__b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_andc(vector bool int __a, vector unsigned int __b)
+{
+  return (vector unsigned int)__a & ~__b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_andc(vector unsigned int __a, vector bool int __b)
+{
+  return __a & ~(vector unsigned int)__b;
+}
+
+static vector bool int __ATTRS_o_ai
+vec_andc(vector bool int __a, vector bool int __b)
+{
+  return __a & ~__b;
+}
+
+static vector float __ATTRS_o_ai
+vec_andc(vector float __a, vector float __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a & ~(vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai
+vec_andc(vector bool int __a, vector float __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a & ~(vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai
+vec_andc(vector float __a, vector bool int __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a & ~(vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+/* vec_vandc */
+
+static vector signed char __ATTRS_o_ai
+vec_vandc(vector signed char __a, vector signed char __b)
+{
+  return __a & ~__b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vandc(vector bool char __a, vector signed char __b)
+{
+  return (vector signed char)__a & ~__b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vandc(vector signed char __a, vector bool char __b)
+{
+  return __a & ~(vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vandc(vector unsigned char __a, vector unsigned char __b)
+{
+  return __a & ~__b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vandc(vector bool char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)__a & ~__b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vandc(vector unsigned char __a, vector bool char __b)
+{
+  return __a & ~(vector unsigned char)__b;
+}
+
+static vector bool char __ATTRS_o_ai
+vec_vandc(vector bool char __a, vector bool char __b)
+{
+  return __a & ~__b;
+}
+
+static vector short __ATTRS_o_ai
+vec_vandc(vector short __a, vector short __b)
+{
+  return __a & ~__b;
+}
+
+static vector short __ATTRS_o_ai
+vec_vandc(vector bool short __a, vector short __b)
+{
+  return (vector short)__a & ~__b;
+}
+
+static vector short __ATTRS_o_ai
+vec_vandc(vector short __a, vector bool short __b)
+{
+  return __a & ~(vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vandc(vector unsigned short __a, vector unsigned short __b)
+{
+  return __a & ~__b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vandc(vector bool short __a, vector unsigned short __b)
+{
+  return (vector unsigned short)__a & ~__b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vandc(vector unsigned short __a, vector bool short __b)
+{
+  return __a & ~(vector unsigned short)__b;
+}
+
+static vector bool short __ATTRS_o_ai
+vec_vandc(vector bool short __a, vector bool short __b)
+{
+  return __a & ~__b;
+}
+
+static vector int __ATTRS_o_ai
+vec_vandc(vector int __a, vector int __b)
+{
+  return __a & ~__b;
+}
+
+static vector int __ATTRS_o_ai
+vec_vandc(vector bool int __a, vector int __b)
+{
+  return (vector int)__a & ~__b;
+}
+
+static vector int __ATTRS_o_ai
+vec_vandc(vector int __a, vector bool int __b)
+{
+  return __a & ~(vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vandc(vector unsigned int __a, vector unsigned int __b)
+{
+  return __a & ~__b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vandc(vector bool int __a, vector unsigned int __b)
+{
+  return (vector unsigned int)__a & ~__b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vandc(vector unsigned int __a, vector bool int __b)
+{
+  return __a & ~(vector unsigned int)__b;
+}
+
+static vector bool int __ATTRS_o_ai
+vec_vandc(vector bool int __a, vector bool int __b)
+{
+  return __a & ~__b;
+}
+
+static vector float __ATTRS_o_ai
+vec_vandc(vector float __a, vector float __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a & ~(vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai
+vec_vandc(vector bool int __a, vector float __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a & ~(vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai
+vec_vandc(vector float __a, vector bool int __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a & ~(vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+/* vec_avg */
+
+static vector signed char __ATTRS_o_ai
+vec_avg(vector signed char __a, vector signed char __b)
+{
+  return __builtin_altivec_vavgsb(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_avg(vector unsigned char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vavgub(__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_avg(vector short __a, vector short __b)
+{
+  return __builtin_altivec_vavgsh(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_avg(vector unsigned short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vavguh(__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_avg(vector int __a, vector int __b)
+{
+  return __builtin_altivec_vavgsw(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_avg(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vavguw(__a, __b);
+}
+
+/* vec_vavgsb */
+
+static vector signed char __attribute__((__always_inline__))
+vec_vavgsb(vector signed char __a, vector signed char __b)
+{
+  return __builtin_altivec_vavgsb(__a, __b);
+}
+
+/* vec_vavgub */
+
+static vector unsigned char __attribute__((__always_inline__))
+vec_vavgub(vector unsigned char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vavgub(__a, __b);
+}
+
+/* vec_vavgsh */
+
+static vector short __attribute__((__always_inline__))
+vec_vavgsh(vector short __a, vector short __b)
+{
+  return __builtin_altivec_vavgsh(__a, __b);
+}
+
+/* vec_vavguh */
+
+static vector unsigned short __attribute__((__always_inline__))
+vec_vavguh(vector unsigned short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vavguh(__a, __b);
+}
+
+/* vec_vavgsw */
+
+static vector int __attribute__((__always_inline__))
+vec_vavgsw(vector int __a, vector int __b)
+{
+  return __builtin_altivec_vavgsw(__a, __b);
+}
+
+/* vec_vavguw */
+
+static vector unsigned int __attribute__((__always_inline__))
+vec_vavguw(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vavguw(__a, __b);
+}
+
+/* vec_ceil */
+
+static vector float __attribute__((__always_inline__))
+vec_ceil(vector float __a)
+{
+  return __builtin_altivec_vrfip(__a);
+}
+
+/* vec_vrfip */
+
+static vector float __attribute__((__always_inline__))
+vec_vrfip(vector float __a)
+{
+  return __builtin_altivec_vrfip(__a);
+}
+
+/* vec_cmpb */
+
+static vector int __attribute__((__always_inline__))
+vec_cmpb(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpbfp(__a, __b);
+}
+
+/* vec_vcmpbfp */
+
+static vector int __attribute__((__always_inline__))
+vec_vcmpbfp(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpbfp(__a, __b);
+}
+
+/* vec_cmpeq */
+
+static vector bool char __ATTRS_o_ai
+vec_cmpeq(vector signed char __a, vector signed char __b)
+{
+  return (vector bool char)
+    __builtin_altivec_vcmpequb((vector char)__a, (vector char)__b);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_cmpeq(vector unsigned char __a, vector unsigned char __b)
+{
+  return (vector bool char)
+    __builtin_altivec_vcmpequb((vector char)__a, (vector char)__b);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_cmpeq(vector short __a, vector short __b)
+{
+  return (vector bool short)__builtin_altivec_vcmpequh(__a, __b);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_cmpeq(vector unsigned short __a, vector unsigned short __b)
+{
+  return (vector bool short)
+    __builtin_altivec_vcmpequh((vector short)__a, (vector short)__b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_cmpeq(vector int __a, vector int __b)
+{
+  return (vector bool int)__builtin_altivec_vcmpequw(__a, __b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_cmpeq(vector unsigned int __a, vector unsigned int __b)
+{
+  return (vector bool int)
+    __builtin_altivec_vcmpequw((vector int)__a, (vector int)__b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_cmpeq(vector float __a, vector float __b)
+{
+  return (vector bool int)__builtin_altivec_vcmpeqfp(__a, __b);
+}
+
+/* vec_cmpge */
+
+static vector bool int __attribute__((__always_inline__))
+vec_cmpge(vector float __a, vector float __b)
+{
+  return (vector bool int)__builtin_altivec_vcmpgefp(__a, __b);
+}
+
+/* vec_vcmpgefp */
+
+static vector bool int __attribute__((__always_inline__))
+vec_vcmpgefp(vector float __a, vector float __b)
+{
+  return (vector bool int)__builtin_altivec_vcmpgefp(__a, __b);
+}
+
+/* vec_cmpgt */
+
+static vector bool char __ATTRS_o_ai
+vec_cmpgt(vector signed char __a, vector signed char __b)
+{
+  return (vector bool char)__builtin_altivec_vcmpgtsb(__a, __b);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_cmpgt(vector unsigned char __a, vector unsigned char __b)
+{
+  return (vector bool char)__builtin_altivec_vcmpgtub(__a, __b);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_cmpgt(vector short __a, vector short __b)
+{
+  return (vector bool short)__builtin_altivec_vcmpgtsh(__a, __b);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_cmpgt(vector unsigned short __a, vector unsigned short __b)
+{
+  return (vector bool short)__builtin_altivec_vcmpgtuh(__a, __b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_cmpgt(vector int __a, vector int __b)
+{
+  return (vector bool int)__builtin_altivec_vcmpgtsw(__a, __b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_cmpgt(vector unsigned int __a, vector unsigned int __b)
+{
+  return (vector bool int)__builtin_altivec_vcmpgtuw(__a, __b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_cmpgt(vector float __a, vector float __b)
+{
+  return (vector bool int)__builtin_altivec_vcmpgtfp(__a, __b);
+}
+
+/* vec_vcmpgtsb */
+
+static vector bool char __attribute__((__always_inline__))
+vec_vcmpgtsb(vector signed char __a, vector signed char __b)
+{
+  return (vector bool char)__builtin_altivec_vcmpgtsb(__a, __b);
+}
+
+/* vec_vcmpgtub */
+
+static vector bool char __attribute__((__always_inline__))
+vec_vcmpgtub(vector unsigned char __a, vector unsigned char __b)
+{
+  return (vector bool char)__builtin_altivec_vcmpgtub(__a, __b);
+}
+
+/* vec_vcmpgtsh */
+
+static vector bool short __attribute__((__always_inline__))
+vec_vcmpgtsh(vector short __a, vector short __b)
+{
+  return (vector bool short)__builtin_altivec_vcmpgtsh(__a, __b);
+}
+
+/* vec_vcmpgtuh */
+
+static vector bool short __attribute__((__always_inline__))
+vec_vcmpgtuh(vector unsigned short __a, vector unsigned short __b)
+{
+  return (vector bool short)__builtin_altivec_vcmpgtuh(__a, __b);
+}
+
+/* vec_vcmpgtsw */
+
+static vector bool int __attribute__((__always_inline__))
+vec_vcmpgtsw(vector int __a, vector int __b)
+{
+  return (vector bool int)__builtin_altivec_vcmpgtsw(__a, __b);
+}
+
+/* vec_vcmpgtuw */
+
+static vector bool int __attribute__((__always_inline__))
+vec_vcmpgtuw(vector unsigned int __a, vector unsigned int __b)
+{
+  return (vector bool int)__builtin_altivec_vcmpgtuw(__a, __b);
+}
+
+/* vec_vcmpgtfp */
+
+static vector bool int __attribute__((__always_inline__))
+vec_vcmpgtfp(vector float __a, vector float __b)
+{
+  return (vector bool int)__builtin_altivec_vcmpgtfp(__a, __b);
+}
+
+/* vec_cmple */
+
+static vector bool int __attribute__((__always_inline__))
+vec_cmple(vector float __a, vector float __b)
+{
+  return (vector bool int)__builtin_altivec_vcmpgefp(__b, __a);
+}
+
+/* vec_cmplt */
+
+static vector bool char __ATTRS_o_ai
+vec_cmplt(vector signed char __a, vector signed char __b)
+{
+  return (vector bool char)__builtin_altivec_vcmpgtsb(__b, __a);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_cmplt(vector unsigned char __a, vector unsigned char __b)
+{
+  return (vector bool char)__builtin_altivec_vcmpgtub(__b, __a);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_cmplt(vector short __a, vector short __b)
+{
+  return (vector bool short)__builtin_altivec_vcmpgtsh(__b, __a);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_cmplt(vector unsigned short __a, vector unsigned short __b)
+{
+  return (vector bool short)__builtin_altivec_vcmpgtuh(__b, __a);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_cmplt(vector int __a, vector int __b)
+{
+  return (vector bool int)__builtin_altivec_vcmpgtsw(__b, __a);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_cmplt(vector unsigned int __a, vector unsigned int __b)
+{
+  return (vector bool int)__builtin_altivec_vcmpgtuw(__b, __a);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_cmplt(vector float __a, vector float __b)
+{
+  return (vector bool int)__builtin_altivec_vcmpgtfp(__b, __a);
+}
+
+/* vec_ctf */
+
+static vector float __ATTRS_o_ai
+vec_ctf(vector int __a, int __b)
+{
+  return __builtin_altivec_vcfsx(__a, __b);
+}
+
+static vector float __ATTRS_o_ai
+vec_ctf(vector unsigned int __a, int __b)
+{
+  return __builtin_altivec_vcfux((vector int)__a, __b);
+}
+
+/* vec_vcfsx */
+
+static vector float __attribute__((__always_inline__))
+vec_vcfsx(vector int __a, int __b)
+{
+  return __builtin_altivec_vcfsx(__a, __b);
+}
+
+/* vec_vcfux */
+
+static vector float __attribute__((__always_inline__))
+vec_vcfux(vector unsigned int __a, int __b)
+{
+  return __builtin_altivec_vcfux((vector int)__a, __b);
+}
+
+/* vec_cts */
+
+static vector int __attribute__((__always_inline__))
+vec_cts(vector float __a, int __b)
+{
+  return __builtin_altivec_vctsxs(__a, __b);
+}
+
+/* vec_vctsxs */
+
+static vector int __attribute__((__always_inline__))
+vec_vctsxs(vector float __a, int __b)
+{
+  return __builtin_altivec_vctsxs(__a, __b);
+}
+
+/* vec_ctu */
+
+static vector unsigned int __attribute__((__always_inline__))
+vec_ctu(vector float __a, int __b)
+{
+  return __builtin_altivec_vctuxs(__a, __b);
+}
+
+/* vec_vctuxs */
+
+static vector unsigned int __attribute__((__always_inline__))
+vec_vctuxs(vector float __a, int __b)
+{
+  return __builtin_altivec_vctuxs(__a, __b);
+}
+
+/* vec_dss */
+
+static void __attribute__((__always_inline__))
+vec_dss(int __a)
+{
+  __builtin_altivec_dss(__a);
+}
+
+/* vec_dssall */
+
+static void __attribute__((__always_inline__))
+vec_dssall(void)
+{
+  __builtin_altivec_dssall();
+}
+
+/* vec_dst */
+
+static void __attribute__((__always_inline__))
+vec_dst(const void *__a, int __b, int __c)
+{
+  __builtin_altivec_dst(__a, __b, __c);
+}
+
+/* vec_dstst */
+
+static void __attribute__((__always_inline__))
+vec_dstst(const void *__a, int __b, int __c)
+{
+  __builtin_altivec_dstst(__a, __b, __c);
+}
+
+/* vec_dststt */
+
+static void __attribute__((__always_inline__))
+vec_dststt(const void *__a, int __b, int __c)
+{
+  __builtin_altivec_dststt(__a, __b, __c);
+}
+
+/* vec_dstt */
+
+static void __attribute__((__always_inline__))
+vec_dstt(const void *__a, int __b, int __c)
+{
+  __builtin_altivec_dstt(__a, __b, __c);
+}
+
+/* vec_expte */
+
+static vector float __attribute__((__always_inline__))
+vec_expte(vector float __a)
+{
+  return __builtin_altivec_vexptefp(__a);
+}
+
+/* vec_vexptefp */
+
+static vector float __attribute__((__always_inline__))
+vec_vexptefp(vector float __a)
+{
+  return __builtin_altivec_vexptefp(__a);
+}
+
+/* vec_floor */
+
+static vector float __attribute__((__always_inline__))
+vec_floor(vector float __a)
+{
+  return __builtin_altivec_vrfim(__a);
+}
+
+/* vec_vrfim */
+
+static vector float __attribute__((__always_inline__))
+vec_vrfim(vector float __a)
+{
+  return __builtin_altivec_vrfim(__a);
+}
+
+/* vec_ld */
+
+static vector signed char __ATTRS_o_ai
+vec_ld(int __a, const vector signed char *__b)
+{
+  return (vector signed char)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_ld(int __a, const signed char *__b)
+{
+  return (vector signed char)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_ld(int __a, const vector unsigned char *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_ld(int __a, const unsigned char *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_ld(int __a, const vector bool char *__b)
+{
+  return (vector bool char)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_ld(int __a, const vector short *__b)
+{
+  return (vector short)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_ld(int __a, const short *__b)
+{
+  return (vector short)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_ld(int __a, const vector unsigned short *__b)
+{
+  return (vector unsigned short)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_ld(int __a, const unsigned short *__b)
+{
+  return (vector unsigned short)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_ld(int __a, const vector bool short *__b)
+{
+  return (vector bool short)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_ld(int __a, const vector pixel *__b)
+{
+  return (vector pixel)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_ld(int __a, const vector int *__b)
+{
+  return (vector int)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_ld(int __a, const int *__b)
+{
+  return (vector int)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_ld(int __a, const vector unsigned int *__b)
+{
+  return (vector unsigned int)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_ld(int __a, const unsigned int *__b)
+{
+  return (vector unsigned int)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_ld(int __a, const vector bool int *__b)
+{
+  return (vector bool int)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector float __ATTRS_o_ai
+vec_ld(int __a, const vector float *__b)
+{
+  return (vector float)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector float __ATTRS_o_ai
+vec_ld(int __a, const float *__b)
+{
+  return (vector float)__builtin_altivec_lvx(__a, __b);
+}
+
+/* vec_lvx */
+
+static vector signed char __ATTRS_o_ai
+vec_lvx(int __a, const vector signed char *__b)
+{
+  return (vector signed char)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_lvx(int __a, const signed char *__b)
+{
+  return (vector signed char)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvx(int __a, const vector unsigned char *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvx(int __a, const unsigned char *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_lvx(int __a, const vector bool char *__b)
+{
+  return (vector bool char)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_lvx(int __a, const vector short *__b)
+{
+  return (vector short)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_lvx(int __a, const short *__b)
+{
+  return (vector short)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_lvx(int __a, const vector unsigned short *__b)
+{
+  return (vector unsigned short)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_lvx(int __a, const unsigned short *__b)
+{
+  return (vector unsigned short)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_lvx(int __a, const vector bool short *__b)
+{
+  return (vector bool short)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_lvx(int __a, const vector pixel *__b)
+{
+  return (vector pixel)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_lvx(int __a, const vector int *__b)
+{
+  return (vector int)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_lvx(int __a, const int *__b)
+{
+  return (vector int)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_lvx(int __a, const vector unsigned int *__b)
+{
+  return (vector unsigned int)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_lvx(int __a, const unsigned int *__b)
+{
+  return (vector unsigned int)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_lvx(int __a, const vector bool int *__b)
+{
+  return (vector bool int)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector float __ATTRS_o_ai
+vec_lvx(int __a, const vector float *__b)
+{
+  return (vector float)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector float __ATTRS_o_ai
+vec_lvx(int __a, const float *__b)
+{
+  return (vector float)__builtin_altivec_lvx(__a, __b);
+}
+
+/* vec_lde */
+
+static vector signed char __ATTRS_o_ai
+vec_lde(int __a, const signed char *__b)
+{
+  return (vector signed char)__builtin_altivec_lvebx(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lde(int __a, const unsigned char *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvebx(__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_lde(int __a, const short *__b)
+{
+  return (vector short)__builtin_altivec_lvehx(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_lde(int __a, const unsigned short *__b)
+{
+  return (vector unsigned short)__builtin_altivec_lvehx(__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_lde(int __a, const int *__b)
+{
+  return (vector int)__builtin_altivec_lvewx(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_lde(int __a, const unsigned int *__b)
+{
+  return (vector unsigned int)__builtin_altivec_lvewx(__a, __b);
+}
+
+static vector float __ATTRS_o_ai
+vec_lde(int __a, const float *__b)
+{
+  return (vector float)__builtin_altivec_lvewx(__a, __b);
+}
+
+/* vec_lvebx */
+
+static vector signed char __ATTRS_o_ai
+vec_lvebx(int __a, const signed char *__b)
+{
+  return (vector signed char)__builtin_altivec_lvebx(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvebx(int __a, const unsigned char *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvebx(__a, __b);
+}
+
+/* vec_lvehx */
+
+static vector short __ATTRS_o_ai
+vec_lvehx(int __a, const short *__b)
+{
+  return (vector short)__builtin_altivec_lvehx(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_lvehx(int __a, const unsigned short *__b)
+{
+  return (vector unsigned short)__builtin_altivec_lvehx(__a, __b);
+}
+
+/* vec_lvewx */
+
+static vector int __ATTRS_o_ai
+vec_lvewx(int __a, const int *__b)
+{
+  return (vector int)__builtin_altivec_lvewx(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_lvewx(int __a, const unsigned int *__b)
+{
+  return (vector unsigned int)__builtin_altivec_lvewx(__a, __b);
+}
+
+static vector float __ATTRS_o_ai
+vec_lvewx(int __a, const float *__b)
+{
+  return (vector float)__builtin_altivec_lvewx(__a, __b);
+}
+
+/* vec_ldl */
+
+static vector signed char __ATTRS_o_ai
+vec_ldl(int __a, const vector signed char *__b)
+{
+  return (vector signed char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_ldl(int __a, const signed char *__b)
+{
+  return (vector signed char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_ldl(int __a, const vector unsigned char *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_ldl(int __a, const unsigned char *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_ldl(int __a, const vector bool char *__b)
+{
+  return (vector bool char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_ldl(int __a, const vector short *__b)
+{
+  return (vector short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_ldl(int __a, const short *__b)
+{
+  return (vector short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_ldl(int __a, const vector unsigned short *__b)
+{
+  return (vector unsigned short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_ldl(int __a, const unsigned short *__b)
+{
+  return (vector unsigned short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_ldl(int __a, const vector bool short *__b)
+{
+  return (vector bool short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_ldl(int __a, const vector pixel *__b)
+{
+  return (vector pixel)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_ldl(int __a, const vector int *__b)
+{
+  return (vector int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_ldl(int __a, const int *__b)
+{
+  return (vector int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_ldl(int __a, const vector unsigned int *__b)
+{
+  return (vector unsigned int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_ldl(int __a, const unsigned int *__b)
+{
+  return (vector unsigned int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_ldl(int __a, const vector bool int *__b)
+{
+  return (vector bool int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector float __ATTRS_o_ai
+vec_ldl(int __a, const vector float *__b)
+{
+  return (vector float)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector float __ATTRS_o_ai
+vec_ldl(int __a, const float *__b)
+{
+  return (vector float)__builtin_altivec_lvxl(__a, __b);
+}
+
+/* vec_lvxl */
+
+static vector signed char __ATTRS_o_ai
+vec_lvxl(int __a, const vector signed char *__b)
+{
+  return (vector signed char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_lvxl(int __a, const signed char *__b)
+{
+  return (vector signed char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvxl(int __a, const vector unsigned char *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvxl(int __a, const unsigned char *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_lvxl(int __a, const vector bool char *__b)
+{
+  return (vector bool char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_lvxl(int __a, const vector short *__b)
+{
+  return (vector short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_lvxl(int __a, const short *__b)
+{
+  return (vector short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_lvxl(int __a, const vector unsigned short *__b)
+{
+  return (vector unsigned short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_lvxl(int __a, const unsigned short *__b)
+{
+  return (vector unsigned short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_lvxl(int __a, const vector bool short *__b)
+{
+  return (vector bool short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_lvxl(int __a, const vector pixel *__b)
+{
+  return (vector pixel)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_lvxl(int __a, const vector int *__b)
+{
+  return (vector int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_lvxl(int __a, const int *__b)
+{
+  return (vector int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_lvxl(int __a, const vector unsigned int *__b)
+{
+  return (vector unsigned int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_lvxl(int __a, const unsigned int *__b)
+{
+  return (vector unsigned int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_lvxl(int __a, const vector bool int *__b)
+{
+  return (vector bool int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector float __ATTRS_o_ai
+vec_lvxl(int __a, const vector float *__b)
+{
+  return (vector float)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector float __ATTRS_o_ai
+vec_lvxl(int __a, const float *__b)
+{
+  return (vector float)__builtin_altivec_lvxl(__a, __b);
+}
+
+/* vec_loge */
+
+static vector float __attribute__((__always_inline__))
+vec_loge(vector float __a)
+{
+  return __builtin_altivec_vlogefp(__a);
+}
+
+/* vec_vlogefp */
+
+static vector float __attribute__((__always_inline__))
+vec_vlogefp(vector float __a)
+{
+  return __builtin_altivec_vlogefp(__a);
+}
+
+/* vec_lvsl */
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvsl(int __a, const signed char *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvsl(int __a, const unsigned char *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvsl(int __a, const short *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvsl(int __a, const unsigned short *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvsl(int __a, const int *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvsl(int __a, const unsigned int *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvsl(int __a, const float *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+}
+
+/* vec_lvsr */
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvsr(int __a, const signed char *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvsr(int __a, const unsigned char *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvsr(int __a, const short *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvsr(int __a, const unsigned short *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvsr(int __a, const int *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvsr(int __a, const unsigned int *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvsr(int __a, const float *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+}
+
+/* vec_madd */
+
+static vector float __attribute__((__always_inline__))
+vec_madd(vector float __a, vector float __b, vector float __c)
+{
+  return __builtin_altivec_vmaddfp(__a, __b, __c);
+}
+
+/* vec_vmaddfp */
+
+static vector float __attribute__((__always_inline__))
+vec_vmaddfp(vector float __a, vector float __b, vector float __c)
+{
+  return __builtin_altivec_vmaddfp(__a, __b, __c);
+}
+
+/* vec_madds */
+
+static vector signed short __attribute__((__always_inline__))
+vec_madds(vector signed short __a, vector signed short __b, vector signed short __c)
+{
+  return __builtin_altivec_vmhaddshs(__a, __b, __c);
+}
+
+/* vec_vmhaddshs */
+static vector signed short __attribute__((__always_inline__))
+vec_vmhaddshs(vector signed short __a,
+              vector signed short __b,
+              vector signed short __c)
+{
+  return __builtin_altivec_vmhaddshs(__a, __b, __c);
+}
+
+/* vec_max */
+
+static vector signed char __ATTRS_o_ai
+vec_max(vector signed char __a, vector signed char __b)
+{
+  return __builtin_altivec_vmaxsb(__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_max(vector bool char __a, vector signed char __b)
+{
+  return __builtin_altivec_vmaxsb((vector signed char)__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_max(vector signed char __a, vector bool char __b)
+{
+  return __builtin_altivec_vmaxsb(__a, (vector signed char)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_max(vector unsigned char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vmaxub(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_max(vector bool char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vmaxub((vector unsigned char)__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_max(vector unsigned char __a, vector bool char __b)
+{
+  return __builtin_altivec_vmaxub(__a, (vector unsigned char)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_max(vector short __a, vector short __b)
+{
+  return __builtin_altivec_vmaxsh(__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_max(vector bool short __a, vector short __b)
+{
+  return __builtin_altivec_vmaxsh((vector short)__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_max(vector short __a, vector bool short __b)
+{
+  return __builtin_altivec_vmaxsh(__a, (vector short)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_max(vector unsigned short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vmaxuh(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_max(vector bool short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vmaxuh((vector unsigned short)__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_max(vector unsigned short __a, vector bool short __b)
+{
+  return __builtin_altivec_vmaxuh(__a, (vector unsigned short)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_max(vector int __a, vector int __b)
+{
+  return __builtin_altivec_vmaxsw(__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_max(vector bool int __a, vector int __b)
+{
+  return __builtin_altivec_vmaxsw((vector int)__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_max(vector int __a, vector bool int __b)
+{
+  return __builtin_altivec_vmaxsw(__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_max(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vmaxuw(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_max(vector bool int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vmaxuw((vector unsigned int)__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_max(vector unsigned int __a, vector bool int __b)
+{
+  return __builtin_altivec_vmaxuw(__a, (vector unsigned int)__b);
+}
+
+static vector float __ATTRS_o_ai
+vec_max(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vmaxfp(__a, __b);
+}
+
+/* vec_vmaxsb */
+
+static vector signed char __ATTRS_o_ai
+vec_vmaxsb(vector signed char __a, vector signed char __b)
+{
+  return __builtin_altivec_vmaxsb(__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vmaxsb(vector bool char __a, vector signed char __b)
+{
+  return __builtin_altivec_vmaxsb((vector signed char)__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vmaxsb(vector signed char __a, vector bool char __b)
+{
+  return __builtin_altivec_vmaxsb(__a, (vector signed char)__b);
+}
+
+/* vec_vmaxub */
+
+static vector unsigned char __ATTRS_o_ai
+vec_vmaxub(vector unsigned char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vmaxub(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vmaxub(vector bool char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vmaxub((vector unsigned char)__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vmaxub(vector unsigned char __a, vector bool char __b)
+{
+  return __builtin_altivec_vmaxub(__a, (vector unsigned char)__b);
+}
+
+/* vec_vmaxsh */
+
+static vector short __ATTRS_o_ai
+vec_vmaxsh(vector short __a, vector short __b)
+{
+  return __builtin_altivec_vmaxsh(__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_vmaxsh(vector bool short __a, vector short __b)
+{
+  return __builtin_altivec_vmaxsh((vector short)__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_vmaxsh(vector short __a, vector bool short __b)
+{
+  return __builtin_altivec_vmaxsh(__a, (vector short)__b);
+}
+
+/* vec_vmaxuh */
+
+static vector unsigned short __ATTRS_o_ai
+vec_vmaxuh(vector unsigned short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vmaxuh(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vmaxuh(vector bool short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vmaxuh((vector unsigned short)__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vmaxuh(vector unsigned short __a, vector bool short __b)
+{
+  return __builtin_altivec_vmaxuh(__a, (vector unsigned short)__b);
+}
+
+/* vec_vmaxsw */
+
+static vector int __ATTRS_o_ai
+vec_vmaxsw(vector int __a, vector int __b)
+{
+  return __builtin_altivec_vmaxsw(__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_vmaxsw(vector bool int __a, vector int __b)
+{
+  return __builtin_altivec_vmaxsw((vector int)__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_vmaxsw(vector int __a, vector bool int __b)
+{
+  return __builtin_altivec_vmaxsw(__a, (vector int)__b);
+}
+
+/* vec_vmaxuw */
+
+static vector unsigned int __ATTRS_o_ai
+vec_vmaxuw(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vmaxuw(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vmaxuw(vector bool int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vmaxuw((vector unsigned int)__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vmaxuw(vector unsigned int __a, vector bool int __b)
+{
+  return __builtin_altivec_vmaxuw(__a, (vector unsigned int)__b);
+}
+
+/* vec_vmaxfp */
+
+static vector float __attribute__((__always_inline__))
+vec_vmaxfp(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vmaxfp(__a, __b);
+}
+
+/* vec_mergeh */
+
+static vector signed char __ATTRS_o_ai
+vec_mergeh(vector signed char __a, vector signed char __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x10, 0x01, 0x11, 0x02, 0x12, 0x03, 0x13, 
+     0x04, 0x14, 0x05, 0x15, 0x06, 0x16, 0x07, 0x17));
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_mergeh(vector unsigned char __a, vector unsigned char __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x10, 0x01, 0x11, 0x02, 0x12, 0x03, 0x13, 
+     0x04, 0x14, 0x05, 0x15, 0x06, 0x16, 0x07, 0x17));
+}
+
+static vector bool char __ATTRS_o_ai
+vec_mergeh(vector bool char __a, vector bool char __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x10, 0x01, 0x11, 0x02, 0x12, 0x03, 0x13, 
+     0x04, 0x14, 0x05, 0x15, 0x06, 0x16, 0x07, 0x17));
+}
+
+static vector short __ATTRS_o_ai
+vec_mergeh(vector short __a, vector short __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13,
+     0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17));
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_mergeh(vector unsigned short __a, vector unsigned short __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13,
+     0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17));
+}
+
+static vector bool short __ATTRS_o_ai
+vec_mergeh(vector bool short __a, vector bool short __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13,
+     0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17));
+}
+
+static vector pixel __ATTRS_o_ai
+vec_mergeh(vector pixel __a, vector pixel __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13,
+     0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17));
+}
+
+static vector int __ATTRS_o_ai
+vec_mergeh(vector int __a, vector int __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+     0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17));
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_mergeh(vector unsigned int __a, vector unsigned int __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+     0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17));
+}
+
+static vector bool int __ATTRS_o_ai
+vec_mergeh(vector bool int __a, vector bool int __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+     0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17));
+}
+
+static vector float __ATTRS_o_ai
+vec_mergeh(vector float __a, vector float __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+     0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17));
+}
+
+/* vec_vmrghb */
+
+#define __builtin_altivec_vmrghb vec_vmrghb
+
+static vector signed char __ATTRS_o_ai
+vec_vmrghb(vector signed char __a, vector signed char __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x10, 0x01, 0x11, 0x02, 0x12, 0x03, 0x13, 
+     0x04, 0x14, 0x05, 0x15, 0x06, 0x16, 0x07, 0x17));
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vmrghb(vector unsigned char __a, vector unsigned char __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x10, 0x01, 0x11, 0x02, 0x12, 0x03, 0x13, 
+     0x04, 0x14, 0x05, 0x15, 0x06, 0x16, 0x07, 0x17));
+}
+
+static vector bool char __ATTRS_o_ai
+vec_vmrghb(vector bool char __a, vector bool char __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x10, 0x01, 0x11, 0x02, 0x12, 0x03, 0x13, 
+     0x04, 0x14, 0x05, 0x15, 0x06, 0x16, 0x07, 0x17));
+}
+
+/* vec_vmrghh */
+
+#define __builtin_altivec_vmrghh vec_vmrghh
+
+static vector short __ATTRS_o_ai
+vec_vmrghh(vector short __a, vector short __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13,
+     0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17));
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vmrghh(vector unsigned short __a, vector unsigned short __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13,
+     0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17));
+}
+
+static vector bool short __ATTRS_o_ai
+vec_vmrghh(vector bool short __a, vector bool short __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13,
+     0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17));
+}
+
+static vector pixel __ATTRS_o_ai
+vec_vmrghh(vector pixel __a, vector pixel __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13,
+     0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17));
+}
+
+/* vec_vmrghw */
+
+#define __builtin_altivec_vmrghw vec_vmrghw
+
+static vector int __ATTRS_o_ai
+vec_vmrghw(vector int __a, vector int __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+     0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17));
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vmrghw(vector unsigned int __a, vector unsigned int __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+     0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17));
+}
+
+static vector bool int __ATTRS_o_ai
+vec_vmrghw(vector bool int __a, vector bool int __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+     0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17));
+}
+
+static vector float __ATTRS_o_ai
+vec_vmrghw(vector float __a, vector float __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+     0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17));
+}
+
+/* vec_mergel */
+
+static vector signed char __ATTRS_o_ai
+vec_mergel(vector signed char __a, vector signed char __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x08, 0x18, 0x09, 0x19, 0x0A, 0x1A, 0x0B, 0x1B, 
+     0x0C, 0x1C, 0x0D, 0x1D, 0x0E, 0x1E, 0x0F, 0x1F));
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_mergel(vector unsigned char __a, vector unsigned char __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x08, 0x18, 0x09, 0x19, 0x0A, 0x1A, 0x0B, 0x1B, 
+     0x0C, 0x1C, 0x0D, 0x1D, 0x0E, 0x1E, 0x0F, 0x1F));
+}
+
+static vector bool char __ATTRS_o_ai
+vec_mergel(vector bool char __a, vector bool char __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x08, 0x18, 0x09, 0x19, 0x0A, 0x1A, 0x0B, 0x1B, 
+     0x0C, 0x1C, 0x0D, 0x1D, 0x0E, 0x1E, 0x0F, 0x1F));
+}
+
+static vector short __ATTRS_o_ai
+vec_mergel(vector short __a, vector short __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x08, 0x09, 0x18, 0x19, 0x0A, 0x0B, 0x1A, 0x1B,
+     0x0C, 0x0D, 0x1C, 0x1D, 0x0E, 0x0F, 0x1E, 0x1F));
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_mergel(vector unsigned short __a, vector unsigned short __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x08, 0x09, 0x18, 0x19, 0x0A, 0x0B, 0x1A, 0x1B,
+     0x0C, 0x0D, 0x1C, 0x1D, 0x0E, 0x0F, 0x1E, 0x1F));
+}
+
+static vector bool short __ATTRS_o_ai
+vec_mergel(vector bool short __a, vector bool short __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x08, 0x09, 0x18, 0x19, 0x0A, 0x0B, 0x1A, 0x1B,
+     0x0C, 0x0D, 0x1C, 0x1D, 0x0E, 0x0F, 0x1E, 0x1F));
+}
+
+static vector pixel __ATTRS_o_ai
+vec_mergel(vector pixel __a, vector pixel __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x08, 0x09, 0x18, 0x19, 0x0A, 0x0B, 0x1A, 0x1B,
+     0x0C, 0x0D, 0x1C, 0x1D, 0x0E, 0x0F, 0x1E, 0x1F));
+}
+
+static vector int __ATTRS_o_ai
+vec_mergel(vector int __a, vector int __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B,
+     0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_mergel(vector unsigned int __a, vector unsigned int __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B,
+     0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+static vector bool int __ATTRS_o_ai
+vec_mergel(vector bool int __a, vector bool int __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B,
+     0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+static vector float __ATTRS_o_ai
+vec_mergel(vector float __a, vector float __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B,
+     0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+/* vec_vmrglb */
+
+#define __builtin_altivec_vmrglb vec_vmrglb
+
+static vector signed char __ATTRS_o_ai
+vec_vmrglb(vector signed char __a, vector signed char __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x08, 0x18, 0x09, 0x19, 0x0A, 0x1A, 0x0B, 0x1B, 
+     0x0C, 0x1C, 0x0D, 0x1D, 0x0E, 0x1E, 0x0F, 0x1F));
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vmrglb(vector unsigned char __a, vector unsigned char __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x08, 0x18, 0x09, 0x19, 0x0A, 0x1A, 0x0B, 0x1B, 
+     0x0C, 0x1C, 0x0D, 0x1D, 0x0E, 0x1E, 0x0F, 0x1F));
+}
+
+static vector bool char __ATTRS_o_ai
+vec_vmrglb(vector bool char __a, vector bool char __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x08, 0x18, 0x09, 0x19, 0x0A, 0x1A, 0x0B, 0x1B, 
+     0x0C, 0x1C, 0x0D, 0x1D, 0x0E, 0x1E, 0x0F, 0x1F));
+}
+
+/* vec_vmrglh */
+
+#define __builtin_altivec_vmrglh vec_vmrglh
+
+static vector short __ATTRS_o_ai
+vec_vmrglh(vector short __a, vector short __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x08, 0x09, 0x18, 0x19, 0x0A, 0x0B, 0x1A, 0x1B,
+     0x0C, 0x0D, 0x1C, 0x1D, 0x0E, 0x0F, 0x1E, 0x1F));
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vmrglh(vector unsigned short __a, vector unsigned short __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x08, 0x09, 0x18, 0x19, 0x0A, 0x0B, 0x1A, 0x1B,
+     0x0C, 0x0D, 0x1C, 0x1D, 0x0E, 0x0F, 0x1E, 0x1F));
+}
+
+static vector bool short __ATTRS_o_ai
+vec_vmrglh(vector bool short __a, vector bool short __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x08, 0x09, 0x18, 0x19, 0x0A, 0x0B, 0x1A, 0x1B,
+     0x0C, 0x0D, 0x1C, 0x1D, 0x0E, 0x0F, 0x1E, 0x1F));
+}
+
+static vector pixel __ATTRS_o_ai
+vec_vmrglh(vector pixel __a, vector pixel __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x08, 0x09, 0x18, 0x19, 0x0A, 0x0B, 0x1A, 0x1B,
+     0x0C, 0x0D, 0x1C, 0x1D, 0x0E, 0x0F, 0x1E, 0x1F));
+}
+
+/* vec_vmrglw */
+
+#define __builtin_altivec_vmrglw vec_vmrglw
+
+static vector int __ATTRS_o_ai
+vec_vmrglw(vector int __a, vector int __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B,
+     0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vmrglw(vector unsigned int __a, vector unsigned int __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B,
+     0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+static vector bool int __ATTRS_o_ai
+vec_vmrglw(vector bool int __a, vector bool int __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B,
+     0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+static vector float __ATTRS_o_ai
+vec_vmrglw(vector float __a, vector float __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B,
+     0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+/* vec_mfvscr */
+
+static vector unsigned short __attribute__((__always_inline__))
+vec_mfvscr(void)
+{
+  return __builtin_altivec_mfvscr();
+}
+
+/* vec_min */
+
+static vector signed char __ATTRS_o_ai
+vec_min(vector signed char __a, vector signed char __b)
+{
+  return __builtin_altivec_vminsb(__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_min(vector bool char __a, vector signed char __b)
+{
+  return __builtin_altivec_vminsb((vector signed char)__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_min(vector signed char __a, vector bool char __b)
+{
+  return __builtin_altivec_vminsb(__a, (vector signed char)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_min(vector unsigned char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vminub(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_min(vector bool char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vminub((vector unsigned char)__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_min(vector unsigned char __a, vector bool char __b)
+{
+  return __builtin_altivec_vminub(__a, (vector unsigned char)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_min(vector short __a, vector short __b)
+{
+  return __builtin_altivec_vminsh(__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_min(vector bool short __a, vector short __b)
+{
+  return __builtin_altivec_vminsh((vector short)__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_min(vector short __a, vector bool short __b)
+{
+  return __builtin_altivec_vminsh(__a, (vector short)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_min(vector unsigned short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vminuh(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_min(vector bool short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vminuh((vector unsigned short)__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_min(vector unsigned short __a, vector bool short __b)
+{
+  return __builtin_altivec_vminuh(__a, (vector unsigned short)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_min(vector int __a, vector int __b)
+{
+  return __builtin_altivec_vminsw(__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_min(vector bool int __a, vector int __b)
+{
+  return __builtin_altivec_vminsw((vector int)__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_min(vector int __a, vector bool int __b)
+{
+  return __builtin_altivec_vminsw(__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_min(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vminuw(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_min(vector bool int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vminuw((vector unsigned int)__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_min(vector unsigned int __a, vector bool int __b)
+{
+  return __builtin_altivec_vminuw(__a, (vector unsigned int)__b);
+}
+
+static vector float __ATTRS_o_ai
+vec_min(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vminfp(__a, __b);
+}
+
+/* vec_vminsb */
+
+static vector signed char __ATTRS_o_ai
+vec_vminsb(vector signed char __a, vector signed char __b)
+{
+  return __builtin_altivec_vminsb(__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vminsb(vector bool char __a, vector signed char __b)
+{
+  return __builtin_altivec_vminsb((vector signed char)__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vminsb(vector signed char __a, vector bool char __b)
+{
+  return __builtin_altivec_vminsb(__a, (vector signed char)__b);
+}
+
+/* vec_vminub */
+
+static vector unsigned char __ATTRS_o_ai
+vec_vminub(vector unsigned char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vminub(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vminub(vector bool char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vminub((vector unsigned char)__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vminub(vector unsigned char __a, vector bool char __b)
+{
+  return __builtin_altivec_vminub(__a, (vector unsigned char)__b);
+}
+
+/* vec_vminsh */
+
+static vector short __ATTRS_o_ai
+vec_vminsh(vector short __a, vector short __b)
+{
+  return __builtin_altivec_vminsh(__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_vminsh(vector bool short __a, vector short __b)
+{
+  return __builtin_altivec_vminsh((vector short)__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_vminsh(vector short __a, vector bool short __b)
+{
+  return __builtin_altivec_vminsh(__a, (vector short)__b);
+}
+
+/* vec_vminuh */
+
+static vector unsigned short __ATTRS_o_ai
+vec_vminuh(vector unsigned short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vminuh(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vminuh(vector bool short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vminuh((vector unsigned short)__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vminuh(vector unsigned short __a, vector bool short __b)
+{
+  return __builtin_altivec_vminuh(__a, (vector unsigned short)__b);
+}
+
+/* vec_vminsw */
+
+static vector int __ATTRS_o_ai
+vec_vminsw(vector int __a, vector int __b)
+{
+  return __builtin_altivec_vminsw(__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_vminsw(vector bool int __a, vector int __b)
+{
+  return __builtin_altivec_vminsw((vector int)__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_vminsw(vector int __a, vector bool int __b)
+{
+  return __builtin_altivec_vminsw(__a, (vector int)__b);
+}
+
+/* vec_vminuw */
+
+static vector unsigned int __ATTRS_o_ai
+vec_vminuw(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vminuw(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vminuw(vector bool int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vminuw((vector unsigned int)__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vminuw(vector unsigned int __a, vector bool int __b)
+{
+  return __builtin_altivec_vminuw(__a, (vector unsigned int)__b);
+}
+
+/* vec_vminfp */
+
+static vector float __attribute__((__always_inline__))
+vec_vminfp(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vminfp(__a, __b);
+}
+
+/* vec_mladd */
+
+#define __builtin_altivec_vmladduhm vec_mladd
+
+static vector short __ATTRS_o_ai
+vec_mladd(vector short __a, vector short __b, vector short __c)
+{
+  return __a * __b + __c;
+}
+
+static vector short __ATTRS_o_ai
+vec_mladd(vector short __a, vector unsigned short __b, vector unsigned short __c)
+{
+  return __a * (vector short)__b + (vector short)__c;
+}
+
+static vector short __ATTRS_o_ai
+vec_mladd(vector unsigned short __a, vector short __b, vector short __c)
+{
+  return (vector short)__a * __b + __c;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_mladd(vector unsigned short __a,
+          vector unsigned short __b,
+          vector unsigned short __c)
+{
+  return __a * __b + __c;
+}
+
+/* vec_vmladduhm */
+
+static vector short __ATTRS_o_ai
+vec_vmladduhm(vector short __a, vector short __b, vector short __c)
+{
+  return __a * __b + __c;
+}
+
+static vector short __ATTRS_o_ai
+vec_vmladduhm(vector short __a, vector unsigned short __b, vector unsigned short __c)
+{
+  return __a * (vector short)__b + (vector short)__c;
+}
+
+static vector short __ATTRS_o_ai
+vec_vmladduhm(vector unsigned short __a, vector short __b, vector short __c)
+{
+  return (vector short)__a * __b + __c;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vmladduhm(vector unsigned short __a,
+              vector unsigned short __b,
+              vector unsigned short __c)
+{
+  return __a * __b + __c;
+}
+
+/* vec_mradds */
+
+static vector short __attribute__((__always_inline__))
+vec_mradds(vector short __a, vector short __b, vector short __c)
+{
+  return __builtin_altivec_vmhraddshs(__a, __b, __c);
+}
+
+/* vec_vmhraddshs */
+
+static vector short __attribute__((__always_inline__))
+vec_vmhraddshs(vector short __a, vector short __b, vector short __c)
+{
+  return __builtin_altivec_vmhraddshs(__a, __b, __c);
+}
+
+/* vec_msum */
+
+static vector int __ATTRS_o_ai
+vec_msum(vector signed char __a, vector unsigned char __b, vector int __c)
+{
+  return __builtin_altivec_vmsummbm(__a, __b, __c);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_msum(vector unsigned char __a, vector unsigned char __b, vector unsigned int __c)
+{
+  return __builtin_altivec_vmsumubm(__a, __b, __c);
+}
+
+static vector int __ATTRS_o_ai
+vec_msum(vector short __a, vector short __b, vector int __c)
+{
+  return __builtin_altivec_vmsumshm(__a, __b, __c);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_msum(vector unsigned short __a,
+         vector unsigned short __b,
+         vector unsigned int __c)
+{
+  return __builtin_altivec_vmsumuhm(__a, __b, __c);
+}
+
+/* vec_vmsummbm */
+
+static vector int __attribute__((__always_inline__))
+vec_vmsummbm(vector signed char __a, vector unsigned char __b, vector int __c)
+{
+  return __builtin_altivec_vmsummbm(__a, __b, __c);
+}
+
+/* vec_vmsumubm */
+
+static vector unsigned int __attribute__((__always_inline__))
+vec_vmsumubm(vector unsigned char __a,
+             vector unsigned char __b,
+             vector unsigned int __c)
+{
+  return __builtin_altivec_vmsumubm(__a, __b, __c);
+}
+
+/* vec_vmsumshm */
+
+static vector int __attribute__((__always_inline__))
+vec_vmsumshm(vector short __a, vector short __b, vector int __c)
+{
+  return __builtin_altivec_vmsumshm(__a, __b, __c);
+}
+
+/* vec_vmsumuhm */
+
+static vector unsigned int __attribute__((__always_inline__))
+vec_vmsumuhm(vector unsigned short __a,
+             vector unsigned short __b,
+             vector unsigned int __c)
+{
+  return __builtin_altivec_vmsumuhm(__a, __b, __c);
+}
+
+/* vec_msums */
+
+static vector int __ATTRS_o_ai
+vec_msums(vector short __a, vector short __b, vector int __c)
+{
+  return __builtin_altivec_vmsumshs(__a, __b, __c);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_msums(vector unsigned short __a,
+          vector unsigned short __b,
+          vector unsigned int __c)
+{
+  return __builtin_altivec_vmsumuhs(__a, __b, __c);
+}
+
+/* vec_vmsumshs */
+
+static vector int __attribute__((__always_inline__))
+vec_vmsumshs(vector short __a, vector short __b, vector int __c)
+{
+  return __builtin_altivec_vmsumshs(__a, __b, __c);
+}
+
+/* vec_vmsumuhs */
+
+static vector unsigned int __attribute__((__always_inline__))
+vec_vmsumuhs(vector unsigned short __a,
+             vector unsigned short __b,
+             vector unsigned int __c)
+{
+  return __builtin_altivec_vmsumuhs(__a, __b, __c);
+}
+
+/* vec_mtvscr */
+
+static void __ATTRS_o_ai
+vec_mtvscr(vector signed char __a)
+{
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static void __ATTRS_o_ai
+vec_mtvscr(vector unsigned char __a)
+{
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static void __ATTRS_o_ai
+vec_mtvscr(vector bool char __a)
+{
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static void __ATTRS_o_ai
+vec_mtvscr(vector short __a)
+{
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static void __ATTRS_o_ai
+vec_mtvscr(vector unsigned short __a)
+{
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static void __ATTRS_o_ai
+vec_mtvscr(vector bool short __a)
+{
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static void __ATTRS_o_ai
+vec_mtvscr(vector pixel __a)
+{
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static void __ATTRS_o_ai
+vec_mtvscr(vector int __a)
+{
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static void __ATTRS_o_ai
+vec_mtvscr(vector unsigned int __a)
+{
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static void __ATTRS_o_ai
+vec_mtvscr(vector bool int __a)
+{
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static void __ATTRS_o_ai
+vec_mtvscr(vector float __a)
+{
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+/* The vmulos* and vmules* instructions have a big endian bias, so
+   we must reverse the meaning of "even" and "odd" for little endian.  */
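+/* Editor's illustrative sketch (not part of the upstream header): with the
+ * hypothetical values
+ *   vector short __x = {1, 2, 3, 4, 5, 6, 7, 8};
+ *   vector short __y = {10, 10, 10, 10, 10, 10, 10, 10};
+ * vec_mule(__x, __y) is expected to yield the products of the even array
+ * indices, {10, 30, 50, 70}, and vec_mulo(__x, __y) the odd ones,
+ * {20, 40, 60, 80}, on either endianness; the #ifdef blocks below only swap
+ * which hardware instruction is issued so that "even" and "odd" keep
+ * referring to array-order element indices on little endian.  */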
+
+/* vec_mule */
+
+static vector short __ATTRS_o_ai
+vec_mule(vector signed char __a, vector signed char __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulosb(__a, __b);
+#else
+  return __builtin_altivec_vmulesb(__a, __b);
+#endif
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_mule(vector unsigned char __a, vector unsigned char __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmuloub(__a, __b);
+#else
+  return __builtin_altivec_vmuleub(__a, __b);
+#endif
+}
+
+static vector int __ATTRS_o_ai
+vec_mule(vector short __a, vector short __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulosh(__a, __b);
+#else
+  return __builtin_altivec_vmulesh(__a, __b);
+#endif
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_mule(vector unsigned short __a, vector unsigned short __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulouh(__a, __b);
+#else
+  return __builtin_altivec_vmuleuh(__a, __b);
+#endif
+}
+
+/* vec_vmulesb */
+
+static vector short __attribute__((__always_inline__))
+vec_vmulesb(vector signed char __a, vector signed char __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulosb(__a, __b);
+#else
+  return __builtin_altivec_vmulesb(__a, __b);
+#endif
+}
+
+/* vec_vmuleub */
+
+static vector unsigned short __attribute__((__always_inline__))
+vec_vmuleub(vector unsigned char __a, vector unsigned char __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmuloub(__a, __b);
+#else
+  return __builtin_altivec_vmuleub(__a, __b);
+#endif
+}
+
+/* vec_vmulesh */
+
+static vector int __attribute__((__always_inline__))
+vec_vmulesh(vector short __a, vector short __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulosh(__a, __b);
+#else
+  return __builtin_altivec_vmulesh(__a, __b);
+#endif
+}
+
+/* vec_vmuleuh */
+
+static vector unsigned int __attribute__((__always_inline__))
+vec_vmuleuh(vector unsigned short __a, vector unsigned short __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulouh(__a, __b);
+#else
+  return __builtin_altivec_vmuleuh(__a, __b);
+#endif
+}
+
+/* vec_mulo */
+
+static vector short __ATTRS_o_ai
+vec_mulo(vector signed char __a, vector signed char __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulesb(__a, __b);
+#else
+  return __builtin_altivec_vmulosb(__a, __b);
+#endif
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_mulo(vector unsigned char __a, vector unsigned char __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmuleub(__a, __b);
+#else
+  return __builtin_altivec_vmuloub(__a, __b);
+#endif
+}
+
+static vector int __ATTRS_o_ai
+vec_mulo(vector short __a, vector short __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulesh(__a, __b);
+#else
+  return __builtin_altivec_vmulosh(__a, __b);
+#endif
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_mulo(vector unsigned short __a, vector unsigned short __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmuleuh(__a, __b);
+#else
+  return __builtin_altivec_vmulouh(__a, __b);
+#endif
+}
+
+/* vec_vmulosb */
+
+static vector short __attribute__((__always_inline__))
+vec_vmulosb(vector signed char __a, vector signed char __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulesb(__a, __b);
+#else
+  return __builtin_altivec_vmulosb(__a, __b);
+#endif
+}
+
+/* vec_vmuloub */
+
+static vector unsigned short __attribute__((__always_inline__))
+vec_vmuloub(vector unsigned char __a, vector unsigned char __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmuleub(__a, __b);
+#else
+  return __builtin_altivec_vmuloub(__a, __b);
+#endif
+}
+
+/* vec_vmulosh */
+
+static vector int __attribute__((__always_inline__))
+vec_vmulosh(vector short __a, vector short __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulesh(__a, __b);
+#else
+  return __builtin_altivec_vmulosh(__a, __b);
+#endif
+}
+
+/* vec_vmulouh */
+
+static vector unsigned int __attribute__((__always_inline__))
+vec_vmulouh(vector unsigned short __a, vector unsigned short __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmuleuh(__a, __b);
+#else
+  return __builtin_altivec_vmulouh(__a, __b);
+#endif
+}
+
+/* vec_nmsub */
+
+static vector float __attribute__((__always_inline__))
+vec_nmsub(vector float __a, vector float __b, vector float __c)
+{
+  return __builtin_altivec_vnmsubfp(__a, __b, __c);
+}
+
+/* vec_vnmsubfp */
+
+static vector float __attribute__((__always_inline__))
+vec_vnmsubfp(vector float __a, vector float __b, vector float __c)
+{
+  return __builtin_altivec_vnmsubfp(__a, __b, __c);
+}
+
+/* vec_nor */
+
+#define __builtin_altivec_vnor vec_nor
+
+static vector signed char __ATTRS_o_ai
+vec_nor(vector signed char __a, vector signed char __b)
+{
+  return ~(__a | __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_nor(vector unsigned char __a, vector unsigned char __b)
+{
+  return ~(__a | __b);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_nor(vector bool char __a, vector bool char __b)
+{
+  return ~(__a | __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_nor(vector short __a, vector short __b)
+{
+  return ~(__a | __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_nor(vector unsigned short __a, vector unsigned short __b)
+{
+  return ~(__a | __b);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_nor(vector bool short __a, vector bool short __b)
+{
+  return ~(__a | __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_nor(vector int __a, vector int __b)
+{
+  return ~(__a | __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_nor(vector unsigned int __a, vector unsigned int __b)
+{
+  return ~(__a | __b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_nor(vector bool int __a, vector bool int __b)
+{
+  return ~(__a | __b);
+}
+
+static vector float __ATTRS_o_ai
+vec_nor(vector float __a, vector float __b)
+{
+  vector unsigned int __res = ~((vector unsigned int)__a | (vector unsigned int)__b);
+  return (vector float)__res;
+}
+
+/* vec_vnor */
+
+static vector signed char __ATTRS_o_ai
+vec_vnor(vector signed char __a, vector signed char __b)
+{
+  return ~(__a | __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vnor(vector unsigned char __a, vector unsigned char __b)
+{
+  return ~(__a | __b);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_vnor(vector bool char __a, vector bool char __b)
+{
+  return ~(__a | __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_vnor(vector short __a, vector short __b)
+{
+  return ~(__a | __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vnor(vector unsigned short __a, vector unsigned short __b)
+{
+  return ~(__a | __b);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_vnor(vector bool short __a, vector bool short __b)
+{
+  return ~(__a | __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_vnor(vector int __a, vector int __b)
+{
+  return ~(__a | __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vnor(vector unsigned int __a, vector unsigned int __b)
+{
+  return ~(__a | __b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_vnor(vector bool int __a, vector bool int __b)
+{
+  return ~(__a | __b);
+}
+
+static vector float __ATTRS_o_ai
+vec_vnor(vector float __a, vector float __b)
+{
+  vector unsigned int __res = ~((vector unsigned int)__a | (vector unsigned int)__b);
+  return (vector float)__res;
+}
+
+/* vec_or */
+
+#define __builtin_altivec_vor vec_or
+
+static vector signed char __ATTRS_o_ai
+vec_or(vector signed char __a, vector signed char __b)
+{
+  return __a | __b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_or(vector bool char __a, vector signed char __b)
+{
+  return (vector signed char)__a | __b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_or(vector signed char __a, vector bool char __b)
+{
+  return __a | (vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_or(vector unsigned char __a, vector unsigned char __b)
+{
+  return __a | __b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_or(vector bool char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)__a | __b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_or(vector unsigned char __a, vector bool char __b)
+{
+  return __a | (vector unsigned char)__b;
+}
+
+static vector bool char __ATTRS_o_ai
+vec_or(vector bool char __a, vector bool char __b)
+{
+  return __a | __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_or(vector short __a, vector short __b)
+{
+  return __a | __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_or(vector bool short __a, vector short __b)
+{
+  return (vector short)__a | __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_or(vector short __a, vector bool short __b)
+{
+  return __a | (vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_or(vector unsigned short __a, vector unsigned short __b)
+{
+  return __a | __b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_or(vector bool short __a, vector unsigned short __b)
+{
+  return (vector unsigned short)__a | __b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_or(vector unsigned short __a, vector bool short __b)
+{
+  return __a | (vector unsigned short)__b;
+}
+
+static vector bool short __ATTRS_o_ai
+vec_or(vector bool short __a, vector bool short __b)
+{
+  return __a | __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_or(vector int __a, vector int __b)
+{
+  return __a | __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_or(vector bool int __a, vector int __b)
+{
+  return (vector int)__a | __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_or(vector int __a, vector bool int __b)
+{
+  return __a | (vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_or(vector unsigned int __a, vector unsigned int __b)
+{
+  return __a | __b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_or(vector bool int __a, vector unsigned int __b)
+{
+  return (vector unsigned int)__a | __b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_or(vector unsigned int __a, vector bool int __b)
+{
+  return __a | (vector unsigned int)__b;
+}
+
+static vector bool int __ATTRS_o_ai
+vec_or(vector bool int __a, vector bool int __b)
+{
+  return __a | __b;
+}
+
+static vector float __ATTRS_o_ai
+vec_or(vector float __a, vector float __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a | (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai
+vec_or(vector bool int __a, vector float __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a | (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai
+vec_or(vector float __a, vector bool int __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a | (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+/* vec_vor */
+
+static vector signed char __ATTRS_o_ai
+vec_vor(vector signed char __a, vector signed char __b)
+{
+  return __a | __b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vor(vector bool char __a, vector signed char __b)
+{
+  return (vector signed char)__a | __b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vor(vector signed char __a, vector bool char __b)
+{
+  return __a | (vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vor(vector unsigned char __a, vector unsigned char __b)
+{
+  return __a | __b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vor(vector bool char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)__a | __b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vor(vector unsigned char __a, vector bool char __b)
+{
+  return __a | (vector unsigned char)__b;
+}
+
+static vector bool char __ATTRS_o_ai
+vec_vor(vector bool char __a, vector bool char __b)
+{
+  return __a | __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_vor(vector short __a, vector short __b)
+{
+  return __a | __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_vor(vector bool short __a, vector short __b)
+{
+  return (vector short)__a | __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_vor(vector short __a, vector bool short __b)
+{
+  return __a | (vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vor(vector unsigned short __a, vector unsigned short __b)
+{
+  return __a | __b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vor(vector bool short __a, vector unsigned short __b)
+{
+  return (vector unsigned short)__a | __b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vor(vector unsigned short __a, vector bool short __b)
+{
+  return __a | (vector unsigned short)__b;
+}
+
+static vector bool short __ATTRS_o_ai
+vec_vor(vector bool short __a, vector bool short __b)
+{
+  return __a | __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_vor(vector int __a, vector int __b)
+{
+  return __a | __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_vor(vector bool int __a, vector int __b)
+{
+  return (vector int)__a | __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_vor(vector int __a, vector bool int __b)
+{
+  return __a | (vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vor(vector unsigned int __a, vector unsigned int __b)
+{
+  return __a | __b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vor(vector bool int __a, vector unsigned int __b)
+{
+  return (vector unsigned int)__a | __b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vor(vector unsigned int __a, vector bool int __b)
+{
+  return __a | (vector unsigned int)__b;
+}
+
+static vector bool int __ATTRS_o_ai
+vec_vor(vector bool int __a, vector bool int __b)
+{
+  return __a | __b;
+}
+
+static vector float __ATTRS_o_ai
+vec_vor(vector float __a, vector float __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a | (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai
+vec_vor(vector bool int __a, vector float __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a | (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai
+vec_vor(vector float __a, vector bool int __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a | (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+/* vec_pack */
+
+/* The various vector pack instructions have a big-endian bias, so for
+   little endian we must handle reversed element numbering.  */
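+/* Editor's illustrative sketch (not part of the upstream header): vec_pack
+ * truncates each element to its low-order half, so with the hypothetical
+ * values
+ *   vector int __x = {0x00010002, 0x00030004, 0x00050006, 0x00070008};
+ *   vector int __y = {0x0009000A, 0x000B000C, 0x000D000E, 0x000F0010};
+ * vec_pack(__x, __y) should produce the shorts {2, 4, 6, 8, 10, 12, 14, 16};
+ * the two permute masks in each overload below simply select those low
+ * halves at their big-endian or little-endian byte offsets.  */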
+
+static vector signed char __ATTRS_o_ai
+vec_pack(vector signed short __a, vector signed short __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector signed char)vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E,
+     0x10, 0x12, 0x14, 0x16, 0x18, 0x1A, 0x1C, 0x1E));
+#else
+  return (vector signed char)vec_perm(__a, __b, (vector unsigned char)
+    (0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F,
+     0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F));
+#endif
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_pack(vector unsigned short __a, vector unsigned short __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector unsigned char)vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E,
+     0x10, 0x12, 0x14, 0x16, 0x18, 0x1A, 0x1C, 0x1E));
+#else
+  return (vector unsigned char)vec_perm(__a, __b, (vector unsigned char)
+    (0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F,
+     0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F));
+#endif
+}
+
+static vector bool char __ATTRS_o_ai
+vec_pack(vector bool short __a, vector bool short __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool char)vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E,
+     0x10, 0x12, 0x14, 0x16, 0x18, 0x1A, 0x1C, 0x1E));
+#else
+  return (vector bool char)vec_perm(__a, __b, (vector unsigned char)
+    (0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F,
+     0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F));
+#endif
+}
+
+static vector short __ATTRS_o_ai
+vec_pack(vector int __a, vector int __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector short)vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0C, 0x0D,
+     0x10, 0x11, 0x14, 0x15, 0x18, 0x19, 0x1C, 0x1D));
+#else
+  return (vector short)vec_perm(__a, __b, (vector unsigned char)
+    (0x02, 0x03, 0x06, 0x07, 0x0A, 0x0B, 0x0E, 0x0F,
+     0x12, 0x13, 0x16, 0x17, 0x1A, 0x1B, 0x1E, 0x1F));
+#endif
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_pack(vector unsigned int __a, vector unsigned int __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector unsigned short)vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0C, 0x0D,
+     0x10, 0x11, 0x14, 0x15, 0x18, 0x19, 0x1C, 0x1D));
+#else
+  return (vector unsigned short)vec_perm(__a, __b, (vector unsigned char)
+    (0x02, 0x03, 0x06, 0x07, 0x0A, 0x0B, 0x0E, 0x0F,
+     0x12, 0x13, 0x16, 0x17, 0x1A, 0x1B, 0x1E, 0x1F));
+#endif
+}
+
+static vector bool short __ATTRS_o_ai
+vec_pack(vector bool int __a, vector bool int __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool short)vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0C, 0x0D,
+     0x10, 0x11, 0x14, 0x15, 0x18, 0x19, 0x1C, 0x1D));
+#else
+  return (vector bool short)vec_perm(__a, __b, (vector unsigned char)
+    (0x02, 0x03, 0x06, 0x07, 0x0A, 0x0B, 0x0E, 0x0F,
+     0x12, 0x13, 0x16, 0x17, 0x1A, 0x1B, 0x1E, 0x1F));
+#endif
+}
+
+/* vec_vpkuhum */
+
+#define __builtin_altivec_vpkuhum vec_vpkuhum
+
+static vector signed char __ATTRS_o_ai
+vec_vpkuhum(vector signed short __a, vector signed short __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector signed char)vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E,
+     0x10, 0x12, 0x14, 0x16, 0x18, 0x1A, 0x1C, 0x1E));
+#else
+  return (vector signed char)vec_perm(__a, __b, (vector unsigned char)
+    (0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F,
+     0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F));
+#endif
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vpkuhum(vector unsigned short __a, vector unsigned short __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector unsigned char)vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E,
+     0x10, 0x12, 0x14, 0x16, 0x18, 0x1A, 0x1C, 0x1E));
+#else
+  return (vector unsigned char)vec_perm(__a, __b, (vector unsigned char)
+    (0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F,
+     0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F));
+#endif
+}
+
+static vector bool char __ATTRS_o_ai
+vec_vpkuhum(vector bool short __a, vector bool short __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool char)vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E,
+     0x10, 0x12, 0x14, 0x16, 0x18, 0x1A, 0x1C, 0x1E));
+#else
+  return (vector bool char)vec_perm(__a, __b, (vector unsigned char)
+    (0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F,
+     0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F));
+#endif
+}
+
+/* vec_vpkuwum */
+
+#define __builtin_altivec_vpkuwum vec_vpkuwum
+
+static vector short __ATTRS_o_ai
+vec_vpkuwum(vector int __a, vector int __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector short)vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0C, 0x0D,
+     0x10, 0x11, 0x14, 0x15, 0x18, 0x19, 0x1C, 0x1D));
+#else
+  return (vector short)vec_perm(__a, __b, (vector unsigned char)
+    (0x02, 0x03, 0x06, 0x07, 0x0A, 0x0B, 0x0E, 0x0F,
+     0x12, 0x13, 0x16, 0x17, 0x1A, 0x1B, 0x1E, 0x1F));
+#endif
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vpkuwum(vector unsigned int __a, vector unsigned int __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector unsigned short)vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0C, 0x0D,
+     0x10, 0x11, 0x14, 0x15, 0x18, 0x19, 0x1C, 0x1D));
+#else
+  return (vector unsigned short)vec_perm(__a, __b, (vector unsigned char)
+    (0x02, 0x03, 0x06, 0x07, 0x0A, 0x0B, 0x0E, 0x0F,
+     0x12, 0x13, 0x16, 0x17, 0x1A, 0x1B, 0x1E, 0x1F));
+#endif
+}
+
+static vector bool short __ATTRS_o_ai
+vec_vpkuwum(vector bool int __a, vector bool int __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool short)vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0C, 0x0D,
+     0x10, 0x11, 0x14, 0x15, 0x18, 0x19, 0x1C, 0x1D));
+#else
+  return (vector bool short)vec_perm(__a, __b, (vector unsigned char)
+    (0x02, 0x03, 0x06, 0x07, 0x0A, 0x0B, 0x0E, 0x0F,
+     0x12, 0x13, 0x16, 0x17, 0x1A, 0x1B, 0x1E, 0x1F));
+#endif
+}
+
+/* vec_packpx */
+
+static vector pixel __attribute__((__always_inline__))
+vec_packpx(vector unsigned int __a, vector unsigned int __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector pixel)__builtin_altivec_vpkpx(__b, __a);
+#else
+  return (vector pixel)__builtin_altivec_vpkpx(__a, __b);
+#endif
+}
+
+/* vec_vpkpx */
+
+static vector pixel __attribute__((__always_inline__))
+vec_vpkpx(vector unsigned int __a, vector unsigned int __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector pixel)__builtin_altivec_vpkpx(__b, __a);
+#else
+  return (vector pixel)__builtin_altivec_vpkpx(__a, __b);
+#endif
+}
+
+/* vec_packs */
+
+static vector signed char __ATTRS_o_ai
+vec_packs(vector short __a, vector short __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkshss(__b, __a);
+#else
+  return __builtin_altivec_vpkshss(__a, __b);
+#endif
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_packs(vector unsigned short __a, vector unsigned short __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkuhus(__b, __a);
+#else
+  return __builtin_altivec_vpkuhus(__a, __b);
+#endif
+}
+
+static vector signed short __ATTRS_o_ai
+vec_packs(vector int __a, vector int __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkswss(__b, __a);
+#else
+  return __builtin_altivec_vpkswss(__a, __b);
+#endif
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_packs(vector unsigned int __a, vector unsigned int __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkuwus(__b, __a);
+#else
+  return __builtin_altivec_vpkuwus(__a, __b);
+#endif
+}
+
+/* vec_vpkshss */
+
+static vector signed char __attribute__((__always_inline__))
+vec_vpkshss(vector short __a, vector short __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkshss(__b, __a);
+#else
+  return __builtin_altivec_vpkshss(__a, __b);
+#endif
+}
+
+/* vec_vpkuhus */
+
+static vector unsigned char __attribute__((__always_inline__))
+vec_vpkuhus(vector unsigned short __a, vector unsigned short __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkuhus(__b, __a);
+#else
+  return __builtin_altivec_vpkuhus(__a, __b);
+#endif
+}
+
+/* vec_vpkswss */
+
+static vector signed short __attribute__((__always_inline__))
+vec_vpkswss(vector int __a, vector int __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkswss(__b, __a);
+#else
+  return __builtin_altivec_vpkswss(__a, __b);
+#endif
+}
+
+/* vec_vpkuwus */
+
+static vector unsigned short __attribute__((__always_inline__))
+vec_vpkuwus(vector unsigned int __a, vector unsigned int __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkuwus(__b, __a);
+#else
+  return __builtin_altivec_vpkuwus(__a, __b);
+#endif
+}
+
+/* vec_packsu */
+
+static vector unsigned char __ATTRS_o_ai
+vec_packsu(vector short __a, vector short __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkshus(__b, __a);
+#else
+  return __builtin_altivec_vpkshus(__a, __b);
+#endif
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_packsu(vector unsigned short __a, vector unsigned short __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkuhus(__b, __a);
+#else
+  return __builtin_altivec_vpkuhus(__a, __b);
+#endif
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_packsu(vector int __a, vector int __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkswus(__b, __a);
+#else
+  return __builtin_altivec_vpkswus(__a, __b);
+#endif
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_packsu(vector unsigned int __a, vector unsigned int __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkuwus(__b, __a);
+#else
+  return __builtin_altivec_vpkuwus(__a, __b);
+#endif
+}
+
+/* vec_vpkshus */
+
+static vector unsigned char __ATTRS_o_ai
+vec_vpkshus(vector short __a, vector short __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkshus(__b, __a);
+#else
+  return __builtin_altivec_vpkshus(__a, __b);
+#endif
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vpkshus(vector unsigned short __a, vector unsigned short __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkuhus(__b, __a);
+#else
+  return __builtin_altivec_vpkuhus(__a, __b);
+#endif
+}
+
+/* vec_vpkswus */
+
+static vector unsigned short __ATTRS_o_ai
+vec_vpkswus(vector int __a, vector int __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkswus(__b, __a);
+#else
+  return __builtin_altivec_vpkswus(__a, __b);
+#endif
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vpkswus(vector unsigned int __a, vector unsigned int __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkuwus(__b, __a);
+#else
+  return __builtin_altivec_vpkuwus(__a, __b);
+#endif
+}
+
+/* vec_perm */
+
+// The vperm instruction is defined architecturally with a big-endian bias.
+// For little endian, we swap the input operands and invert the permute
+// control vector.  Only the rightmost 5 bits matter, so we could use
+// a vector of all 31s instead of all 255s to perform the inversion.
+// However, when the PCV is not a constant, using 255 has an advantage
+// in that the vec_xor can be recognized as a vec_nor (and for P8 and
+// later, possibly a vec_nand).
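+//
+// Editor's worked sketch (not part of the upstream header): program byte k
+// of the concatenation __a:__b lives at hardware byte 31-k of the swapped
+// concatenation __b:__a, and since vperm reads only the low 5 bits of each
+// control byte, 31-k == (k ^ 0xFF) & 0x1F; e.g. k = 3 gives 3 ^ 0xFF = 0xFC,
+// whose low 5 bits are 28 == 31-3.  That is why a single vec_xor with a
+// vector of all 255s performs the whole index inversion.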
+
+vector signed char __ATTRS_o_ai
+vec_perm(vector signed char __a, vector signed char __b, vector unsigned char __c)
+{
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255,255,255,255,255,255,255,255,
+                              255,255,255,255,255,255,255,255};
+  __d = vec_xor(__c, __d);
+  return (vector signed char)
+           __builtin_altivec_vperm_4si((vector int)__b, (vector int)__a, __d);
+#else
+  return (vector signed char)
+           __builtin_altivec_vperm_4si((vector int)__a, (vector int)__b, __c);
+#endif
+}
+
+vector unsigned char __ATTRS_o_ai
+vec_perm(vector unsigned char __a,
+         vector unsigned char __b,
+         vector unsigned char __c)
+{
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255,255,255,255,255,255,255,255,
+                              255,255,255,255,255,255,255,255};
+  __d = vec_xor(__c, __d);
+  return (vector unsigned char)
+           __builtin_altivec_vperm_4si((vector int)__b, (vector int)__a, __d);
+#else
+  return (vector unsigned char)
+           __builtin_altivec_vperm_4si((vector int)__a, (vector int)__b, __c);
+#endif
+}
+
+vector bool char __ATTRS_o_ai
+vec_perm(vector bool char __a, vector bool char __b, vector unsigned char __c)
+{
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255,255,255,255,255,255,255,255,
+                              255,255,255,255,255,255,255,255};
+  __d = vec_xor(__c, __d);
+  return (vector bool char)
+           __builtin_altivec_vperm_4si((vector int)__b, (vector int)__a, __d);
+#else
+  return (vector bool char)
+           __builtin_altivec_vperm_4si((vector int)__a, (vector int)__b, __c);
+#endif
+}
+
+vector short __ATTRS_o_ai
+vec_perm(vector short __a, vector short __b, vector unsigned char __c)
+{
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255,255,255,255,255,255,255,255,
+                              255,255,255,255,255,255,255,255};
+  __d = vec_xor(__c, __d);
+  return (vector short)
+           __builtin_altivec_vperm_4si((vector int)__b, (vector int)__a, __d);
+#else
+  return (vector short)
+           __builtin_altivec_vperm_4si((vector int)__a, (vector int)__b, __c);
+#endif
+}
+
+vector unsigned short __ATTRS_o_ai
+vec_perm(vector unsigned short __a,
+         vector unsigned short __b,
+         vector unsigned char __c)
+{
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255,255,255,255,255,255,255,255,
+                              255,255,255,255,255,255,255,255};
+  __d = vec_xor(__c, __d);
+  return (vector unsigned short)
+           __builtin_altivec_vperm_4si((vector int)__b, (vector int)__a, __d);
+#else
+  return (vector unsigned short)
+           __builtin_altivec_vperm_4si((vector int)__a, (vector int)__b, __c);
+#endif
+}
+
+vector bool short __ATTRS_o_ai
+vec_perm(vector bool short __a, vector bool short __b, vector unsigned char __c)
+{
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255,255,255,255,255,255,255,255,
+                              255,255,255,255,255,255,255,255};
+  __d = vec_xor(__c, __d);
+  return (vector bool short)
+           __builtin_altivec_vperm_4si((vector int)__b, (vector int)__a, __d);
+#else
+  return (vector bool short)
+           __builtin_altivec_vperm_4si((vector int)__a, (vector int)__b, __c);
+#endif
+}
+
+vector pixel __ATTRS_o_ai
+vec_perm(vector pixel __a, vector pixel __b, vector unsigned char __c)
+{
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255,255,255,255,255,255,255,255,
+                              255,255,255,255,255,255,255,255};
+  __d = vec_xor(__c, __d);
+  return (vector pixel)
+           __builtin_altivec_vperm_4si((vector int)__b, (vector int)__a, __d);
+#else
+  return (vector pixel)
+           __builtin_altivec_vperm_4si((vector int)__a, (vector int)__b, __c);
+#endif
+}
+
+vector int __ATTRS_o_ai
+vec_perm(vector int __a, vector int __b, vector unsigned char __c)
+{
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255,255,255,255,255,255,255,255,
+                              255,255,255,255,255,255,255,255};
+  __d = vec_xor(__c, __d);
+  return (vector int)__builtin_altivec_vperm_4si(__b, __a, __d);
+#else
+  return (vector int)__builtin_altivec_vperm_4si(__a, __b, __c);
+#endif
+}
+
+vector unsigned int __ATTRS_o_ai
+vec_perm(vector unsigned int __a, vector unsigned int __b, vector unsigned char __c)
+{
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255,255,255,255,255,255,255,255,
+                              255,255,255,255,255,255,255,255};
+  __d = vec_xor(__c, __d);
+  return (vector unsigned int)
+           __builtin_altivec_vperm_4si((vector int)__b, (vector int)__a, __d);
+#else
+  return (vector unsigned int)
+           __builtin_altivec_vperm_4si((vector int)__a, (vector int)__b, __c);
+#endif
+}
+
+vector bool int __ATTRS_o_ai
+vec_perm(vector bool int __a, vector bool int __b, vector unsigned char __c)
+{
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255,255,255,255,255,255,255,255,
+                              255,255,255,255,255,255,255,255};
+  __d = vec_xor(__c, __d);
+  return (vector bool int)
+           __builtin_altivec_vperm_4si((vector int)__b, (vector int)__a, __d);
+#else
+  return (vector bool int)
+           __builtin_altivec_vperm_4si((vector int)__a, (vector int)__b, __c);
+#endif
+}
+
+vector float __ATTRS_o_ai
+vec_perm(vector float __a, vector float __b, vector unsigned char __c)
+{
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255,255,255,255,255,255,255,255,
+                              255,255,255,255,255,255,255,255};
+  __d = vec_xor(__c, __d);
+  return (vector float)
+           __builtin_altivec_vperm_4si((vector int)__b, (vector int)__a, __d);
+#else
+  return (vector float)
+           __builtin_altivec_vperm_4si((vector int)__a, (vector int)__b, __c);
+#endif
+}
+
+/* vec_vperm */
+
+static vector signed char __ATTRS_o_ai
+vec_vperm(vector signed char __a, vector signed char __b, vector unsigned char __c)
+{
+  return vec_perm(__a, __b, __c);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vperm(vector unsigned char __a,
+          vector unsigned char __b,
+          vector unsigned char __c)
+{
+  return vec_perm(__a, __b, __c);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_vperm(vector bool char __a, vector bool char __b, vector unsigned char __c)
+{
+  return vec_perm(__a, __b, __c);
+}
+
+static vector short __ATTRS_o_ai
+vec_vperm(vector short __a, vector short __b, vector unsigned char __c)
+{
+  return vec_perm(__a, __b, __c);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vperm(vector unsigned short __a,
+          vector unsigned short __b,
+          vector unsigned char __c)
+{
+  return vec_perm(__a, __b, __c);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_vperm(vector bool short __a, vector bool short __b, vector unsigned char __c)
+{
+  return vec_perm(__a, __b, __c);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_vperm(vector pixel __a, vector pixel __b, vector unsigned char __c)
+{
+  return vec_perm(__a, __b, __c);
+}
+
+static vector int __ATTRS_o_ai
+vec_vperm(vector int __a, vector int __b, vector unsigned char __c)
+{
+  return vec_perm(__a, __b, __c);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vperm(vector unsigned int __a, vector unsigned int __b, vector unsigned char __c)
+{
+  return vec_perm(__a, __b, __c);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_vperm(vector bool int __a, vector bool int __b, vector unsigned char __c)
+{
+  return vec_perm(__a, __b, __c);
+}
+
+static vector float __ATTRS_o_ai
+vec_vperm(vector float __a, vector float __b, vector unsigned char __c)
+{
+  return vec_perm(__a, __b, __c);
+}
+
+/* vec_re */
+
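+/* vrefp returns a per-element estimate of the reciprocal, not an exact
+   result; refine with Newton-Raphson if full precision is needed. */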
+static vector float __attribute__((__always_inline__))
+vec_re(vector float __a)
+{
+  return __builtin_altivec_vrefp(__a);
+}
+
+/* vec_vrefp */
+
+static vector float __attribute__((__always_inline__))
+vec_vrefp(vector float __a)
+{
+  return __builtin_altivec_vrefp(__a);
+}
+
+/* vec_rl */
+
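+/* Each element of __a is rotated left by the corresponding element of __b,
+   taken modulo the element width. */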
+static vector signed char __ATTRS_o_ai
+vec_rl(vector signed char __a, vector unsigned char __b)
+{
+  return (vector signed char)__builtin_altivec_vrlb((vector char)__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_rl(vector unsigned char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)__builtin_altivec_vrlb((vector char)__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_rl(vector short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vrlh(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_rl(vector unsigned short __a, vector unsigned short __b)
+{
+  return (vector unsigned short)__builtin_altivec_vrlh((vector short)__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_rl(vector int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vrlw(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_rl(vector unsigned int __a, vector unsigned int __b)
+{
+  return (vector unsigned int)__builtin_altivec_vrlw((vector int)__a, __b);
+}
+
+/* vec_vrlb */
+
+static vector signed char __ATTRS_o_ai
+vec_vrlb(vector signed char __a, vector unsigned char __b)
+{
+  return (vector signed char)__builtin_altivec_vrlb((vector char)__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vrlb(vector unsigned char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)__builtin_altivec_vrlb((vector char)__a, __b);
+}
+
+/* vec_vrlh */
+
+static vector short __ATTRS_o_ai
+vec_vrlh(vector short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vrlh(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vrlh(vector unsigned short __a, vector unsigned short __b)
+{
+  return (vector unsigned short)__builtin_altivec_vrlh((vector short)__a, __b);
+}
+
+/* vec_vrlw */
+
+static vector int __ATTRS_o_ai
+vec_vrlw(vector int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vrlw(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vrlw(vector unsigned int __a, vector unsigned int __b)
+{
+  return (vector unsigned int)__builtin_altivec_vrlw((vector int)__a, __b);
+}
+
+/* vec_round */
+
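+/* vrfin rounds each element to the nearest floating-point integral value. */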
+static vector float __attribute__((__always_inline__))
+vec_round(vector float __a)
+{
+  return __builtin_altivec_vrfin(__a);
+}
+
+/* vec_vrfin */
+
+static vector float __attribute__((__always_inline__))
+vec_vrfin(vector float __a)
+{
+  return __builtin_altivec_vrfin(__a);
+}
+
+/* vec_rsqrte */
+
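+/* vrsqrtefp returns a per-element estimate of the reciprocal square root. */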
+static vector float __attribute__((__always_inline__))
+vec_rsqrte(vector float __a)
+{
+  return __builtin_altivec_vrsqrtefp(__a);
+}
+
+/* vec_vrsqrtefp */
+
+static vector float __attribute__((__always_inline__))
+vec_vrsqrtefp(vector float __a)
+{
+  return __builtin_altivec_vrsqrtefp(__a);
+}
+
+/* vec_sel */
+
+#define __builtin_altivec_vsel_4si vec_sel
+
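+/* Bitwise select: each result bit is taken from __b where the corresponding
+   bit of __c is set and from __a where it is clear. */
+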
+static vector signed char __ATTRS_o_ai
+vec_sel(vector signed char __a, vector signed char __b, vector unsigned char __c)
+{
+  return (__a & ~(vector signed char)__c) | (__b & (vector signed char)__c);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_sel(vector signed char __a, vector signed char __b, vector bool char __c)
+{
+  return (__a & ~(vector signed char)__c) | (__b & (vector signed char)__c);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_sel(vector unsigned char __a, vector unsigned char __b, vector unsigned char __c)
+{
+  return (__a & ~__c) | (__b & __c);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_sel(vector unsigned char __a, vector unsigned char __b, vector bool char __c)
+{
+  return (__a & ~(vector unsigned char)__c) | (__b & (vector unsigned char)__c);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_sel(vector bool char __a, vector bool char __b, vector unsigned char __c)
+{
+  return (__a & ~(vector bool char)__c) | (__b & (vector bool char)__c);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_sel(vector bool char __a, vector bool char __b, vector bool char __c)
+{
+  return (__a & ~__c) | (__b & __c);
+}
+
+static vector short __ATTRS_o_ai
+vec_sel(vector short __a, vector short __b, vector unsigned short __c)
+{
+  return (__a & ~(vector short)__c) | (__b & (vector short)__c);
+}
+
+static vector short __ATTRS_o_ai
+vec_sel(vector short __a, vector short __b, vector bool short __c)
+{
+  return (__a & ~(vector short)__c) | (__b & (vector short)__c);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_sel(vector unsigned short __a,
+        vector unsigned short __b,
+        vector unsigned short __c)
+{
+  return (__a & ~__c) | (__b & __c);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_sel(vector unsigned short __a, vector unsigned short __b, vector bool short __c)
+{
+  return (__a & ~(vector unsigned short)__c) | (__b & (vector unsigned short)__c);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_sel(vector bool short __a, vector bool short __b, vector unsigned short __c)
+{
+  return (__a & ~(vector bool short)__c) | (__b & (vector bool short)__c);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_sel(vector bool short __a, vector bool short __b, vector bool short __c)
+{
+  return (__a & ~__c) | (__b & __c);
+}
+
+static vector int __ATTRS_o_ai
+vec_sel(vector int __a, vector int __b, vector unsigned int __c)
+{
+  return (__a & ~(vector int)__c) | (__b & (vector int)__c);
+}
+
+static vector int __ATTRS_o_ai
+vec_sel(vector int __a, vector int __b, vector bool int __c)
+{
+  return (__a & ~(vector int)__c) | (__b & (vector int)__c);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_sel(vector unsigned int __a, vector unsigned int __b, vector unsigned int __c)
+{
+  return (__a & ~__c) | (__b & __c);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_sel(vector unsigned int __a, vector unsigned int __b, vector bool int __c)
+{
+  return (__a & ~(vector unsigned int)__c) | (__b & (vector unsigned int)__c);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_sel(vector bool int __a, vector bool int __b, vector unsigned int __c)
+{
+  return (__a & ~(vector bool int)__c) | (__b & (vector bool int)__c);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_sel(vector bool int __a, vector bool int __b, vector bool int __c)
+{
+  return (__a & ~__c) | (__b & __c);
+}
+
+static vector float __ATTRS_o_ai
+vec_sel(vector float __a, vector float __b, vector unsigned int __c)
+{
+  vector int __res = ((vector int)__a & ~(vector int)__c)
+                   | ((vector int)__b & (vector int)__c);
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai
+vec_sel(vector float __a, vector float __b, vector bool int __c)
+{
+  vector int __res = ((vector int)__a & ~(vector int)__c)
+                   | ((vector int)__b & (vector int)__c);
+  return (vector float)__res;
+}
+
+/* vec_vsel */
+
+static vector signed char __ATTRS_o_ai
+vec_vsel(vector signed char __a, vector signed char __b, vector unsigned char __c)
+{
+  return (__a & ~(vector signed char)__c) | (__b & (vector signed char)__c);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vsel(vector signed char __a, vector signed char __b, vector bool char __c)
+{
+  return (__a & ~(vector signed char)__c) | (__b & (vector signed char)__c);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vsel(vector unsigned char __a, vector unsigned char __b, vector unsigned char __c)
+{
+  return (__a & ~__c) | (__b & __c);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vsel(vector unsigned char __a, vector unsigned char __b, vector bool char __c)
+{
+  return (__a & ~(vector unsigned char)__c) | (__b & (vector unsigned char)__c);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_vsel(vector bool char __a, vector bool char __b, vector unsigned char __c)
+{
+  return (__a & ~(vector bool char)__c) | (__b & (vector bool char)__c);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_vsel(vector bool char __a, vector bool char __b, vector bool char __c)
+{
+  return (__a & ~__c) | (__b & __c);
+}
+
+static vector short __ATTRS_o_ai
+vec_vsel(vector short __a, vector short __b, vector unsigned short __c)
+{
+  return (__a & ~(vector short)__c) | (__b & (vector short)__c);
+}
+
+static vector short __ATTRS_o_ai
+vec_vsel(vector short __a, vector short __b, vector bool short __c)
+{
+  return (__a & ~(vector short)__c) | (__b & (vector short)__c);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vsel(vector unsigned short __a,
+         vector unsigned short __b,
+         vector unsigned short __c)
+{
+  return (__a & ~__c) | (__b & __c);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vsel(vector unsigned short __a, vector unsigned short __b, vector bool short __c)
+{
+  return (__a & ~(vector unsigned short)__c) | (__b & (vector unsigned short)__c);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_vsel(vector bool short __a, vector bool short __b, vector unsigned short __c)
+{
+  return (__a & ~(vector bool short)__c) | (__b & (vector bool short)__c);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_vsel(vector bool short __a, vector bool short __b, vector bool short __c)
+{
+  return (__a & ~__c) | (__b & __c);
+}
+
+static vector int __ATTRS_o_ai
+vec_vsel(vector int __a, vector int __b, vector unsigned int __c)
+{
+  return (__a & ~(vector int)__c) | (__b & (vector int)__c);
+}
+
+static vector int __ATTRS_o_ai
+vec_vsel(vector int __a, vector int __b, vector bool int __c)
+{
+  return (__a & ~(vector int)__c) | (__b & (vector int)__c);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vsel(vector unsigned int __a, vector unsigned int __b, vector unsigned int __c)
+{
+  return (__a & ~__c) | (__b & __c);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vsel(vector unsigned int __a, vector unsigned int __b, vector bool int __c)
+{
+  return (__a & ~(vector unsigned int)__c) | (__b & (vector unsigned int)__c);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_vsel(vector bool int __a, vector bool int __b, vector unsigned int __c)
+{
+  return (__a & ~(vector bool int)__c) | (__b & (vector bool int)__c);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_vsel(vector bool int __a, vector bool int __b, vector bool int __c)
+{
+  return (__a & ~__c) | (__b & __c);
+}
+
+static vector float __ATTRS_o_ai
+vec_vsel(vector float __a, vector float __b, vector unsigned int __c)
+{
+  vector int __res = ((vector int)__a & ~(vector int)__c)
+                   | ((vector int)__b & (vector int)__c);
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai
+vec_vsel(vector float __a, vector float __b, vector bool int __c)
+{
+  vector int __res = ((vector int)__a & ~(vector int)__c)
+                   | ((vector int)__b & (vector int)__c);
+  return (vector float)__res;
+}
+
+/* vec_sl */
+
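+/* Element-wise shift left: each element of __a is shifted by the
+   corresponding element of __b. */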
+static vector signed char __ATTRS_o_ai
+vec_sl(vector signed char __a, vector unsigned char __b)
+{
+  return __a << (vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_sl(vector unsigned char __a, vector unsigned char __b)
+{
+  return __a << __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_sl(vector short __a, vector unsigned short __b)
+{
+  return __a << (vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_sl(vector unsigned short __a, vector unsigned short __b)
+{
+  return __a << __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_sl(vector int __a, vector unsigned int __b)
+{
+  return __a << (vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_sl(vector unsigned int __a, vector unsigned int __b)
+{
+  return __a << __b;
+}
+
+/* vec_vslb */
+
+#define __builtin_altivec_vslb vec_vslb
+
+static vector signed char __ATTRS_o_ai
+vec_vslb(vector signed char __a, vector unsigned char __b)
+{
+  return vec_sl(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vslb(vector unsigned char __a, vector unsigned char __b)
+{
+  return vec_sl(__a, __b);
+}
+
+/* vec_vslh */
+
+#define __builtin_altivec_vslh vec_vslh
+
+static vector short __ATTRS_o_ai
+vec_vslh(vector short __a, vector unsigned short __b)
+{
+  return vec_sl(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vslh(vector unsigned short __a, vector unsigned short __b)
+{
+  return vec_sl(__a, __b);
+}
+
+/* vec_vslw */
+
+#define __builtin_altivec_vslw vec_vslw
+
+static vector int __ATTRS_o_ai
+vec_vslw(vector int __a, vector unsigned int __b)
+{
+  return vec_sl(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vslw(vector unsigned int __a, vector unsigned int __b)
+{
+  return vec_sl(__a, __b);
+}
+
+/* vec_sld */
+
+#define __builtin_altivec_vsldoi_4si vec_sld
+
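+/* Shift left double: conceptually concatenate __a and __b and extract the 16
+   bytes starting at byte offset __c.  The little-endian branch reverses the
+   index sequence to compensate for vec_perm's own endian adjustment. */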
+static vector signed char __ATTRS_o_ai
+vec_sld(vector signed char __a, vector signed char __b, unsigned char __c)
+{
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c-1, __c-2, __c-3, __c-4, __c-5, __c-6, __c-7,
+     __c-8, __c-9, __c-10, __c-11, __c-12, __c-13, __c-14, __c-15));
+#else
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c+1, __c+2,  __c+3,  __c+4,  __c+5,  __c+6,  __c+7,
+     __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15));
+#endif
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_sld(vector unsigned char __a, vector unsigned char __b, unsigned char __c)
+{
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c-1, __c-2, __c-3, __c-4, __c-5, __c-6, __c-7,
+     __c-8, __c-9, __c-10, __c-11, __c-12, __c-13, __c-14, __c-15));
+#else
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c+1, __c+2,  __c+3,  __c+4,  __c+5,  __c+6,  __c+7,
+     __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15));
+#endif
+}
+
+static vector short __ATTRS_o_ai
+vec_sld(vector short __a, vector short __b, unsigned char __c)
+{
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c-1, __c-2, __c-3, __c-4, __c-5, __c-6, __c-7,
+     __c-8, __c-9, __c-10, __c-11, __c-12, __c-13, __c-14, __c-15));
+#else
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c+1, __c+2,  __c+3,  __c+4,  __c+5,  __c+6,  __c+7,
+     __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15));
+#endif
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_sld(vector unsigned short __a, vector unsigned short __b, unsigned char __c)
+{
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c-1, __c-2, __c-3, __c-4, __c-5, __c-6, __c-7,
+     __c-8, __c-9, __c-10, __c-11, __c-12, __c-13, __c-14, __c-15));
+#else
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c+1, __c+2,  __c+3,  __c+4,  __c+5,  __c+6,  __c+7,
+     __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15));
+#endif
+}
+
+static vector pixel __ATTRS_o_ai
+vec_sld(vector pixel __a, vector pixel __b, unsigned char __c)
+{
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c-1, __c-2, __c-3, __c-4, __c-5, __c-6, __c-7,
+     __c-8, __c-9, __c-10, __c-11, __c-12, __c-13, __c-14, __c-15));
+#else
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c+1, __c+2,  __c+3,  __c+4,  __c+5,  __c+6,  __c+7,
+     __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15));
+#endif
+}
+
+static vector int __ATTRS_o_ai
+vec_sld(vector int __a, vector int __b, unsigned char __c)
+{
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c-1, __c-2, __c-3, __c-4, __c-5, __c-6, __c-7,
+     __c-8, __c-9, __c-10, __c-11, __c-12, __c-13, __c-14, __c-15));
+#else
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c+1, __c+2,  __c+3,  __c+4,  __c+5,  __c+6,  __c+7,
+     __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15));
+#endif
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_sld(vector unsigned int __a, vector unsigned int __b, unsigned char __c)
+{
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c-1, __c-2, __c-3, __c-4, __c-5, __c-6, __c-7,
+     __c-8, __c-9, __c-10, __c-11, __c-12, __c-13, __c-14, __c-15));
+#else
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c+1, __c+2,  __c+3,  __c+4,  __c+5,  __c+6,  __c+7,
+     __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15));
+#endif
+}
+
+static vector float __ATTRS_o_ai
+vec_sld(vector float __a, vector float __b, unsigned char __c)
+{
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c-1, __c-2, __c-3, __c-4, __c-5, __c-6, __c-7,
+     __c-8, __c-9, __c-10, __c-11, __c-12, __c-13, __c-14, __c-15));
+#else
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c+1, __c+2,  __c+3,  __c+4,  __c+5,  __c+6,  __c+7,
+     __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15));
+#endif
+}
+
+/* vec_vsldoi */
+
+static vector signed char __ATTRS_o_ai
+vec_vsldoi(vector signed char __a, vector signed char __b, unsigned char __c)
+{
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c-1, __c-2, __c-3, __c-4, __c-5, __c-6, __c-7,
+     __c-8, __c-9, __c-10, __c-11, __c-12, __c-13, __c-14, __c-15));
+#else
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c+1, __c+2,  __c+3,  __c+4,  __c+5,  __c+6,  __c+7,
+     __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15));
+#endif
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vsldoi(vector unsigned char __a, vector unsigned char __b, unsigned char __c)
+{
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c-1, __c-2, __c-3, __c-4, __c-5, __c-6, __c-7,
+     __c-8, __c-9, __c-10, __c-11, __c-12, __c-13, __c-14, __c-15));
+#else
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c+1, __c+2,  __c+3,  __c+4,  __c+5,  __c+6,  __c+7,
+     __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15));
+#endif
+}
+
+static vector short __ATTRS_o_ai
+vec_vsldoi(vector short __a, vector short __b, unsigned char __c)
+{
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c-1, __c-2, __c-3, __c-4, __c-5, __c-6, __c-7,
+     __c-8, __c-9, __c-10, __c-11, __c-12, __c-13, __c-14, __c-15));
+#else
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c+1, __c+2,  __c+3,  __c+4,  __c+5,  __c+6,  __c+7,
+     __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15));
+#endif
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vsldoi(vector unsigned short __a, vector unsigned short __b, unsigned char __c)
+{
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c-1, __c-2, __c-3, __c-4, __c-5, __c-6, __c-7,
+     __c-8, __c-9, __c-10, __c-11, __c-12, __c-13, __c-14, __c-15));
+#else
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c+1, __c+2,  __c+3,  __c+4,  __c+5,  __c+6,  __c+7,
+     __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15));
+#endif
+}
+
+static vector pixel __ATTRS_o_ai
+vec_vsldoi(vector pixel __a, vector pixel __b, unsigned char __c)
+{
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c-1, __c-2, __c-3, __c-4, __c-5, __c-6, __c-7,
+     __c-8, __c-9, __c-10, __c-11, __c-12, __c-13, __c-14, __c-15));
+#else
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c+1, __c+2,  __c+3,  __c+4,  __c+5,  __c+6,  __c+7,
+     __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15));
+#endif
+}
+
+static vector int __ATTRS_o_ai
+vec_vsldoi(vector int __a, vector int __b, unsigned char __c)
+{
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c-1, __c-2, __c-3, __c-4, __c-5, __c-6, __c-7,
+     __c-8, __c-9, __c-10, __c-11, __c-12, __c-13, __c-14, __c-15));
+#else
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c+1, __c+2,  __c+3,  __c+4,  __c+5,  __c+6,  __c+7,
+     __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15));
+#endif
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vsldoi(vector unsigned int __a, vector unsigned int __b, unsigned char __c)
+{
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c-1, __c-2, __c-3, __c-4, __c-5, __c-6, __c-7,
+     __c-8, __c-9, __c-10, __c-11, __c-12, __c-13, __c-14, __c-15));
+#else
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c+1, __c+2,  __c+3,  __c+4,  __c+5,  __c+6,  __c+7,
+     __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15));
+#endif
+}
+
+static vector float __ATTRS_o_ai
+vec_vsldoi(vector float __a, vector float __b, unsigned char __c)
+{
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c-1, __c-2, __c-3, __c-4, __c-5, __c-6, __c-7,
+     __c-8, __c-9, __c-10, __c-11, __c-12, __c-13, __c-14, __c-15));
+#else
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c+1, __c+2,  __c+3,  __c+4,  __c+5,  __c+6,  __c+7,
+     __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15));
+#endif
+}
+
+/* vec_sll */
+
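+/* vsl shifts the entire 128-bit contents of __a left by a bit count (0-7)
+   supplied in __b; the ISA expects that count replicated in every byte of
+   __b. */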
+static vector signed char __ATTRS_o_ai
+vec_sll(vector signed char __a, vector unsigned char __b)
+{
+  return (vector signed char)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_sll(vector signed char __a, vector unsigned short __b)
+{
+  return (vector signed char)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_sll(vector signed char __a, vector unsigned int __b)
+{
+  return (vector signed char)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_sll(vector unsigned char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_sll(vector unsigned char __a, vector unsigned short __b)
+{
+  return (vector unsigned char)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_sll(vector unsigned char __a, vector unsigned int __b)
+{
+  return (vector unsigned char)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_sll(vector bool char __a, vector unsigned char __b)
+{
+  return (vector bool char)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_sll(vector bool char __a, vector unsigned short __b)
+{
+  return (vector bool char)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_sll(vector bool char __a, vector unsigned int __b)
+{
+  return (vector bool char)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_sll(vector short __a, vector unsigned char __b)
+{
+  return (vector short)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_sll(vector short __a, vector unsigned short __b)
+{
+  return (vector short)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_sll(vector short __a, vector unsigned int __b)
+{
+  return (vector short)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_sll(vector unsigned short __a, vector unsigned char __b)
+{
+  return (vector unsigned short)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_sll(vector unsigned short __a, vector unsigned short __b)
+{
+  return (vector unsigned short)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_sll(vector unsigned short __a, vector unsigned int __b)
+{
+  return (vector unsigned short)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_sll(vector bool short __a, vector unsigned char __b)
+{
+  return (vector bool short)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_sll(vector bool short __a, vector unsigned short __b)
+{
+  return (vector bool short)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_sll(vector bool short __a, vector unsigned int __b)
+{
+  return (vector bool short)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_sll(vector pixel __a, vector unsigned char __b)
+{
+  return (vector pixel)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_sll(vector pixel __a, vector unsigned short __b)
+{
+  return (vector pixel)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_sll(vector pixel __a, vector unsigned int __b)
+{
+  return (vector pixel)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_sll(vector int __a, vector unsigned char __b)
+{
+  return (vector int)__builtin_altivec_vsl(__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_sll(vector int __a, vector unsigned short __b)
+{
+  return (vector int)__builtin_altivec_vsl(__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_sll(vector int __a, vector unsigned int __b)
+{
+  return (vector int)__builtin_altivec_vsl(__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_sll(vector unsigned int __a, vector unsigned char __b)
+{
+  return (vector unsigned int)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_sll(vector unsigned int __a, vector unsigned short __b)
+{
+  return (vector unsigned int)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_sll(vector unsigned int __a, vector unsigned int __b)
+{
+  return (vector unsigned int)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_sll(vector bool int __a, vector unsigned char __b)
+{
+  return (vector bool int)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_sll(vector bool int __a, vector unsigned short __b)
+{
+  return (vector bool int)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_sll(vector bool int __a, vector unsigned int __b)
+{
+  return (vector bool int)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+/* vec_vsl */
+
+static vector signed char __ATTRS_o_ai
+vec_vsl(vector signed char __a, vector unsigned char __b)
+{
+  return (vector signed char)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vsl(vector signed char __a, vector unsigned short __b)
+{
+  return (vector signed char)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vsl(vector signed char __a, vector unsigned int __b)
+{
+  return (vector signed char)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vsl(vector unsigned char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vsl(vector unsigned char __a, vector unsigned short __b)
+{
+  return (vector unsigned char)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vsl(vector unsigned char __a, vector unsigned int __b)
+{
+  return (vector unsigned char)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_vsl(vector bool char __a, vector unsigned char __b)
+{
+  return (vector bool char)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_vsl(vector bool char __a, vector unsigned short __b)
+{
+  return (vector bool char)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_vsl(vector bool char __a, vector unsigned int __b)
+{
+  return (vector bool char)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_vsl(vector short __a, vector unsigned char __b)
+{
+  return (vector short)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_vsl(vector short __a, vector unsigned short __b)
+{
+  return (vector short)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_vsl(vector short __a, vector unsigned int __b)
+{
+  return (vector short)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vsl(vector unsigned short __a, vector unsigned char __b)
+{
+  return (vector unsigned short)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vsl(vector unsigned short __a, vector unsigned short __b)
+{
+  return (vector unsigned short)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vsl(vector unsigned short __a, vector unsigned int __b)
+{
+  return (vector unsigned short)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_vsl(vector bool short __a, vector unsigned char __b)
+{
+  return (vector bool short)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_vsl(vector bool short __a, vector unsigned short __b)
+{
+  return (vector bool short)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_vsl(vector bool short __a, vector unsigned int __b)
+{
+  return (vector bool short)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_vsl(vector pixel __a, vector unsigned char __b)
+{
+  return (vector pixel)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_vsl(vector pixel __a, vector unsigned short __b)
+{
+  return (vector pixel)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_vsl(vector pixel __a, vector unsigned int __b)
+{
+  return (vector pixel)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_vsl(vector int __a, vector unsigned char __b)
+{
+  return (vector int)__builtin_altivec_vsl(__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_vsl(vector int __a, vector unsigned short __b)
+{
+  return (vector int)__builtin_altivec_vsl(__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_vsl(vector int __a, vector unsigned int __b)
+{
+  return (vector int)__builtin_altivec_vsl(__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vsl(vector unsigned int __a, vector unsigned char __b)
+{
+  return (vector unsigned int)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vsl(vector unsigned int __a, vector unsigned short __b)
+{
+  return (vector unsigned int)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vsl(vector unsigned int __a, vector unsigned int __b)
+{
+  return (vector unsigned int)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_vsl(vector bool int __a, vector unsigned char __b)
+{
+  return (vector bool int)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_vsl(vector bool int __a, vector unsigned short __b)
+{
+  return (vector bool int)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_vsl(vector bool int __a, vector unsigned int __b)
+{
+  return (vector bool int)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+/* vec_slo */
+
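+/* vslo shifts the entire 128-bit contents of __a left by a whole number of
+   bytes encoded in __b. */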
+static vector signed char __ATTRS_o_ai
+vec_slo(vector signed char __a, vector signed char __b)
+{
+  return (vector signed char)
+           __builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_slo(vector signed char __a, vector unsigned char __b)
+{
+  return (vector signed char)
+           __builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_slo(vector unsigned char __a, vector signed char __b)
+{
+  return (vector unsigned char)
+           __builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_slo(vector unsigned char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)
+           __builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_slo(vector short __a, vector signed char __b)
+{
+  return (vector short)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_slo(vector short __a, vector unsigned char __b)
+{
+  return (vector short)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_slo(vector unsigned short __a, vector signed char __b)
+{
+  return (vector unsigned short)
+           __builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_slo(vector unsigned short __a, vector unsigned char __b)
+{
+  return (vector unsigned short)
+           __builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_slo(vector pixel __a, vector signed char __b)
+{
+  return (vector pixel)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_slo(vector pixel __a, vector unsigned char __b)
+{
+  return (vector pixel)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_slo(vector int __a, vector signed char __b)
+{
+  return (vector int)__builtin_altivec_vslo(__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_slo(vector int __a, vector unsigned char __b)
+{
+  return (vector int)__builtin_altivec_vslo(__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_slo(vector unsigned int __a, vector signed char __b)
+{
+  return (vector unsigned int)
+           __builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_slo(vector unsigned int __a, vector unsigned char __b)
+{
+  return (vector unsigned int)
+           __builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector float __ATTRS_o_ai
+vec_slo(vector float __a, vector signed char __b)
+{
+  return (vector float)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector float __ATTRS_o_ai
+vec_slo(vector float __a, vector unsigned char __b)
+{
+  return (vector float)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+/* vec_vslo */
+
+static vector signed char __ATTRS_o_ai
+vec_vslo(vector signed char __a, vector signed char __b)
+{
+  return (vector signed char)
+           __builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vslo(vector signed char __a, vector unsigned char __b)
+{
+  return (vector signed char)
+           __builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vslo(vector unsigned char __a, vector signed char __b)
+{
+  return (vector unsigned char)
+           __builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vslo(vector unsigned char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)
+           __builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_vslo(vector short __a, vector signed char __b)
+{
+  return (vector short)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_vslo(vector short __a, vector unsigned char __b)
+{
+  return (vector short)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vslo(vector unsigned short __a, vector signed char __b)
+{
+  return (vector unsigned short)
+           __builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vslo(vector unsigned short __a, vector unsigned char __b)
+{
+  return (vector unsigned short)
+           __builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_vslo(vector pixel __a, vector signed char __b)
+{
+  return (vector pixel)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_vslo(vector pixel __a, vector unsigned char __b)
+{
+  return (vector pixel)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_vslo(vector int __a, vector signed char __b)
+{
+  return (vector int)__builtin_altivec_vslo(__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_vslo(vector int __a, vector unsigned char __b)
+{
+  return (vector int)__builtin_altivec_vslo(__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vslo(vector unsigned int __a, vector signed char __b)
+{
+  return (vector unsigned int)
+           __builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vslo(vector unsigned int __a, vector unsigned char __b)
+{
+  return (vector unsigned int)
+           __builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector float __ATTRS_o_ai
+vec_vslo(vector float __a, vector signed char __b)
+{
+  return (vector float)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector float __ATTRS_o_ai
+vec_vslo(vector float __a, vector unsigned char __b)
+{
+  return (vector float)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+/* vec_splat */
+
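+/* Replicate element __b of __a across the whole vector by building a
+   byte-level permute mask. */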
+static vector signed char __ATTRS_o_ai
+vec_splat(vector signed char __a, unsigned char __b)
+{
+  return vec_perm(__a, __a, (vector unsigned char)(__b));
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_splat(vector unsigned char __a, unsigned char __b)
+{
+  return vec_perm(__a, __a, (vector unsigned char)(__b));
+}
+
+static vector bool char __ATTRS_o_ai
+vec_splat(vector bool char __a, unsigned char __b)
+{
+  return vec_perm(__a, __a, (vector unsigned char)(__b));
+}
+
+static vector short __ATTRS_o_ai
+vec_splat(vector short __a, unsigned char __b)
+{ 
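+  /* Scale the element index to a byte offset; the (__b, __b+1) pair selects
+     the two bytes of halfword __b in every lane. */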
+  __b *= 2;
+  unsigned char b1=__b+1;
+  return vec_perm(__a, __a, (vector unsigned char)
+    (__b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1));
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_splat(vector unsigned short __a, unsigned char __b)
+{ 
+  __b *= 2;
+  unsigned char b1=__b+1;
+  return vec_perm(__a, __a, (vector unsigned char)
+    (__b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1));
+}
+
+static vector bool short __ATTRS_o_ai
+vec_splat(vector bool short __a, unsigned char __b)
+{ 
+  __b *= 2;
+  unsigned char b1=__b+1;
+  return vec_perm(__a, __a, (vector unsigned char)
+    (__b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1));
+}
+
+static vector pixel __ATTRS_o_ai
+vec_splat(vector pixel __a, unsigned char __b)
+{ 
+  __b *= 2;
+  unsigned char b1=__b+1;
+  return vec_perm(__a, __a, (vector unsigned char)
+    (__b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1));
+}
+
+static vector int __ATTRS_o_ai
+vec_splat(vector int __a, unsigned char __b)
+{ 
+  __b *= 4;
+  unsigned char b1=__b+1, b2=__b+2, b3=__b+3;
+  return vec_perm(__a, __a, (vector unsigned char)
+    (__b, b1, b2, b3, __b, b1, b2, b3, __b, b1, b2, b3, __b, b1, b2, b3));
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_splat(vector unsigned int __a, unsigned char __b)
+{ 
+  __b *= 4;
+  unsigned char b1=__b+1, b2=__b+2, b3=__b+3;
+  return vec_perm(__a, __a, (vector unsigned char)
+    (__b, b1, b2, b3, __b, b1, b2, b3, __b, b1, b2, b3, __b, b1, b2, b3));
+}
+
+static vector bool int __ATTRS_o_ai
+vec_splat(vector bool int __a, unsigned char __b)
+{ 
+  __b *= 4;
+  unsigned char b1=__b+1, b2=__b+2, b3=__b+3;
+  return vec_perm(__a, __a, (vector unsigned char)
+    (__b, b1, b2, b3, __b, b1, b2, b3, __b, b1, b2, b3, __b, b1, b2, b3));
+}
+
+static vector float __ATTRS_o_ai
+vec_splat(vector float __a, unsigned char __b)
+{ 
+  __b *= 4;
+  unsigned char b1=__b+1, b2=__b+2, b3=__b+3;
+  return vec_perm(__a, __a, (vector unsigned char)
+    (__b, b1, b2, b3, __b, b1, b2, b3, __b, b1, b2, b3, __b, b1, b2, b3));
+}
+
+/* vec_vspltb */
+
+#define __builtin_altivec_vspltb vec_vspltb
+
+static vector signed char __ATTRS_o_ai
+vec_vspltb(vector signed char __a, unsigned char __b)
+{
+  return vec_perm(__a, __a, (vector unsigned char)(__b));
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vspltb(vector unsigned char __a, unsigned char __b)
+{
+  return vec_perm(__a, __a, (vector unsigned char)(__b));
+}
+
+static vector bool char __ATTRS_o_ai
+vec_vspltb(vector bool char __a, unsigned char __b)
+{
+  return vec_perm(__a, __a, (vector unsigned char)(__b));
+}
+
+/* vec_vsplth */
+
+#define __builtin_altivec_vsplth vec_vsplth
+
+static vector short __ATTRS_o_ai
+vec_vsplth(vector short __a, unsigned char __b)
+{
+  __b *= 2;
+  unsigned char b1=__b+1;
+  return vec_perm(__a, __a, (vector unsigned char)
+    (__b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1));
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vsplth(vector unsigned short __a, unsigned char __b)
+{
+  __b *= 2;
+  unsigned char b1=__b+1;
+  return vec_perm(__a, __a, (vector unsigned char)
+    (__b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1));
+}
+
+static vector bool short __ATTRS_o_ai
+vec_vsplth(vector bool short __a, unsigned char __b)
+{
+  __b *= 2;
+  unsigned char b1=__b+1;
+  return vec_perm(__a, __a, (vector unsigned char)
+    (__b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1));
+}
+
+static vector pixel __ATTRS_o_ai
+vec_vsplth(vector pixel __a, unsigned char __b)
+{
+  __b *= 2;
+  unsigned char b1=__b+1;
+  return vec_perm(__a, __a, (vector unsigned char)
+    (__b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1));
+}
+
+/* vec_vspltw */
+
+#define __builtin_altivec_vspltw vec_vspltw
+
+static vector int __ATTRS_o_ai
+vec_vspltw(vector int __a, unsigned char __b)
+{
+  __b *= 4;
+  unsigned char b1=__b+1, b2=__b+2, b3=__b+3;
+  return vec_perm(__a, __a, (vector unsigned char)
+    (__b, b1, b2, b3, __b, b1, b2, b3, __b, b1, b2, b3, __b, b1, b2, b3));
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vspltw(vector unsigned int __a, unsigned char __b)
+{
+  __b *= 4;
+  unsigned char b1=__b+1, b2=__b+2, b3=__b+3;
+  return vec_perm(__a, __a, (vector unsigned char)
+    (__b, b1, b2, b3, __b, b1, b2, b3, __b, b1, b2, b3, __b, b1, b2, b3));
+}
+
+static vector bool int __ATTRS_o_ai
+vec_vspltw(vector bool int __a, unsigned char __b)
+{
+  __b *= 4;
+  unsigned char b1=__b+1, b2=__b+2, b3=__b+3;
+  return vec_perm(__a, __a, (vector unsigned char)
+    (__b, b1, b2, b3, __b, b1, b2, b3, __b, b1, b2, b3, __b, b1, b2, b3));
+}
+
+static vector float __ATTRS_o_ai
+vec_vspltw(vector float __a, unsigned char __b)
+{
+  __b *= 4;
+  unsigned char b1=__b+1, b2=__b+2, b3=__b+3;
+  return vec_perm(__a, __a, (vector unsigned char)
+    (__b, b1, b2, b3, __b, b1, b2, b3, __b, b1, b2, b3, __b, b1, b2, b3));
+}
+
+/* vec_splat_s8 */
+
+#define __builtin_altivec_vspltisb vec_splat_s8
+
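+/* vec_splat_s8/s16/s32 and the unsigned variants splat a small immediate;
+   the underlying vspltis* instructions take a sign-extended 5-bit literal,
+   hence the FIXMEs below. */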
+// FIXME: parameter should be treated as 5-bit signed literal
+static vector signed char __ATTRS_o_ai
+vec_splat_s8(signed char __a)
+{
+  return (vector signed char)(__a);
+}
+
+/* vec_vspltisb */
+
+// FIXME: parameter should be treated as 5-bit signed literal
+static vector signed char __ATTRS_o_ai
+vec_vspltisb(signed char __a)
+{
+  return (vector signed char)(__a);
+}
+
+/* vec_splat_s16 */
+
+#define __builtin_altivec_vspltish vec_splat_s16
+
+// FIXME: parameter should be treated as 5-bit signed literal
+static vector short __ATTRS_o_ai
+vec_splat_s16(signed char __a)
+{
+  return (vector short)(__a);
+}
+
+/* vec_vspltish */
+
+// FIXME: parameter should be treated as 5-bit signed literal
+static vector short __ATTRS_o_ai
+vec_vspltish(signed char __a)
+{
+  return (vector short)(__a);
+}
+
+/* vec_splat_s32 */
+
+#define __builtin_altivec_vspltisw vec_splat_s32
+
+// FIXME: parameter should be treated as 5-bit signed literal
+static vector int __ATTRS_o_ai
+vec_splat_s32(signed char __a)
+{
+  return (vector int)(__a);
+}
+
+/* vec_vspltisw */
+
+// FIXME: parameter should be treated as 5-bit signed literal
+static vector int __ATTRS_o_ai
+vec_vspltisw(signed char __a)
+{
+  return (vector int)(__a);
+}
+
+/* vec_splat_u8 */
+
+// FIXME: parameter should be treated as 5-bit signed literal
+static vector unsigned char __ATTRS_o_ai
+vec_splat_u8(unsigned char __a)
+{
+  return (vector unsigned char)(__a);
+}
+
+/* vec_splat_u16 */
+
+// FIXME: parameter should be treated as 5-bit signed literal
+static vector unsigned short __ATTRS_o_ai
+vec_splat_u16(signed char __a)
+{
+  return (vector unsigned short)(__a);
+}
+
+/* vec_splat_u32 */
+
+// FIXME: parameter should be treated as 5-bit signed literal
+static vector unsigned int __ATTRS_o_ai
+vec_splat_u32(signed char __a)
+{
+  return (vector unsigned int)(__a);
+}
+
+/* vec_sr */
+
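+/* Implemented with the C >> operator on vector types; note that the signed
+   overloads therefore perform an arithmetic (sign-propagating) shift. */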
+static vector signed char __ATTRS_o_ai
+vec_sr(vector signed char __a, vector unsigned char __b)
+{
+  return __a >> (vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_sr(vector unsigned char __a, vector unsigned char __b)
+{
+  return __a >> __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_sr(vector short __a, vector unsigned short __b)
+{
+  return __a >> (vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_sr(vector unsigned short __a, vector unsigned short __b)
+{
+  return __a >> __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_sr(vector int __a, vector unsigned int __b)
+{
+  return __a >> (vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_sr(vector unsigned int __a, vector unsigned int __b)
+{
+  return __a >> __b;
+}
+
+/* vec_vsrb */
+
+#define __builtin_altivec_vsrb vec_vsrb
+
+static vector signed char __ATTRS_o_ai
+vec_vsrb(vector signed char __a, vector unsigned char __b)
+{
+  return __a >> (vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vsrb(vector unsigned char __a, vector unsigned char __b)
+{
+  return __a >> __b;
+}
+
+/* vec_vsrh */
+
+#define __builtin_altivec_vsrh vec_vsrh
+
+static vector short __ATTRS_o_ai
+vec_vsrh(vector short __a, vector unsigned short __b)
+{
+  return __a >> (vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vsrh(vector unsigned short __a, vector unsigned short __b)
+{
+  return __a >> __b;
+}
+
+/* vec_vsrw */
+
+#define __builtin_altivec_vsrw vec_vsrw
+
+static vector int __ATTRS_o_ai
+vec_vsrw(vector int __a, vector unsigned int __b)
+{
+  return __a >> (vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vsrw(vector unsigned int __a, vector unsigned int __b)
+{
+  return __a >> __b;
+}
+
+/* vec_sra */
+
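+/* Arithmetic (sign-propagating) shift right: each element is shifted by the
+   corresponding element of __b, taken modulo the element width. */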
+static vector signed char __ATTRS_o_ai
+vec_sra(vector signed char __a, vector unsigned char __b)
+{
+  return (vector signed char)__builtin_altivec_vsrab((vector char)__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_sra(vector unsigned char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)__builtin_altivec_vsrab((vector char)__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_sra(vector short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vsrah(__a, (vector unsigned short)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_sra(vector unsigned short __a, vector unsigned short __b)
+{
+  return (vector unsigned short)__builtin_altivec_vsrah((vector short)__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_sra(vector int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vsraw(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_sra(vector unsigned int __a, vector unsigned int __b)
+{
+  return (vector unsigned int)__builtin_altivec_vsraw((vector int)__a, __b);
+}
+
+/* vec_vsrab */
+
+static vector signed char __ATTRS_o_ai
+vec_vsrab(vector signed char __a, vector unsigned char __b)
+{
+  return (vector signed char)__builtin_altivec_vsrab((vector char)__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vsrab(vector unsigned char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)__builtin_altivec_vsrab((vector char)__a, __b);
+}
+
+/* vec_vsrah */
+
+static vector short __ATTRS_o_ai
+vec_vsrah(vector short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vsrah(__a, (vector unsigned short)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vsrah(vector unsigned short __a, vector unsigned short __b)
+{
+  return (vector unsigned short)__builtin_altivec_vsrah((vector short)__a, __b);
+}
+
+/* vec_vsraw */
+
+static vector int __ATTRS_o_ai
+vec_vsraw(vector int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vsraw(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vsraw(vector unsigned int __a, vector unsigned int __b)
+{
+  return (vector unsigned int)__builtin_altivec_vsraw((vector int)__a, __b);
+}
+
+/* vec_srl */
+
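+/* vsr shifts the entire 128-bit contents of __a right by a bit count
+   supplied in __b; the counterpart of vec_sll above. */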
+static vector signed char __ATTRS_o_ai
+vec_srl(vector signed char __a, vector unsigned char __b)
+{
+  return (vector signed char)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_srl(vector signed char __a, vector unsigned short __b)
+{
+  return (vector signed char)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_srl(vector signed char __a, vector unsigned int __b)
+{
+  return (vector signed char)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_srl(vector unsigned char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_srl(vector unsigned char __a, vector unsigned short __b)
+{
+  return (vector unsigned char)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_srl(vector unsigned char __a, vector unsigned int __b)
+{
+  return (vector unsigned char)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_srl(vector bool char __a, vector unsigned char __b)
+{
+  return (vector bool char)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_srl(vector bool char __a, vector unsigned short __b)
+{
+  return (vector bool char)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_srl(vector bool char __a, vector unsigned int __b)
+{
+  return (vector bool char)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_srl(vector short __a, vector unsigned char __b)
+{
+  return (vector short)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_srl(vector short __a, vector unsigned short __b)
+{
+  return (vector short)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_srl(vector short __a, vector unsigned int __b)
+{
+  return (vector short)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_srl(vector unsigned short __a, vector unsigned char __b)
+{
+  return (vector unsigned short)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_srl(vector unsigned short __a, vector unsigned short __b)
+{
+  return (vector unsigned short)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_srl(vector unsigned short __a, vector unsigned int __b)
+{
+  return (vector unsigned short)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_srl(vector bool short __a, vector unsigned char __b)
+{
+  return (vector bool short)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_srl(vector bool short __a, vector unsigned short __b)
+{
+  return (vector bool short)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_srl(vector bool short __a, vector unsigned int __b)
+{
+  return (vector bool short)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_srl(vector pixel __a, vector unsigned char __b)
+{
+  return (vector pixel)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_srl(vector pixel __a, vector unsigned short __b)
+{
+  return (vector pixel)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_srl(vector pixel __a, vector unsigned int __b)
+{
+  return (vector pixel)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_srl(vector int __a, vector unsigned char __b)
+{
+  return (vector int)__builtin_altivec_vsr(__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_srl(vector int __a, vector unsigned short __b)
+{
+  return (vector int)__builtin_altivec_vsr(__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_srl(vector int __a, vector unsigned int __b)
+{
+  return (vector int)__builtin_altivec_vsr(__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_srl(vector unsigned int __a, vector unsigned char __b)
+{
+  return (vector unsigned int)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_srl(vector unsigned int __a, vector unsigned short __b)
+{
+  return (vector unsigned int)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_srl(vector unsigned int __a, vector unsigned int __b)
+{
+  return (vector unsigned int)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_srl(vector bool int __a, vector unsigned char __b)
+{
+  return (vector bool int)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_srl(vector bool int __a, vector unsigned short __b)
+{
+  return (vector bool int)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_srl(vector bool int __a, vector unsigned int __b)
+{
+  return (vector bool int)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+/* vec_vsr */
+
+static vector signed char __ATTRS_o_ai
+vec_vsr(vector signed char __a, vector unsigned char __b)
+{
+  return (vector signed char)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vsr(vector signed char __a, vector unsigned short __b)
+{
+  return (vector signed char)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vsr(vector signed char __a, vector unsigned int __b)
+{
+  return (vector signed char)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vsr(vector unsigned char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vsr(vector unsigned char __a, vector unsigned short __b)
+{
+  return (vector unsigned char)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vsr(vector unsigned char __a, vector unsigned int __b)
+{
+  return (vector unsigned char)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_vsr(vector bool char __a, vector unsigned char __b)
+{
+  return (vector bool char)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_vsr(vector bool char __a, vector unsigned short __b)
+{
+  return (vector bool char)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_vsr(vector bool char __a, vector unsigned int __b)
+{
+  return (vector bool char)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_vsr(vector short __a, vector unsigned char __b)
+{
+  return (vector short)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_vsr(vector short __a, vector unsigned short __b)
+{
+  return (vector short)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_vsr(vector short __a, vector unsigned int __b)
+{
+  return (vector short)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vsr(vector unsigned short __a, vector unsigned char __b)
+{
+  return (vector unsigned short)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vsr(vector unsigned short __a, vector unsigned short __b)
+{
+  return (vector unsigned short)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vsr(vector unsigned short __a, vector unsigned int __b)
+{
+  return (vector unsigned short)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_vsr(vector bool short __a, vector unsigned char __b)
+{
+  return (vector bool short)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_vsr(vector bool short __a, vector unsigned short __b)
+{
+  return (vector bool short)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_vsr(vector bool short __a, vector unsigned int __b)
+{
+  return (vector bool short)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_vsr(vector pixel __a, vector unsigned char __b)
+{
+  return (vector pixel)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_vsr(vector pixel __a, vector unsigned short __b)
+{
+  return (vector pixel)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_vsr(vector pixel __a, vector unsigned int __b)
+{
+  return (vector pixel)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_vsr(vector int __a, vector unsigned char __b)
+{
+  return (vector int)__builtin_altivec_vsr(__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_vsr(vector int __a, vector unsigned short __b)
+{
+  return (vector int)__builtin_altivec_vsr(__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_vsr(vector int __a, vector unsigned int __b)
+{
+  return (vector int)__builtin_altivec_vsr(__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vsr(vector unsigned int __a, vector unsigned char __b)
+{
+  return (vector unsigned int)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vsr(vector unsigned int __a, vector unsigned short __b)
+{
+  return (vector unsigned int)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vsr(vector unsigned int __a, vector unsigned int __b)
+{
+  return (vector unsigned int)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_vsr(vector bool int __a, vector unsigned char __b)
+{
+  return (vector bool int)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_vsr(vector bool int __a, vector unsigned short __b)
+{
+  return (vector bool int)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_vsr(vector bool int __a, vector unsigned int __b)
+{
+  return (vector bool int)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+/* vec_sro */
+
+static vector signed char __ATTRS_o_ai
+vec_sro(vector signed char __a, vector signed char __b)
+{
+  return (vector signed char)
+           __builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_sro(vector signed char __a, vector unsigned char __b)
+{
+  return (vector signed char)
+           __builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_sro(vector unsigned char __a, vector signed char __b)
+{
+  return (vector unsigned char)
+           __builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_sro(vector unsigned char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)
+           __builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_sro(vector short __a, vector signed char __b)
+{
+  return (vector short)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_sro(vector short __a, vector unsigned char __b)
+{
+  return (vector short)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_sro(vector unsigned short __a, vector signed char __b)
+{
+  return (vector unsigned short)
+           __builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_sro(vector unsigned short __a, vector unsigned char __b)
+{
+  return (vector unsigned short)
+           __builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_sro(vector pixel __a, vector signed char __b)
+{
+  return (vector pixel)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_sro(vector pixel __a, vector unsigned char __b)
+{
+  return (vector pixel)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_sro(vector int __a, vector signed char __b)
+{
+  return (vector int)__builtin_altivec_vsro(__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_sro(vector int __a, vector unsigned char __b)
+{
+  return (vector int)__builtin_altivec_vsro(__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_sro(vector unsigned int __a, vector signed char __b)
+{
+  return (vector unsigned int)
+           __builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_sro(vector unsigned int __a, vector unsigned char __b)
+{
+  return (vector unsigned int)
+           __builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector float __ATTRS_o_ai
+vec_sro(vector float __a, vector signed char __b)
+{
+  return (vector float)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector float __ATTRS_o_ai
+vec_sro(vector float __a, vector unsigned char __b)
+{
+  return (vector float)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+/* vec_vsro */
+
+static vector signed char __ATTRS_o_ai
+vec_vsro(vector signed char __a, vector signed char __b)
+{
+  return (vector signed char)
+           __builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vsro(vector signed char __a, vector unsigned char __b)
+{
+  return (vector signed char)
+           __builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vsro(vector unsigned char __a, vector signed char __b)
+{
+  return (vector unsigned char)
+           __builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vsro(vector unsigned char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)
+           __builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_vsro(vector short __a, vector signed char __b)
+{
+  return (vector short)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_vsro(vector short __a, vector unsigned char __b)
+{
+  return (vector short)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vsro(vector unsigned short __a, vector signed char __b)
+{
+  return (vector unsigned short)
+           __builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vsro(vector unsigned short __a, vector unsigned char __b)
+{
+  return (vector unsigned short)
+           __builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_vsro(vector pixel __a, vector signed char __b)
+{
+  return (vector pixel)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_vsro(vector pixel __a, vector unsigned char __b)
+{
+  return (vector pixel)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_vsro(vector int __a, vector signed char __b)
+{
+  return (vector int)__builtin_altivec_vsro(__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_vsro(vector int __a, vector unsigned char __b)
+{
+  return (vector int)__builtin_altivec_vsro(__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vsro(vector unsigned int __a, vector signed char __b)
+{
+  return (vector unsigned int)
+           __builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vsro(vector unsigned int __a, vector unsigned char __b)
+{
+  return (vector unsigned int)
+           __builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector float __ATTRS_o_ai
+vec_vsro(vector float __a, vector signed char __b)
+{
+  return (vector float)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector float __ATTRS_o_ai
+vec_vsro(vector float __a, vector unsigned char __b)
+{
+  return (vector float)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+/* vec_st */
+
+static void __ATTRS_o_ai
+vec_st(vector signed char __a, int __b, vector signed char *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector signed char __a, int __b, signed char *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector unsigned char __a, int __b, vector unsigned char *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector unsigned char __a, int __b, unsigned char *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector bool char __a, int __b, signed char *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector bool char __a, int __b, unsigned char *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector bool char __a, int __b, vector bool char *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector short __a, int __b, vector short *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector short __a, int __b, short *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector unsigned short __a, int __b, vector unsigned short *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector unsigned short __a, int __b, unsigned short *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector bool short __a, int __b, short *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector bool short __a, int __b, unsigned short *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector bool short __a, int __b, vector bool short *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector pixel __a, int __b, short *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector pixel __a, int __b, unsigned short *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector pixel __a, int __b, vector pixel *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector int __a, int __b, vector int *__c)
+{
+  __builtin_altivec_stvx(__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector int __a, int __b, int *__c)
+{
+  __builtin_altivec_stvx(__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector unsigned int __a, int __b, vector unsigned int *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector unsigned int __a, int __b, unsigned int *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector bool int __a, int __b, int *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector bool int __a, int __b, unsigned int *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector bool int __a, int __b, vector bool int *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector float __a, int __b, vector float *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector float __a, int __b, float *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+/* vec_stvx */
+
+static void __ATTRS_o_ai
+vec_stvx(vector signed char __a, int __b, vector signed char *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector signed char __a, int __b, signed char *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector unsigned char __a, int __b, vector unsigned char *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector unsigned char __a, int __b, unsigned char *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector bool char __a, int __b, signed char *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector bool char __a, int __b, unsigned char *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector bool char __a, int __b, vector bool char *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector short __a, int __b, vector short *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector short __a, int __b, short *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector unsigned short __a, int __b, vector unsigned short *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector unsigned short __a, int __b, unsigned short *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector bool short __a, int __b, short *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector bool short __a, int __b, unsigned short *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector bool short __a, int __b, vector bool short *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector pixel __a, int __b, short *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector pixel __a, int __b, unsigned short *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector pixel __a, int __b, vector pixel *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector int __a, int __b, vector int *__c)
+{
+  __builtin_altivec_stvx(__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector int __a, int __b, int *__c)
+{
+  __builtin_altivec_stvx(__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector unsigned int __a, int __b, vector unsigned int *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector unsigned int __a, int __b, unsigned int *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector bool int __a, int __b, int *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector bool int __a, int __b, unsigned int *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector bool int __a, int __b, vector bool int *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector float __a, int __b, vector float *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector float __a, int __b, float *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+/* vec_ste */
+
+static void __ATTRS_o_ai
+vec_ste(vector signed char __a, int __b, signed char *__c)
+{
+  __builtin_altivec_stvebx((vector char)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_ste(vector unsigned char __a, int __b, unsigned char *__c)
+{
+  __builtin_altivec_stvebx((vector char)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_ste(vector bool char __a, int __b, signed char *__c)
+{
+  __builtin_altivec_stvebx((vector char)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_ste(vector bool char __a, int __b, unsigned char *__c)
+{
+  __builtin_altivec_stvebx((vector char)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_ste(vector short __a, int __b, short *__c)
+{
+  __builtin_altivec_stvehx(__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_ste(vector unsigned short __a, int __b, unsigned short *__c)
+{
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_ste(vector bool short __a, int __b, short *__c)
+{
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_ste(vector bool short __a, int __b, unsigned short *__c)
+{
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_ste(vector pixel __a, int __b, short *__c)
+{
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_ste(vector pixel __a, int __b, unsigned short *__c)
+{
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_ste(vector int __a, int __b, int *__c)
+{
+  __builtin_altivec_stvewx(__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_ste(vector unsigned int __a, int __b, unsigned int *__c)
+{
+  __builtin_altivec_stvewx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_ste(vector bool int __a, int __b, int *__c)
+{
+  __builtin_altivec_stvewx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_ste(vector bool int __a, int __b, unsigned int *__c)
+{
+  __builtin_altivec_stvewx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_ste(vector float __a, int __b, float *__c)
+{
+  __builtin_altivec_stvewx((vector int)__a, __b, __c);
+}
+
+/* vec_stvebx */
+
+static void __ATTRS_o_ai
+vec_stvebx(vector signed char __a, int __b, signed char *__c)
+{
+  __builtin_altivec_stvebx((vector char)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvebx(vector unsigned char __a, int __b, unsigned char *__c)
+{
+  __builtin_altivec_stvebx((vector char)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvebx(vector bool char __a, int __b, signed char *__c)
+{
+  __builtin_altivec_stvebx((vector char)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvebx(vector bool char __a, int __b, unsigned char *__c)
+{
+  __builtin_altivec_stvebx((vector char)__a, __b, __c);
+}
+
+/* vec_stvehx */
+
+static void __ATTRS_o_ai
+vec_stvehx(vector short __a, int __b, short *__c)
+{
+  __builtin_altivec_stvehx(__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvehx(vector unsigned short __a, int __b, unsigned short *__c)
+{
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvehx(vector bool short __a, int __b, short *__c)
+{
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvehx(vector bool short __a, int __b, unsigned short *__c)
+{
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvehx(vector pixel __a, int __b, short *__c)
+{
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvehx(vector pixel __a, int __b, unsigned short *__c)
+{
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+/* vec_stvewx */
+
+static void __ATTRS_o_ai
+vec_stvewx(vector int __a, int __b, int *__c)
+{
+  __builtin_altivec_stvewx(__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvewx(vector unsigned int __a, int __b, unsigned int *__c)
+{
+  __builtin_altivec_stvewx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvewx(vector bool int __a, int __b, int *__c)
+{
+  __builtin_altivec_stvewx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvewx(vector bool int __a, int __b, unsigned int *__c)
+{
+  __builtin_altivec_stvewx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvewx(vector float __a, int __b, float *__c)
+{
+  __builtin_altivec_stvewx((vector int)__a, __b, __c);
+}
+
+/* vec_stl */
+
+static void __ATTRS_o_ai
+vec_stl(vector signed char __a, int __b, vector signed char *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector signed char __a, int __b, signed char *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector unsigned char __a, int __b, vector unsigned char *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector unsigned char __a, int __b, unsigned char *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector bool char __a, int __b, signed char *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector bool char __a, int __b, unsigned char *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector bool char __a, int __b, vector bool char *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector short __a, int __b, vector short *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector short __a, int __b, short *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector unsigned short __a, int __b, vector unsigned short *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector unsigned short __a, int __b, unsigned short *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector bool short __a, int __b, short *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector bool short __a, int __b, unsigned short *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector bool short __a, int __b, vector bool short *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector pixel __a, int __b, short *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector pixel __a, int __b, unsigned short *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector pixel __a, int __b, vector pixel *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector int __a, int __b, vector int *__c)
+{
+  __builtin_altivec_stvxl(__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector int __a, int __b, int *__c)
+{
+  __builtin_altivec_stvxl(__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector unsigned int __a, int __b, vector unsigned int *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector unsigned int __a, int __b, unsigned int *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector bool int __a, int __b, int *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector bool int __a, int __b, unsigned int *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector bool int __a, int __b, vector bool int *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector float __a, int __b, vector float *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector float __a, int __b, float *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+/* vec_stvxl */
+
+static void __ATTRS_o_ai
+vec_stvxl(vector signed char __a, int __b, vector signed char *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector signed char __a, int __b, signed char *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector unsigned char __a, int __b, vector unsigned char *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector unsigned char __a, int __b, unsigned char *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector bool char __a, int __b, signed char *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector bool char __a, int __b, unsigned char *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector bool char __a, int __b, vector bool char *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector short __a, int __b, vector short *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector short __a, int __b, short *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector unsigned short __a, int __b, vector unsigned short *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector unsigned short __a, int __b, unsigned short *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector bool short __a, int __b, short *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector bool short __a, int __b, unsigned short *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector bool short __a, int __b, vector bool short *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector pixel __a, int __b, short *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector pixel __a, int __b, unsigned short *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector pixel __a, int __b, vector pixel *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector int __a, int __b, vector int *__c)
+{
+  __builtin_altivec_stvxl(__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector int __a, int __b, int *__c)
+{
+  __builtin_altivec_stvxl(__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector unsigned int __a, int __b, vector unsigned int *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector unsigned int __a, int __b, unsigned int *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector bool int __a, int __b, int *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector bool int __a, int __b, unsigned int *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector bool int __a, int __b, vector bool int *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector float __a, int __b, vector float *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector float __a, int __b, float *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+/* vec_sub */
+
+static vector signed char __ATTRS_o_ai
+vec_sub(vector signed char __a, vector signed char __b)
+{
+  return __a - __b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_sub(vector bool char __a, vector signed char __b)
+{
+  return (vector signed char)__a - __b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_sub(vector signed char __a, vector bool char __b)
+{
+  return __a - (vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_sub(vector unsigned char __a, vector unsigned char __b)
+{
+  return __a - __b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_sub(vector bool char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)__a - __b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_sub(vector unsigned char __a, vector bool char __b)
+{
+  return __a - (vector unsigned char)__b;
+}
+
+static vector short __ATTRS_o_ai
+vec_sub(vector short __a, vector short __b)
+{
+  return __a - __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_sub(vector bool short __a, vector short __b)
+{
+  return (vector short)__a - __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_sub(vector short __a, vector bool short __b)
+{
+  return __a - (vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_sub(vector unsigned short __a, vector unsigned short __b)
+{
+  return __a - __b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_sub(vector bool short __a, vector unsigned short __b)
+{
+  return (vector unsigned short)__a - __b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_sub(vector unsigned short __a, vector bool short __b)
+{
+  return __a - (vector unsigned short)__b;
+}
+
+static vector int __ATTRS_o_ai
+vec_sub(vector int __a, vector int __b)
+{
+  return __a - __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_sub(vector bool int __a, vector int __b)
+{
+  return (vector int)__a - __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_sub(vector int __a, vector bool int __b)
+{
+  return __a - (vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_sub(vector unsigned int __a, vector unsigned int __b)
+{
+  return __a - __b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_sub(vector bool int __a, vector unsigned int __b)
+{
+  return (vector unsigned int)__a - __b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_sub(vector unsigned int __a, vector bool int __b)
+{
+  return __a - (vector unsigned int)__b;
+}
+
+static vector float __ATTRS_o_ai
+vec_sub(vector float __a, vector float __b)
+{
+  return __a - __b;
+}
+
+/* vec_vsububm */
+
+#define __builtin_altivec_vsububm vec_vsububm
+
+static vector signed char __ATTRS_o_ai
+vec_vsububm(vector signed char __a, vector signed char __b)
+{
+  return __a - __b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vsububm(vector bool char __a, vector signed char __b)
+{
+  return (vector signed char)__a - __b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vsububm(vector signed char __a, vector bool char __b)
+{
+  return __a - (vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vsububm(vector unsigned char __a, vector unsigned char __b)
+{
+  return __a - __b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vsububm(vector bool char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)__a - __b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vsububm(vector unsigned char __a, vector bool char __b)
+{
+  return __a - (vector unsigned char)__b;
+}
+
+/* vec_vsubuhm */
+
+#define __builtin_altivec_vsubuhm vec_vsubuhm
+
+static vector short __ATTRS_o_ai
+vec_vsubuhm(vector short __a, vector short __b)
+{
+  return __a - __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_vsubuhm(vector bool short __a, vector short __b)
+{
+  return (vector short)__a - __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_vsubuhm(vector short __a, vector bool short __b)
+{
+  return __a - (vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vsubuhm(vector unsigned short __a, vector unsigned short __b)
+{
+  return __a - __b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vsubuhm(vector bool short __a, vector unsigned short __b)
+{
+  return (vector unsigned short)__a - __b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vsubuhm(vector unsigned short __a, vector bool short __b)
+{
+  return __a - (vector unsigned short)__b;
+}
+
+/* vec_vsubuwm */
+
+#define __builtin_altivec_vsubuwm vec_vsubuwm
+
+static vector int __ATTRS_o_ai
+vec_vsubuwm(vector int __a, vector int __b)
+{
+  return __a - __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_vsubuwm(vector bool int __a, vector int __b)
+{
+  return (vector int)__a - __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_vsubuwm(vector int __a, vector bool int __b)
+{
+  return __a - (vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vsubuwm(vector unsigned int __a, vector unsigned int __b)
+{
+  return __a - __b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vsubuwm(vector bool int __a, vector unsigned int __b)
+{
+  return (vector unsigned int)__a - __b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vsubuwm(vector unsigned int __a, vector bool int __b)
+{
+  return __a - (vector unsigned int)__b;
+}
+
+/* vec_vsubfp */
+
+#define __builtin_altivec_vsubfp vec_vsubfp
+
+static vector float __attribute__((__always_inline__))
+vec_vsubfp(vector float __a, vector float __b)
+{
+  return __a - __b;
+}
+
+/* vec_subc */
+
+static vector unsigned int __attribute__((__always_inline__))
+vec_subc(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vsubcuw(__a, __b);
+}
+
+/* vec_vsubcuw */
+
+static vector unsigned int __attribute__((__always_inline__))
+vec_vsubcuw(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vsubcuw(__a, __b);
+}
+
+/* vec_subs */
+
+static vector signed char __ATTRS_o_ai
+vec_subs(vector signed char __a, vector signed char __b)
+{
+  return __builtin_altivec_vsubsbs(__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_subs(vector bool char __a, vector signed char __b)
+{
+  return __builtin_altivec_vsubsbs((vector signed char)__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_subs(vector signed char __a, vector bool char __b)
+{
+  return __builtin_altivec_vsubsbs(__a, (vector signed char)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_subs(vector unsigned char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vsububs(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_subs(vector bool char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vsububs((vector unsigned char)__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_subs(vector unsigned char __a, vector bool char __b)
+{
+  return __builtin_altivec_vsububs(__a, (vector unsigned char)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_subs(vector short __a, vector short __b)
+{
+  return __builtin_altivec_vsubshs(__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_subs(vector bool short __a, vector short __b)
+{
+  return __builtin_altivec_vsubshs((vector short)__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_subs(vector short __a, vector bool short __b)
+{
+  return __builtin_altivec_vsubshs(__a, (vector short)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_subs(vector unsigned short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vsubuhs(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_subs(vector bool short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vsubuhs((vector unsigned short)__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_subs(vector unsigned short __a, vector bool short __b)
+{
+  return __builtin_altivec_vsubuhs(__a, (vector unsigned short)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_subs(vector int __a, vector int __b)
+{
+  return __builtin_altivec_vsubsws(__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_subs(vector bool int __a, vector int __b)
+{
+  return __builtin_altivec_vsubsws((vector int)__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_subs(vector int __a, vector bool int __b)
+{
+  return __builtin_altivec_vsubsws(__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_subs(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vsubuws(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_subs(vector bool int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vsubuws((vector unsigned int)__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_subs(vector unsigned int __a, vector bool int __b)
+{
+  return __builtin_altivec_vsubuws(__a, (vector unsigned int)__b);
+}
+
+/* vec_vsubsbs */
+
+static vector signed char __ATTRS_o_ai
+vec_vsubsbs(vector signed char __a, vector signed char __b)
+{
+  return __builtin_altivec_vsubsbs(__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vsubsbs(vector bool char __a, vector signed char __b)
+{
+  return __builtin_altivec_vsubsbs((vector signed char)__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vsubsbs(vector signed char __a, vector bool char __b)
+{
+  return __builtin_altivec_vsubsbs(__a, (vector signed char)__b);
+}
+
+/* vec_vsububs */
+
+static vector unsigned char __ATTRS_o_ai
+vec_vsububs(vector unsigned char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vsububs(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vsububs(vector bool char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vsububs((vector unsigned char)__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vsububs(vector unsigned char __a, vector bool char __b)
+{
+  return __builtin_altivec_vsububs(__a, (vector unsigned char)__b);
+}
+
+/* vec_vsubshs */
+
+static vector short __ATTRS_o_ai
+vec_vsubshs(vector short __a, vector short __b)
+{
+  return __builtin_altivec_vsubshs(__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_vsubshs(vector bool short __a, vector short __b)
+{
+  return __builtin_altivec_vsubshs((vector short)__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_vsubshs(vector short __a, vector bool short __b)
+{
+  return __builtin_altivec_vsubshs(__a, (vector short)__b);
+}
+
+/* vec_vsubuhs */
+
+static vector unsigned short __ATTRS_o_ai
+vec_vsubuhs(vector unsigned short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vsubuhs(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vsubuhs(vector bool short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vsubuhs((vector unsigned short)__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vsubuhs(vector unsigned short __a, vector bool short __b)
+{
+  return __builtin_altivec_vsubuhs(__a, (vector unsigned short)__b);
+}
+
+/* vec_vsubsws */
+
+static vector int __ATTRS_o_ai
+vec_vsubsws(vector int __a, vector int __b)
+{
+  return __builtin_altivec_vsubsws(__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_vsubsws(vector bool int __a, vector int __b)
+{
+  return __builtin_altivec_vsubsws((vector int)__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_vsubsws(vector int __a, vector bool int __b)
+{
+  return __builtin_altivec_vsubsws(__a, (vector int)__b);
+}
+
+/* vec_vsubuws */
+
+static vector unsigned int __ATTRS_o_ai
+vec_vsubuws(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vsubuws(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vsubuws(vector bool int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vsubuws((vector unsigned int)__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vsubuws(vector unsigned int __a, vector bool int __b)
+{
+  return __builtin_altivec_vsubuws(__a, (vector unsigned int)__b);
+}
+
+/* vec_sum4s */
+
+static vector int __ATTRS_o_ai
+vec_sum4s(vector signed char __a, vector int __b)
+{
+  return __builtin_altivec_vsum4sbs(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_sum4s(vector unsigned char __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vsum4ubs(__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_sum4s(vector signed short __a, vector int __b)
+{
+  return __builtin_altivec_vsum4shs(__a, __b);
+}
+
+/* vec_vsum4sbs */
+
+static vector int __attribute__((__always_inline__))
+vec_vsum4sbs(vector signed char __a, vector int __b)
+{
+  return __builtin_altivec_vsum4sbs(__a, __b);
+}
+
+/* vec_vsum4ubs */
+
+static vector unsigned int __attribute__((__always_inline__))
+vec_vsum4ubs(vector unsigned char __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vsum4ubs(__a, __b);
+}
+
+/* vec_vsum4shs */
+
+static vector int __attribute__((__always_inline__))
+vec_vsum4shs(vector signed short __a, vector int __b)
+{
+  return __builtin_altivec_vsum4shs(__a, __b);
+}
+
+/* vec_sum2s */
+
+/* The vsum2sws instruction has a big-endian bias, so that the second
+   input vector and the result always reference big-endian elements
+   1 and 3 (little-endian elements 0 and 2).  For ease of porting, the
+   programmer wants elements 1 and 3 in both cases, so for little
+   endian we must perform some permutes.  */
+
+static vector signed int __attribute__((__always_inline__))
+vec_sum2s(vector int __a, vector int __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  vector int __c = (vector signed int)
+    vec_perm(__b, __b, (vector unsigned char)
+	     (4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11));
+  __c = __builtin_altivec_vsum2sws(__a, __c);
+  return (vector signed int)
+    vec_perm(__c, __c, (vector unsigned char)
+	     (4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11));
+#else
+  return __builtin_altivec_vsum2sws(__a, __b);
+#endif
+}
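+
+/* Illustrative usage sketch: a hypothetical call showing the element-1 and
+   element-3 results that the little-endian fix-up above preserves,
+   assuming AltiVec vector literal syntax; __va, __vb and __vr are
+   placeholder names, not part of the API.
+
+     vector int __va = (vector int)(1, 2, 3, 4);
+     vector int __vb = (vector int)(0, 10, 0, 20);
+     vector int __vr = vec_sum2s(__va, __vb);
+     // __vr == (0, 1+2+10, 0, 3+4+20) == (0, 13, 0, 27);
+     // the operands are small enough that no saturation occurs.
+*/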
+
+/* vec_vsum2sws */
+
+static vector signed int __attribute__((__always_inline__))
+vec_vsum2sws(vector int __a, vector int __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  vector int __c = (vector signed int)
+    vec_perm(__b, __b, (vector unsigned char)
+	     (4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11));
+  __c = __builtin_altivec_vsum2sws(__a, __c);
+  return (vector signed int)
+    vec_perm(__c, __c, (vector unsigned char)
+	     (4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11));
+#else
+  return __builtin_altivec_vsum2sws(__a, __b);
+#endif
+}
+
+/* vec_sums */
+
+/* The vsumsws instruction has a big-endian bias, so that the second
+   input vector and the result always reference big-endian element 3
+   (little-endian element 0).  For ease of porting, the programmer
+   wants element 3 in both cases, so for little endian we must perform
+   some permutes.  */
+
+static vector signed int __attribute__((__always_inline__))
+vec_sums(vector signed int __a, vector signed int __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  __b = (vector signed int)
+    vec_perm(__b, __b, (vector unsigned char)
+	     (12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11));
+  __b = __builtin_altivec_vsumsws(__a, __b);
+  return (vector signed int)
+    vec_perm(__b, __b, (vector unsigned char)
+	     (4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3));
+#else
+  return __builtin_altivec_vsumsws(__a, __b);
+#endif
+}
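+
+/* Illustrative usage sketch: a hypothetical call showing the element-3 sum
+   that the permutes above preserve on little-endian targets, assuming
+   AltiVec vector literal syntax; __va, __vb and __vr are placeholder
+   names, not part of the API.
+
+     vector signed int __va = (vector signed int)(1, 2, 3, 4);
+     vector signed int __vb = (vector signed int)(0, 0, 0, 100);
+     vector signed int __vr = vec_sums(__va, __vb);
+     // __vr == (0, 0, 0, 1+2+3+4+100) == (0, 0, 0, 110);
+     // the result saturates only if the signed 32-bit range is exceeded.
+*/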
+
+/* vec_vsumsws */
+
+static vector signed int __attribute__((__always_inline__))
+vec_vsumsws(vector signed int __a, vector signed int __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  __b = (vector signed int)
+    vec_perm(__b, __b, (vector unsigned char)
+	     (12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11));
+  __b = __builtin_altivec_vsumsws(__a, __b);
+  return (vector signed int)
+    vec_perm(__b, __b, (vector unsigned char)
+	     (4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3));
+#else
+  return __builtin_altivec_vsumsws(__a, __b);
+#endif
+}
+
+/* vec_trunc */
+
+static vector float __attribute__((__always_inline__))
+vec_trunc(vector float __a)
+{
+  return __builtin_altivec_vrfiz(__a);
+}
+
+/* vec_vrfiz */
+
+static vector float __attribute__((__always_inline__))
+vec_vrfiz(vector float __a)
+{
+  return __builtin_altivec_vrfiz(__a);
+}
+
+/* vec_unpackh */
+
+/* The vector unpack instructions all have a big-endian bias, so for
+   little endian we must reverse the meanings of "high" and "low."  */
+
+static vector short __ATTRS_o_ai
+vec_unpackh(vector signed char __a)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupklsb((vector char)__a);
+#else
+  return __builtin_altivec_vupkhsb((vector char)__a);
+#endif
+}
+
+static vector bool short __ATTRS_o_ai
+vec_unpackh(vector bool char __a)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool short)__builtin_altivec_vupklsb((vector char)__a);
+#else
+  return (vector bool short)__builtin_altivec_vupkhsb((vector char)__a);
+#endif
+}
+
+static vector int __ATTRS_o_ai
+vec_unpackh(vector short __a)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupklsh(__a);
+#else
+  return __builtin_altivec_vupkhsh(__a);
+#endif
+}
+
+static vector bool int __ATTRS_o_ai
+vec_unpackh(vector bool short __a)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool int)__builtin_altivec_vupklsh((vector short)__a);
+#else
+  return (vector bool int)__builtin_altivec_vupkhsh((vector short)__a);
+#endif
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_unpackh(vector pixel __a)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector unsigned int)__builtin_altivec_vupklpx((vector short)__a);
+#else
+  return (vector unsigned int)__builtin_altivec_vupkhpx((vector short)__a);
+#endif
+}
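+
+/* Illustrative usage sketch: with the high/low swap above, vec_unpackh
+   sign-extends the first eight char elements (in the programmer's element
+   order) into shorts on either endianness.  A hypothetical example,
+   assuming AltiVec vector literal syntax; __vc and __vh are placeholder
+   names, not part of the API.
+
+     vector signed char __vc =
+       (vector signed char)(-1, 2, -3, 4, -5, 6, -7, 8,
+                            9, 10, 11, 12, 13, 14, 15, 16);
+     vector short __vh = vec_unpackh(__vc);
+     // __vh == (-1, 2, -3, 4, -5, 6, -7, 8), each widened to 16 bits.
+*/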
+
+/* vec_vupkhsb */
+
+static vector short __ATTRS_o_ai
+vec_vupkhsb(vector signed char __a)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupklsb((vector char)__a);
+#else
+  return __builtin_altivec_vupkhsb((vector char)__a);
+#endif
+}
+
+static vector bool short __ATTRS_o_ai
+vec_vupkhsb(vector bool char __a)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool short)__builtin_altivec_vupklsb((vector char)__a);
+#else
+  return (vector bool short)__builtin_altivec_vupkhsb((vector char)__a);
+#endif
+}
+
+/* vec_vupkhsh */
+
+static vector int __ATTRS_o_ai
+vec_vupkhsh(vector short __a)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupklsh(__a);
+#else
+  return __builtin_altivec_vupkhsh(__a);
+#endif
+}
+
+static vector bool int __ATTRS_o_ai
+vec_vupkhsh(vector bool short __a)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool int)__builtin_altivec_vupklsh((vector short)__a);
+#else
+  return (vector bool int)__builtin_altivec_vupkhsh((vector short)__a);
+#endif
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vupkhsh(vector pixel __a)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector unsigned int)__builtin_altivec_vupklpx((vector short)__a);
+#else
+  return (vector unsigned int)__builtin_altivec_vupkhpx((vector short)__a);
+#endif
+}
+
+/* vec_unpackl */
+
+static vector short __ATTRS_o_ai
+vec_unpackl(vector signed char __a)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupkhsb((vector char)__a);
+#else
+  return __builtin_altivec_vupklsb((vector char)__a);
+#endif
+}
+
+static vector bool short __ATTRS_o_ai
+vec_unpackl(vector bool char __a)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool short)__builtin_altivec_vupkhsb((vector char)__a);
+#else
+  return (vector bool short)__builtin_altivec_vupklsb((vector char)__a);
+#endif
+}
+
+static vector int __ATTRS_o_ai
+vec_unpackl(vector short __a)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupkhsh(__a);
+#else
+  return __builtin_altivec_vupklsh(__a);
+#endif
+}
+
+static vector bool int __ATTRS_o_ai
+vec_unpackl(vector bool short __a)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool int)__builtin_altivec_vupkhsh((vector short)__a);
+#else
+  return (vector bool int)__builtin_altivec_vupklsh((vector short)__a);
+#endif
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_unpackl(vector pixel __a)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector unsigned int)__builtin_altivec_vupkhpx((vector short)__a);
+#else
+  return (vector unsigned int)__builtin_altivec_vupklpx((vector short)__a);
+#endif
+}
+
+/* vec_vupklsb */
+
+static vector short __ATTRS_o_ai
+vec_vupklsb(vector signed char __a)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupkhsb((vector char)__a);
+#else
+  return __builtin_altivec_vupklsb((vector char)__a);
+#endif
+}
+
+static vector bool short __ATTRS_o_ai
+vec_vupklsb(vector bool char __a)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool short)__builtin_altivec_vupkhsb((vector char)__a);
+#else
+  return (vector bool short)__builtin_altivec_vupklsb((vector char)__a);
+#endif
+}
+
+/* vec_vupklsh */
+
+static vector int __ATTRS_o_ai
+vec_vupklsh(vector short __a)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupkhsh(__a);
+#else
+  return __builtin_altivec_vupklsh(__a);
+#endif
+}
+
+static vector bool int __ATTRS_o_ai
+vec_vupklsh(vector bool short __a)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool int)__builtin_altivec_vupkhsh((vector short)__a);
+#else
+  return (vector bool int)__builtin_altivec_vupklsh((vector short)__a);
+#endif
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vupklsh(vector pixel __a)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector unsigned int)__builtin_altivec_vupkhpx((vector short)__a);
+#else
+  return (vector unsigned int)__builtin_altivec_vupklpx((vector short)__a);
+#endif
+}
+
+/* vec_xor */
+
+#define __builtin_altivec_vxor vec_xor
+
+static vector signed char __ATTRS_o_ai
+vec_xor(vector signed char __a, vector signed char __b)
+{
+  return __a ^ __b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_xor(vector bool char __a, vector signed char __b)
+{
+  return (vector signed char)__a ^ __b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_xor(vector signed char __a, vector bool char __b)
+{
+  return __a ^ (vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_xor(vector unsigned char __a, vector unsigned char __b)
+{
+  return __a ^ __b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_xor(vector bool char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)__a ^ __b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_xor(vector unsigned char __a, vector bool char __b)
+{
+  return __a ^ (vector unsigned char)__b;
+}
+
+static vector bool char __ATTRS_o_ai
+vec_xor(vector bool char __a, vector bool char __b)
+{
+  return __a ^ __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_xor(vector short __a, vector short __b)
+{
+  return __a ^ __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_xor(vector bool short __a, vector short __b)
+{
+  return (vector short)__a ^ __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_xor(vector short __a, vector bool short __b)
+{
+  return __a ^ (vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_xor(vector unsigned short __a, vector unsigned short __b)
+{
+  return __a ^ __b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_xor(vector bool short __a, vector unsigned short __b)
+{
+  return (vector unsigned short)__a ^ __b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_xor(vector unsigned short __a, vector bool short __b)
+{
+  return __a ^ (vector unsigned short)__b;
+}
+
+static vector bool short __ATTRS_o_ai
+vec_xor(vector bool short __a, vector bool short __b)
+{
+  return __a ^ __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_xor(vector int __a, vector int __b)
+{
+  return __a ^ __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_xor(vector bool int __a, vector int __b)
+{
+  return (vector int)__a ^ __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_xor(vector int __a, vector bool int __b)
+{
+  return __a ^ (vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_xor(vector unsigned int __a, vector unsigned int __b)
+{
+  return __a ^ __b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_xor(vector bool int __a, vector unsigned int __b)
+{
+  return (vector unsigned int)__a ^ __b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_xor(vector unsigned int __a, vector bool int __b)
+{
+  return __a ^ (vector unsigned int)__b;
+}
+
+static vector bool int __ATTRS_o_ai
+vec_xor(vector bool int __a, vector bool int __b)
+{
+  return __a ^ __b;
+}
+
+static vector float __ATTRS_o_ai
+vec_xor(vector float __a, vector float __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a ^ (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai
+vec_xor(vector bool int __a, vector float __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a ^ (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai
+vec_xor(vector float __a, vector bool int __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a ^ (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+/* vec_vxor */
+
+static vector signed char __ATTRS_o_ai
+vec_vxor(vector signed char __a, vector signed char __b)
+{
+  return __a ^ __b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vxor(vector bool char __a, vector signed char __b)
+{
+  return (vector signed char)__a ^ __b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vxor(vector signed char __a, vector bool char __b)
+{
+  return __a ^ (vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vxor(vector unsigned char __a, vector unsigned char __b)
+{
+  return __a ^ __b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vxor(vector bool char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)__a ^ __b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vxor(vector unsigned char __a, vector bool char __b)
+{
+  return __a ^ (vector unsigned char)__b;
+}
+
+static vector bool char __ATTRS_o_ai
+vec_vxor(vector bool char __a, vector bool char __b)
+{
+  return __a ^ __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_vxor(vector short __a, vector short __b)
+{
+  return __a ^ __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_vxor(vector bool short __a, vector short __b)
+{
+  return (vector short)__a ^ __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_vxor(vector short __a, vector bool short __b)
+{
+  return __a ^ (vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vxor(vector unsigned short __a, vector unsigned short __b)
+{
+  return __a ^ __b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vxor(vector bool short __a, vector unsigned short __b)
+{
+  return (vector unsigned short)__a ^ __b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vxor(vector unsigned short __a, vector bool short __b)
+{
+  return __a ^ (vector unsigned short)__b;
+}
+
+static vector bool short __ATTRS_o_ai
+vec_vxor(vector bool short __a, vector bool short __b)
+{
+  return __a ^ __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_vxor(vector int __a, vector int __b)
+{
+  return __a ^ __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_vxor(vector bool int __a, vector int __b)
+{
+  return (vector int)__a ^ __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_vxor(vector int __a, vector bool int __b)
+{
+  return __a ^ (vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vxor(vector unsigned int __a, vector unsigned int __b)
+{
+  return __a ^ __b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vxor(vector bool int __a, vector unsigned int __b)
+{
+  return (vector unsigned int)__a ^ __b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vxor(vector unsigned int __a, vector bool int __b)
+{
+  return __a ^ (vector unsigned int)__b;
+}
+
+static vector bool int __ATTRS_o_ai
+vec_vxor(vector bool int __a, vector bool int __b)
+{
+  return __a ^ __b;
+}
+
+static vector float __ATTRS_o_ai
+vec_vxor(vector float __a, vector float __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a ^ (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai
+vec_vxor(vector bool int __a, vector float __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a ^ (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai
+vec_vxor(vector float __a, vector bool int __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a ^ (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+/* ------------------------ extensions for CBEA ----------------------------- */
+
+/* vec_extract */
+
+static signed char __ATTRS_o_ai
+vec_extract(vector signed char __a, int __b)
+{
+  return __a[__b];
+}
+
+static unsigned char __ATTRS_o_ai
+vec_extract(vector unsigned char __a, int __b)
+{
+  return __a[__b];
+}
+
+static short __ATTRS_o_ai
+vec_extract(vector short __a, int __b)
+{
+  return __a[__b];
+}
+
+static unsigned short __ATTRS_o_ai
+vec_extract(vector unsigned short __a, int __b)
+{
+  return __a[__b];
+}
+
+static int __ATTRS_o_ai
+vec_extract(vector int __a, int __b)
+{
+  return __a[__b];
+}
+
+static unsigned int __ATTRS_o_ai
+vec_extract(vector unsigned int __a, int __b)
+{
+  return __a[__b];
+}
+
+static float __ATTRS_o_ai
+vec_extract(vector float __a, int __b)
+{
+  return __a[__b];
+}
+
+/* vec_insert */
+
+static vector signed char __ATTRS_o_ai
+vec_insert(signed char __a, vector signed char __b, int __c)
+{
+  __b[__c] = __a;
+  return __b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_insert(unsigned char __a, vector unsigned char __b, int __c)
+{
+  __b[__c] = __a;
+  return __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_insert(short __a, vector short __b, int __c)
+{
+  __b[__c] = __a;
+  return __b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_insert(unsigned short __a, vector unsigned short __b, int __c)
+{
+  __b[__c] = __a;
+  return __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_insert(int __a, vector int __b, int __c)
+{
+  __b[__c] = __a;
+  return __b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_insert(unsigned int __a, vector unsigned int __b, int __c)
+{
+  __b[__c] = __a;
+  return __b;
+}
+
+static vector float __ATTRS_o_ai
+vec_insert(float __a, vector float __b, int __c)
+{
+  __b[__c] = __a;
+  return __b;
+}
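+
+/* Illustrative sketch (not part of this header's API): vec_extract and
+ * vec_insert give scalar access to individual lanes, so two lanes can be
+ * swapped without a permute.  Kept under #if 0; the helper name and the
+ * lane indices are hypothetical.
+ */
+#if 0
+static vector int
+example_swap_lanes(vector int __v)
+{
+  int __lo = vec_extract(__v, 0);   /* read lane 0  */
+  int __hi = vec_extract(__v, 3);   /* read lane 3  */
+  __v = vec_insert(__hi, __v, 0);   /* write lane 0 */
+  return vec_insert(__lo, __v, 3);  /* write lane 3 */
+}
+#endif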
+
+/* vec_lvlx */
+
+static vector signed char __ATTRS_o_ai
+vec_lvlx(int __a, const signed char *__b)
+{
+  return vec_perm(vec_ld(__a, __b),
+                  (vector signed char)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static vector signed char __ATTRS_o_ai
+vec_lvlx(int __a, const vector signed char *__b)
+{
+  return vec_perm(vec_ld(__a, __b),
+                  (vector signed char)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvlx(int __a, const unsigned char *__b)
+{
+  return vec_perm(vec_ld(__a, __b),
+                  (vector unsigned char)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvlx(int __a, const vector unsigned char *__b)
+{
+  return vec_perm(vec_ld(__a, __b),
+                  (vector unsigned char)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector bool char __ATTRS_o_ai
+vec_lvlx(int __a, const vector bool char *__b)
+{
+  return vec_perm(vec_ld(__a, __b),
+                  (vector bool char)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector short __ATTRS_o_ai
+vec_lvlx(int __a, const short *__b)
+{
+  return vec_perm(vec_ld(__a, __b),
+                  (vector short)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static vector short __ATTRS_o_ai
+vec_lvlx(int __a, const vector short *__b)
+{
+  return vec_perm(vec_ld(__a, __b),
+                  (vector short)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_lvlx(int __a, const unsigned short *__b)
+{
+  return vec_perm(vec_ld(__a, __b),
+                  (vector unsigned short)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_lvlx(int __a, const vector unsigned short *__b)
+{
+  return vec_perm(vec_ld(__a, __b),
+                  (vector unsigned short)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector bool short __ATTRS_o_ai
+vec_lvlx(int __a, const vector bool short *__b)
+{
+  return vec_perm(vec_ld(__a, __b),
+                  (vector bool short)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector pixel __ATTRS_o_ai
+vec_lvlx(int __a, const vector pixel *__b)
+{
+  return vec_perm(vec_ld(__a, __b),
+                  (vector pixel)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector int __ATTRS_o_ai
+vec_lvlx(int __a, const int *__b)
+{
+  return vec_perm(vec_ld(__a, __b),
+                  (vector int)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static vector int __ATTRS_o_ai
+vec_lvlx(int __a, const vector int *__b)
+{
+  return vec_perm(vec_ld(__a, __b),
+                  (vector int)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_lvlx(int __a, const unsigned int *__b)
+{
+  return vec_perm(vec_ld(__a, __b),
+                  (vector unsigned int)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_lvlx(int __a, const vector unsigned int *__b)
+{
+  return vec_perm(vec_ld(__a, __b),
+                  (vector unsigned int)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector bool int __ATTRS_o_ai
+vec_lvlx(int __a, const vector bool int *__b)
+{
+  return vec_perm(vec_ld(__a, __b),
+                  (vector bool int)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector float __ATTRS_o_ai
+vec_lvlx(int __a, const float *__b)
+{
+  return vec_perm(vec_ld(__a, __b),
+                  (vector float)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static vector float __ATTRS_o_ai
+vec_lvlx(int __a, const vector float *__b)
+{
+  return vec_perm(vec_ld(__a, __b),
+                  (vector float)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+/* vec_lvlxl */
+
+static vector signed char __ATTRS_o_ai
+vec_lvlxl(int __a, const signed char *__b)
+{
+  return vec_perm(vec_ldl(__a, __b),
+                  (vector signed char)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static vector signed char __ATTRS_o_ai
+vec_lvlxl(int __a, const vector signed char *__b)
+{
+  return vec_perm(vec_ldl(__a, __b),
+                  (vector signed char)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvlxl(int __a, const unsigned char *__b)
+{
+  return vec_perm(vec_ldl(__a, __b),
+                  (vector unsigned char)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvlxl(int __a, const vector unsigned char *__b)
+{
+  return vec_perm(vec_ldl(__a, __b),
+                  (vector unsigned char)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector bool char __ATTRS_o_ai
+vec_lvlxl(int __a, const vector bool char *__b)
+{
+  return vec_perm(vec_ldl(__a, __b),
+                  (vector bool char)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector short __ATTRS_o_ai
+vec_lvlxl(int __a, const short *__b)
+{
+  return vec_perm(vec_ldl(__a, __b),
+                  (vector short)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static vector short __ATTRS_o_ai
+vec_lvlxl(int __a, const vector short *__b)
+{
+  return vec_perm(vec_ldl(__a, __b),
+                  (vector short)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_lvlxl(int __a, const unsigned short *__b)
+{
+  return vec_perm(vec_ldl(__a, __b),
+                  (vector unsigned short)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_lvlxl(int __a, const vector unsigned short *__b)
+{
+  return vec_perm(vec_ldl(__a, __b),
+                  (vector unsigned short)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector bool short __ATTRS_o_ai
+vec_lvlxl(int __a, const vector bool short *__b)
+{
+  return vec_perm(vec_ldl(__a, __b),
+                  (vector bool short)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector pixel __ATTRS_o_ai
+vec_lvlxl(int __a, const vector pixel *__b)
+{
+  return vec_perm(vec_ldl(__a, __b),
+                  (vector pixel)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector int __ATTRS_o_ai
+vec_lvlxl(int __a, const int *__b)
+{
+  return vec_perm(vec_ldl(__a, __b),
+                  (vector int)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static vector int __ATTRS_o_ai
+vec_lvlxl(int __a, const vector int *__b)
+{
+  return vec_perm(vec_ldl(__a, __b),
+                  (vector int)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_lvlxl(int __a, const unsigned int *__b)
+{
+  return vec_perm(vec_ldl(__a, __b),
+                  (vector unsigned int)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_lvlxl(int __a, const vector unsigned int *__b)
+{
+  return vec_perm(vec_ldl(__a, __b),
+                  (vector unsigned int)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector bool int __ATTRS_o_ai
+vec_lvlxl(int __a, const vector bool int *__b)
+{
+  return vec_perm(vec_ldl(__a, __b),
+                  (vector bool int)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector float __ATTRS_o_ai
+vec_lvlxl(int __a, const float *__b)
+{
+  return vec_perm(vec_ldl(__a, __b),
+                  (vector float)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static vector float __ATTRS_o_ai
+vec_lvlxl(int __a, const vector float *__b)
+{
+  return vec_perm(vec_ldl(__a, __b),
+                  (vector float)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+/* vec_lvrx */
+
+static vector signed char __ATTRS_o_ai
+vec_lvrx(int __a, const signed char *__b)
+{
+  return vec_perm((vector signed char)(0),
+                  vec_ld(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static vector signed char __ATTRS_o_ai
+vec_lvrx(int __a, const vector signed char *__b)
+{
+  return vec_perm((vector signed char)(0),
+                  vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvrx(int __a, const unsigned char *__b)
+{
+  return vec_perm((vector unsigned char)(0),
+                  vec_ld(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvrx(int __a, const vector unsigned char *__b)
+{
+  return vec_perm((vector unsigned char)(0),
+                  vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector bool char __ATTRS_o_ai
+vec_lvrx(int __a, const vector bool char *__b)
+{
+  return vec_perm((vector bool char)(0),
+                  vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector short __ATTRS_o_ai
+vec_lvrx(int __a, const short *__b)
+{
+  return vec_perm((vector short)(0),
+                  vec_ld(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static vector short __ATTRS_o_ai
+vec_lvrx(int __a, const vector short *__b)
+{
+  return vec_perm((vector short)(0),
+                  vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_lvrx(int __a, const unsigned short *__b)
+{
+  return vec_perm((vector unsigned short)(0),
+                  vec_ld(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_lvrx(int __a, const vector unsigned short *__b)
+{
+  return vec_perm((vector unsigned short)(0),
+                  vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector bool short __ATTRS_o_ai
+vec_lvrx(int __a, const vector bool short *__b)
+{
+  return vec_perm((vector bool short)(0),
+                  vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector pixel __ATTRS_o_ai
+vec_lvrx(int __a, const vector pixel *__b)
+{
+  return vec_perm((vector pixel)(0),
+                  vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector int __ATTRS_o_ai
+vec_lvrx(int __a, const int *__b)
+{
+  return vec_perm((vector int)(0),
+                  vec_ld(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static vector int __ATTRS_o_ai
+vec_lvrx(int __a, const vector int *__b)
+{
+  return vec_perm((vector int)(0),
+                  vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_lvrx(int __a, const unsigned int *__b)
+{
+  return vec_perm((vector unsigned int)(0),
+                  vec_ld(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_lvrx(int __a, const vector unsigned int *__b)
+{
+  return vec_perm((vector unsigned int)(0),
+                  vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector bool int __ATTRS_o_ai
+vec_lvrx(int __a, const vector bool int *__b)
+{
+  return vec_perm((vector bool int)(0),
+                  vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector float __ATTRS_o_ai
+vec_lvrx(int __a, const float *__b)
+{
+  return vec_perm((vector float)(0),
+                  vec_ld(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static vector float __ATTRS_o_ai
+vec_lvrx(int __a, const vector float *__b)
+{
+  return vec_perm((vector float)(0),
+                  vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
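+
+/* Illustrative sketch (not part of this header's API): vec_lvlx and vec_lvrx
+ * each return the bytes of one aligned quadword shifted so that, OR-ed
+ * together, they reconstruct 16 bytes starting at an arbitrary address.
+ * Kept under #if 0; the helper name is hypothetical.
+ */
+#if 0
+static vector unsigned char
+example_load_unaligned(const unsigned char *__p)
+{
+  /* Left piece from the quadword containing __p, right piece from the next. */
+  return vec_or(vec_lvlx(0, __p), vec_lvrx(16, __p));
+}
+#endif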
+
+/* vec_lvrxl */
+
+static vector signed char __ATTRS_o_ai
+vec_lvrxl(int __a, const signed char *__b)
+{
+  return vec_perm((vector signed char)(0),
+                  vec_ldl(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static vector signed char __ATTRS_o_ai
+vec_lvrxl(int __a, const vector signed char *__b)
+{
+  return vec_perm((vector signed char)(0),
+                  vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvrxl(int __a, const unsigned char *__b)
+{
+  return vec_perm((vector unsigned char)(0),
+                  vec_ldl(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvrxl(int __a, const vector unsigned char *__b)
+{
+  return vec_perm((vector unsigned char)(0),
+                  vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector bool char __ATTRS_o_ai
+vec_lvrxl(int __a, const vector bool char *__b)
+{
+  return vec_perm((vector bool char)(0),
+                  vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector short __ATTRS_o_ai
+vec_lvrxl(int __a, const short *__b)
+{
+  return vec_perm((vector short)(0),
+                  vec_ldl(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static vector short __ATTRS_o_ai
+vec_lvrxl(int __a, const vector short *__b)
+{
+  return vec_perm((vector short)(0),
+                  vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_lvrxl(int __a, const unsigned short *__b)
+{
+  return vec_perm((vector unsigned short)(0),
+                  vec_ldl(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_lvrxl(int __a, const vector unsigned short *__b)
+{
+  return vec_perm((vector unsigned short)(0),
+                  vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector bool short __ATTRS_o_ai
+vec_lvrxl(int __a, const vector bool short *__b)
+{
+  return vec_perm((vector bool short)(0),
+                  vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector pixel __ATTRS_o_ai
+vec_lvrxl(int __a, const vector pixel *__b)
+{
+  return vec_perm((vector pixel)(0),
+                  vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector int __ATTRS_o_ai
+vec_lvrxl(int __a, const int *__b)
+{
+  return vec_perm((vector int)(0),
+                  vec_ldl(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static vector int __ATTRS_o_ai
+vec_lvrxl(int __a, const vector int *__b)
+{
+  return vec_perm((vector int)(0),
+                  vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_lvrxl(int __a, const unsigned int *__b)
+{
+  return vec_perm((vector unsigned int)(0),
+                  vec_ldl(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_lvrxl(int __a, const vector unsigned int *__b)
+{
+  return vec_perm((vector unsigned int)(0),
+                  vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector bool int __ATTRS_o_ai
+vec_lvrxl(int __a, const vector bool int *__b)
+{
+  return vec_perm((vector bool int)(0),
+                  vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector float __ATTRS_o_ai
+vec_lvrxl(int __a, const float *__b)
+{
+  return vec_perm((vector float)(0),
+                  vec_ldl(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static vector float __ATTRS_o_ai
+vec_lvrxl(int __a, const vector float *__b)
+{
+  return vec_perm((vector float)(0),
+                  vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+/* vec_stvlx */
+
+static void __ATTRS_o_ai
+vec_stvlx(vector signed char __a, int __b, signed char *__c)
+{
+  return vec_st(vec_perm(vec_lvrx(__b, __c),
+                         __a,
+                         vec_lvsr(__b, __c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlx(vector signed char __a, int __b, vector signed char *__c)
+{
+  return vec_st(vec_perm(vec_lvrx(__b, __c),
+                         __a,
+                         vec_lvsr(__b, (unsigned char *)__c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlx(vector unsigned char __a, int __b, unsigned char *__c)
+{
+  return vec_st(vec_perm(vec_lvrx(__b, __c),
+                         __a,
+                         vec_lvsr(__b, __c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlx(vector unsigned char __a, int __b, vector unsigned char *__c)
+{
+  return vec_st(vec_perm(vec_lvrx(__b, __c),
+                         __a,
+                         vec_lvsr(__b, (unsigned char *)__c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlx(vector bool char __a, int __b, vector bool char *__c)
+{
+  return vec_st(vec_perm(vec_lvrx(__b, __c),
+                         __a,
+                         vec_lvsr(__b, (unsigned char *)__c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlx(vector short __a, int __b, short *__c)
+{
+  return vec_st(vec_perm(vec_lvrx(__b, __c),
+                         __a,
+                         vec_lvsr(__b, __c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlx(vector short __a, int __b, vector short *__c)
+{
+  return vec_st(vec_perm(vec_lvrx(__b, __c),
+                         __a,
+                         vec_lvsr(__b, (unsigned char *)__c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlx(vector unsigned short __a, int __b, unsigned short *__c)
+{
+  return vec_st(vec_perm(vec_lvrx(__b, __c),
+                         __a,
+                         vec_lvsr(__b, __c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlx(vector unsigned short __a, int __b, vector unsigned short *__c)
+{
+  return vec_st(vec_perm(vec_lvrx(__b, __c),
+                         __a,
+                         vec_lvsr(__b, (unsigned char *)__c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlx(vector bool short __a, int __b, vector bool short *__c)
+{
+  return vec_st(vec_perm(vec_lvrx(__b, __c),
+                         __a,
+                         vec_lvsr(__b, (unsigned char *)__c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlx(vector pixel __a, int __b, vector pixel *__c)
+{
+  return vec_st(vec_perm(vec_lvrx(__b, __c),
+                         __a,
+                         vec_lvsr(__b, (unsigned char *)__c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlx(vector int __a, int __b, int *__c)
+{
+  return vec_st(vec_perm(vec_lvrx(__b, __c),
+                         __a,
+                         vec_lvsr(__b, __c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlx(vector int __a, int __b, vector int *__c)
+{
+  return vec_st(vec_perm(vec_lvrx(__b, __c),
+                         __a,
+                         vec_lvsr(__b, (unsigned char *)__c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlx(vector unsigned int __a, int __b, unsigned int *__c)
+{
+  return vec_st(vec_perm(vec_lvrx(__b, __c),
+                         __a,
+                         vec_lvsr(__b, __c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlx(vector unsigned int __a, int __b, vector unsigned int *__c)
+{
+  return vec_st(vec_perm(vec_lvrx(__b, __c),
+                         __a,
+                         vec_lvsr(__b, (unsigned char *)__c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlx(vector bool int __a, int __b, vector bool int *__c)
+{
+  return vec_st(vec_perm(vec_lvrx(__b, __c),
+                         __a,
+                         vec_lvsr(__b, (unsigned char *)__c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlx(vector float __a, int __b, vector float *__c)
+{
+  return vec_st(vec_perm(vec_lvrx(__b, __c),
+                         __a,
+                         vec_lvsr(__b, (unsigned char *)__c)),
+                __b, __c);
+}
+
+/* vec_stvlxl */
+
+static void __ATTRS_o_ai
+vec_stvlxl(vector signed char __a, int __b, signed char *__c)
+{
+  return vec_stl(vec_perm(vec_lvrx(__b, __c),
+                          __a,
+                          vec_lvsr(__b, __c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlxl(vector signed char __a, int __b, vector signed char *__c)
+{
+  return vec_stl(vec_perm(vec_lvrx(__b, __c),
+                          __a,
+                          vec_lvsr(__b, (unsigned char *)__c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlxl(vector unsigned char __a, int __b, unsigned char *__c)
+{
+  return vec_stl(vec_perm(vec_lvrx(__b, __c),
+                          __a,
+                          vec_lvsr(__b, __c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlxl(vector unsigned char __a, int __b, vector unsigned char *__c)
+{
+  return vec_stl(vec_perm(vec_lvrx(__b, __c),
+                          __a,
+                          vec_lvsr(__b, (unsigned char *)__c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlxl(vector bool char __a, int __b, vector bool char *__c)
+{
+  return vec_stl(vec_perm(vec_lvrx(__b, __c),
+                          __a,
+                          vec_lvsr(__b, (unsigned char *)__c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlxl(vector short __a, int __b, short *__c)
+{
+  return vec_stl(vec_perm(vec_lvrx(__b, __c),
+                          __a,
+                          vec_lvsr(__b, __c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlxl(vector short __a, int __b, vector short *__c)
+{
+  return vec_stl(vec_perm(vec_lvrx(__b, __c),
+                          __a,
+                          vec_lvsr(__b, (unsigned char *)__c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlxl(vector unsigned short __a, int __b, unsigned short *__c)
+{
+  return vec_stl(vec_perm(vec_lvrx(__b, __c),
+                          __a,
+                          vec_lvsr(__b, __c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlxl(vector unsigned short __a, int __b, vector unsigned short *__c)
+{
+  return vec_stl(vec_perm(vec_lvrx(__b, __c),
+                          __a,
+                          vec_lvsr(__b, (unsigned char *)__c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlxl(vector bool short __a, int __b, vector bool short *__c)
+{
+  return vec_stl(vec_perm(vec_lvrx(__b, __c),
+                          __a,
+                          vec_lvsr(__b, (unsigned char *)__c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlxl(vector pixel __a, int __b, vector pixel *__c)
+{
+  return vec_stl(vec_perm(vec_lvrx(__b, __c),
+                          __a,
+                          vec_lvsr(__b, (unsigned char *)__c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlxl(vector int __a, int __b, int *__c)
+{
+  return vec_stl(vec_perm(vec_lvrx(__b, __c),
+                          __a,
+                          vec_lvsr(__b, __c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlxl(vector int __a, int __b, vector int *__c)
+{
+  return vec_stl(vec_perm(vec_lvrx(__b, __c),
+                          __a,
+                          vec_lvsr(__b, (unsigned char *)__c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlxl(vector unsigned int __a, int __b, unsigned int *__c)
+{
+  return vec_stl(vec_perm(vec_lvrx(__b, __c),
+                          __a,
+                          vec_lvsr(__b, __c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlxl(vector unsigned int __a, int __b, vector unsigned int *__c)
+{
+  return vec_stl(vec_perm(vec_lvrx(__b, __c),
+                          __a,
+                          vec_lvsr(__b, (unsigned char *)__c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlxl(vector bool int __a, int __b, vector bool int *__c)
+{
+  return vec_stl(vec_perm(vec_lvrx(__b, __c),
+                          __a,
+                          vec_lvsr(__b, (unsigned char *)__c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlxl(vector float __a, int __b, vector float *__c)
+{
+  return vec_stl(vec_perm(vec_lvrx(__b, __c),
+                          __a,
+                          vec_lvsr(__b, (unsigned char *)__c)),
+                 __b, __c);
+}
+
+/* vec_stvrx */
+
+static void __ATTRS_o_ai
+vec_stvrx(vector signed char __a, int __b, signed char *__c)
+{
+  return vec_st(vec_perm(__a,
+                         vec_lvlx(__b, __c),
+                         vec_lvsr(__b, __c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrx(vector signed char __a, int __b, vector signed char *__c)
+{
+  return vec_st(vec_perm(__a,
+                         vec_lvlx(__b, __c),
+                         vec_lvsr(__b, (unsigned char *)__c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrx(vector unsigned char __a, int __b, unsigned char *__c)
+{
+  return vec_st(vec_perm(__a,
+                         vec_lvlx(__b, __c),
+                         vec_lvsr(__b, __c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrx(vector unsigned char __a, int __b, vector unsigned char *__c)
+{
+  return vec_st(vec_perm(__a,
+                         vec_lvlx(__b, __c),
+                         vec_lvsr(__b, (unsigned char *)__c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrx(vector bool char __a, int __b, vector bool char *__c)
+{
+  return vec_st(vec_perm(__a,
+                         vec_lvlx(__b, __c),
+                         vec_lvsr(__b, (unsigned char *)__c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrx(vector short __a, int __b, short *__c)
+{
+  return vec_st(vec_perm(__a,
+                         vec_lvlx(__b, __c),
+                         vec_lvsr(__b, __c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrx(vector short __a, int __b, vector short *__c)
+{
+  return vec_st(vec_perm(__a,
+                         vec_lvlx(__b, __c),
+                         vec_lvsr(__b, (unsigned char *)__c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrx(vector unsigned short __a, int __b, unsigned short *__c)
+{
+  return vec_st(vec_perm(__a,
+                         vec_lvlx(__b, __c),
+                         vec_lvsr(__b, __c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrx(vector unsigned short __a, int __b, vector unsigned short *__c)
+{
+  return vec_st(vec_perm(__a,
+                         vec_lvlx(__b, __c),
+                         vec_lvsr(__b, (unsigned char *)__c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrx(vector bool short __a, int __b, vector bool short *__c)
+{
+  return vec_st(vec_perm(__a,
+                         vec_lvlx(__b, __c),
+                         vec_lvsr(__b, (unsigned char *)__c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrx(vector pixel __a, int __b, vector pixel *__c)
+{
+  return vec_st(vec_perm(__a,
+                         vec_lvlx(__b, __c),
+                         vec_lvsr(__b, (unsigned char *)__c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrx(vector int __a, int __b, int *__c)
+{
+  return vec_st(vec_perm(__a,
+                         vec_lvlx(__b, __c),
+                         vec_lvsr(__b, __c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrx(vector int __a, int __b, vector int *__c)
+{
+  return vec_st(vec_perm(__a,
+                         vec_lvlx(__b, __c),
+                         vec_lvsr(__b, (unsigned char *)__c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrx(vector unsigned int __a, int __b, unsigned int *__c)
+{
+  return vec_st(vec_perm(__a,
+                         vec_lvlx(__b, __c),
+                         vec_lvsr(__b, __c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrx(vector unsigned int __a, int __b, vector unsigned int *__c)
+{
+  return vec_st(vec_perm(__a,
+                         vec_lvlx(__b, __c),
+                         vec_lvsr(__b, (unsigned char *)__c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrx(vector bool int __a, int __b, vector bool int *__c)
+{
+  return vec_st(vec_perm(__a,
+                         vec_lvlx(__b, __c),
+                         vec_lvsr(__b, (unsigned char *)__c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrx(vector float __a, int __b, vector float *__c)
+{
+  return vec_st(vec_perm(__a,
+                         vec_lvlx(__b, __c),
+                         vec_lvsr(__b, (unsigned char *)__c)),
+                __b, __c);
+}
+
+/* vec_stvrxl */
+
+static void __ATTRS_o_ai
+vec_stvrxl(vector signed char __a, int __b, signed char *__c)
+{
+  return vec_stl(vec_perm(__a,
+                          vec_lvlx(__b, __c),
+                          vec_lvsr(__b, __c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrxl(vector signed char __a, int __b, vector signed char *__c)
+{
+  return vec_stl(vec_perm(__a,
+                          vec_lvlx(__b, __c),
+                          vec_lvsr(__b, (unsigned char *)__c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrxl(vector unsigned char __a, int __b, unsigned char *__c)
+{
+  return vec_stl(vec_perm(__a,
+                          vec_lvlx(__b, __c),
+                          vec_lvsr(__b, __c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrxl(vector unsigned char __a, int __b, vector unsigned char *__c)
+{
+  return vec_stl(vec_perm(__a,
+                          vec_lvlx(__b, __c),
+                          vec_lvsr(__b, (unsigned char *)__c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrxl(vector bool char __a, int __b, vector bool char *__c)
+{
+  return vec_stl(vec_perm(__a,
+                          vec_lvlx(__b, __c),
+                          vec_lvsr(__b, (unsigned char *)__c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrxl(vector short __a, int __b, short *__c)
+{
+  return vec_stl(vec_perm(__a,
+                          vec_lvlx(__b, __c),
+                          vec_lvsr(__b, __c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrxl(vector short __a, int __b, vector short *__c)
+{
+  return vec_stl(vec_perm(__a,
+                          vec_lvlx(__b, __c),
+                          vec_lvsr(__b, (unsigned char *)__c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrxl(vector unsigned short __a, int __b, unsigned short *__c)
+{
+  return vec_stl(vec_perm(__a,
+                          vec_lvlx(__b, __c),
+                          vec_lvsr(__b, __c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrxl(vector unsigned short __a, int __b, vector unsigned short *__c)
+{
+  return vec_stl(vec_perm(__a,
+                          vec_lvlx(__b, __c),
+                          vec_lvsr(__b, (unsigned char *)__c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrxl(vector bool short __a, int __b, vector bool short *__c)
+{
+  return vec_stl(vec_perm(__a,
+                          vec_lvlx(__b, __c),
+                          vec_lvsr(__b, (unsigned char *)__c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrxl(vector pixel __a, int __b, vector pixel *__c)
+{
+  return vec_stl(vec_perm(__a,
+                          vec_lvlx(__b, __c),
+                          vec_lvsr(__b, (unsigned char *)__c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrxl(vector int __a, int __b, int *__c)
+{
+  return vec_stl(vec_perm(__a,
+                          vec_lvlx(__b, __c),
+                          vec_lvsr(__b, __c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrxl(vector int __a, int __b, vector int *__c)
+{
+  return vec_stl(vec_perm(__a,
+                          vec_lvlx(__b, __c),
+                          vec_lvsr(__b, (unsigned char *)__c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrxl(vector unsigned int __a, int __b, unsigned int *__c)
+{
+  return vec_stl(vec_perm(__a,
+                          vec_lvlx(__b, __c),
+                          vec_lvsr(__b, __c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrxl(vector unsigned int __a, int __b, vector unsigned int *__c)
+{
+  return vec_stl(vec_perm(__a,
+                          vec_lvlx(__b, __c),
+                          vec_lvsr(__b, (unsigned char *)__c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrxl(vector bool int __a, int __b, vector bool int *__c)
+{
+  return vec_stl(vec_perm(__a,
+                          vec_lvlx(__b, __c),
+                          vec_lvsr(__b, (unsigned char *)__c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrxl(vector float __a, int __b, vector float *__c)
+{
+  return vec_stl(vec_perm(__a,
+                          vec_lvlx(__b, __c),
+                          vec_lvsr(__b, (unsigned char *)__c)),
+                 __b, __c);
+}
+
+/* vec_promote */
+
+static vector signed char __ATTRS_o_ai
+vec_promote(signed char __a, int __b)
+{
+  vector signed char __res = (vector signed char)(0);
+  __res[__b] = __a;
+  return __res;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_promote(unsigned char __a, int __b)
+{
+  vector unsigned char __res = (vector unsigned char)(0);
+  __res[__b] = __a;
+  return __res;
+}
+
+static vector short __ATTRS_o_ai
+vec_promote(short __a, int __b)
+{
+  vector short __res = (vector short)(0);
+  __res[__b] = __a;
+  return __res;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_promote(unsigned short __a, int __b)
+{
+  vector unsigned short __res = (vector unsigned short)(0);
+  __res[__b] = __a;
+  return __res;
+}
+
+static vector int __ATTRS_o_ai
+vec_promote(int __a, int __b)
+{
+  vector int __res = (vector int)(0);
+  __res[__b] = __a;
+  return __res;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_promote(unsigned int __a, int __b)
+{
+  vector unsigned int __res = (vector unsigned int)(0);
+  __res[__b] = __a;
+  return __res;
+}
+
+static vector float __ATTRS_o_ai
+vec_promote(float __a, int __b)
+{
+  vector float __res = (vector float)(0);
+  __res[__b] = __a;
+  return __res;
+}
+
+/* vec_splats */
+
+static vector signed char __ATTRS_o_ai
+vec_splats(signed char __a)
+{
+  return (vector signed char)(__a);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_splats(unsigned char __a)
+{
+  return (vector unsigned char)(__a);
+}
+
+static vector short __ATTRS_o_ai
+vec_splats(short __a)
+{
+  return (vector short)(__a);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_splats(unsigned short __a)
+{
+  return (vector unsigned short)(__a);
+}
+
+static vector int __ATTRS_o_ai
+vec_splats(int __a)
+{
+  return (vector int)(__a);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_splats(unsigned int __a)
+{
+  return (vector unsigned int)(__a);
+}
+
+static vector float __ATTRS_o_ai
+vec_splats(float __a)
+{
+  return (vector float)(__a);
+}
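+
+/* Illustrative sketch (not part of this header's API): vec_splats copies a
+ * scalar into every lane, whereas vec_promote above places it into a single
+ * lane of an otherwise zero vector.  Kept under #if 0; the helper name and
+ * the chosen lane are hypothetical.
+ */
+#if 0
+static vector int
+example_splat_vs_promote(int __s)
+{
+  vector int __all = vec_splats(__s);     /* {s, s, s, s} */
+  vector int __one = vec_promote(__s, 2); /* {0, 0, s, 0} */
+  return vec_add(__all, __one);
+}
+#endif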
+
+/* ----------------------------- predicates --------------------------------- */
+
+/* vec_all_eq */
+
+static int __ATTRS_o_ai
+vec_all_eq(vector signed char __a, vector signed char __b)
+{
+  return __builtin_altivec_vcmpequb_p(__CR6_LT, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector signed char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpequb_p(__CR6_LT, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector unsigned char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vcmpequb_p(__CR6_LT, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector unsigned char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpequb_p(__CR6_LT, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector bool char __a, vector signed char __b)
+{
+  return __builtin_altivec_vcmpequb_p(__CR6_LT, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector bool char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vcmpequb_p(__CR6_LT, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector bool char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpequb_p(__CR6_LT, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector short __a, vector short __b)
+{
+  return __builtin_altivec_vcmpequh_p(__CR6_LT, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpequh_p(__CR6_LT, __a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector unsigned short __a, vector unsigned short __b)
+{
+  return
+    __builtin_altivec_vcmpequh_p(__CR6_LT, (vector short)__a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector unsigned short __a, vector bool short __b)
+{
+  return
+    __builtin_altivec_vcmpequh_p(__CR6_LT, (vector short)__a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector bool short __a, vector short __b)
+{
+  return
+    __builtin_altivec_vcmpequh_p(__CR6_LT, (vector short)__a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector bool short __a, vector unsigned short __b)
+{
+  return
+    __builtin_altivec_vcmpequh_p(__CR6_LT, (vector short)__a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector bool short __a, vector bool short __b)
+{
+  return
+    __builtin_altivec_vcmpequh_p(__CR6_LT, (vector short)__a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector pixel __a, vector pixel __b)
+{
+  return
+    __builtin_altivec_vcmpequh_p(__CR6_LT, (vector short)__a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector int __a, vector int __b)
+{
+  return __builtin_altivec_vcmpequw_p(__CR6_LT, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpequw_p(__CR6_LT, __a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vcmpequw_p(__CR6_LT, (vector int)__a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector unsigned int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpequw_p(__CR6_LT, (vector int)__a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector bool int __a, vector int __b)
+{
+  return __builtin_altivec_vcmpequw_p(__CR6_LT, (vector int)__a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector bool int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vcmpequw_p(__CR6_LT, (vector int)__a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector bool int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpequw_p(__CR6_LT, (vector int)__a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpeqfp_p(__CR6_LT, __a, __b);
+}
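+
+/* Illustrative sketch (not part of this header's API): the vec_all_*
+ * predicates return a scalar int, so they can drive ordinary control flow.
+ * Kept under #if 0; the helper name is hypothetical.
+ */
+#if 0
+static int
+example_vectors_identical(vector unsigned char __x, vector unsigned char __y)
+{
+  /* Non-zero only when every byte lane compares equal. */
+  return vec_all_eq(__x, __y);
+}
+#endif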
+
+/* vec_all_ge */
+
+static int __ATTRS_o_ai
+vec_all_ge(vector signed char __a, vector signed char __b)
+{
+  return __builtin_altivec_vcmpgtsb_p(__CR6_EQ, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector signed char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpgtsb_p(__CR6_EQ, (vector signed char)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector unsigned char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector unsigned char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ, (vector unsigned char)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector bool char __a, vector signed char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ,
+                                      (vector unsigned char)__b,
+                                      (vector unsigned char)__a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector bool char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ, __b, (vector unsigned char)__a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector bool char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ,
+                                      (vector unsigned char)__b,
+                                      (vector unsigned char)__a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector short __a, vector short __b)
+{
+  return __builtin_altivec_vcmpgtsh_p(__CR6_EQ, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpgtsh_p(__CR6_EQ, (vector short)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector unsigned short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector unsigned short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ, (vector unsigned short)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector bool short __a, vector short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ,
+                                      (vector unsigned short)__b,
+                                      (vector unsigned short)__a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector bool short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ, __b, (vector unsigned short)__a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector bool short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ,
+                                      (vector unsigned short)__b,
+                                      (vector unsigned short)__a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector int __a, vector int __b)
+{
+  return __builtin_altivec_vcmpgtsw_p(__CR6_EQ, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtsw_p(__CR6_EQ, (vector int)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector unsigned int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ, (vector unsigned int)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector bool int __a, vector int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ,
+                                      (vector unsigned int)__b,
+                                      (vector unsigned int)__a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector bool int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ, __b, (vector unsigned int)__a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector bool int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ,
+                                      (vector unsigned int)__b,
+                                      (vector unsigned int)__a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpgefp_p(__CR6_LT, __a, __b);
+}
+
+/* vec_all_gt */
+
+static int __ATTRS_o_ai
+vec_all_gt(vector signed char __a, vector signed char __b)
+{
+  return __builtin_altivec_vcmpgtsb_p(__CR6_LT, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector signed char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpgtsb_p(__CR6_LT, __a, (vector signed char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector unsigned char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector unsigned char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT, __a, (vector unsigned char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector bool char __a, vector signed char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT,
+                                      (vector unsigned char)__a,
+                                      (vector unsigned char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector bool char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT, (vector unsigned char)__a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector bool char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT,
+                                      (vector unsigned char)__a,
+                                      (vector unsigned char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector short __a, vector short __b)
+{
+  return __builtin_altivec_vcmpgtsh_p(__CR6_LT, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpgtsh_p(__CR6_LT, __a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector unsigned short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector unsigned short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT, __a, (vector unsigned short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector bool short __a, vector short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT,
+                                      (vector unsigned short)__a,
+                                      (vector unsigned short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector bool short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT, (vector unsigned short)__a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector bool short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT,
+                                      (vector unsigned short)__a,
+                                      (vector unsigned short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector int __a, vector int __b)
+{
+  return __builtin_altivec_vcmpgtsw_p(__CR6_LT, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtsw_p(__CR6_LT, __a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector unsigned int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT, __a, (vector unsigned int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector bool int __a, vector int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT,
+                                      (vector unsigned int)__a,
+                                      (vector unsigned int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector bool int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT, (vector unsigned int)__a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector bool int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT,
+                                      (vector unsigned int)__a,
+                                      (vector unsigned int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpgtfp_p(__CR6_LT, __a, __b);
+}
+
+/* vec_all_in */
+
+static int __attribute__((__always_inline__))
+vec_all_in(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpbfp_p(__CR6_EQ, __a, __b);
+}
+
+/* vec_all_le */
+
+static int __ATTRS_o_ai
+vec_all_le(vector signed char __a, vector signed char __b)
+{
+  return __builtin_altivec_vcmpgtsb_p(__CR6_EQ, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector signed char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpgtsb_p(__CR6_EQ, __a, (vector signed char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector unsigned char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector unsigned char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ, __a, (vector unsigned char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector bool char __a, vector signed char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ,
+                                      (vector unsigned char)__a,
+                                      (vector unsigned char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector bool char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ, (vector unsigned char)__a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector bool char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ,
+                                      (vector unsigned char)__a,
+                                      (vector unsigned char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector short __a, vector short __b)
+{
+  return __builtin_altivec_vcmpgtsh_p(__CR6_EQ, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpgtsh_p(__CR6_EQ, __a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector unsigned short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector unsigned short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ, __a, (vector unsigned short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector bool short __a, vector short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ,
+                                      (vector unsigned short)__a,
+                                      (vector unsigned short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector bool short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ, (vector unsigned short)__a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector bool short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ,
+                                      (vector unsigned short)__a,
+                                      (vector unsigned short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector int __a, vector int __b)
+{
+  return __builtin_altivec_vcmpgtsw_p(__CR6_EQ, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtsw_p(__CR6_EQ, __a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector unsigned int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ, __a, (vector unsigned int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector bool int __a, vector int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ,
+                                      (vector unsigned int)__a,
+                                      (vector unsigned int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector bool int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ, (vector unsigned int)__a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector bool int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ,
+                                      (vector unsigned int)__a,
+                                      (vector unsigned int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpgefp_p(__CR6_LT, __b, __a);
+}
+
+/* vec_all_lt */
+
+static int __ATTRS_o_ai
+vec_all_lt(vector signed char __a, vector signed char __b)
+{
+  return __builtin_altivec_vcmpgtsb_p(__CR6_LT, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector signed char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpgtsb_p(__CR6_LT, (vector signed char)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector unsigned char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector unsigned char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT, (vector unsigned char)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector bool char __a, vector signed char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT,
+                                      (vector unsigned char)__b,
+                                      (vector unsigned char)__a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector bool char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT, __b, (vector unsigned char)__a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector bool char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT,
+                                      (vector unsigned char)__b,
+                                      (vector unsigned char)__a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector short __a, vector short __b)
+{
+  return __builtin_altivec_vcmpgtsh_p(__CR6_LT, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpgtsh_p(__CR6_LT, (vector short)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector unsigned short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector unsigned short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT, (vector unsigned short)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector bool short __a, vector short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT,
+                                      (vector unsigned short)__b,
+                                      (vector unsigned short)__a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector bool short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT, __b, (vector unsigned short)__a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector bool short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT,
+                                      (vector unsigned short)__b,
+                                      (vector unsigned short)__a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector int __a, vector int __b)
+{
+  return __builtin_altivec_vcmpgtsw_p(__CR6_LT, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtsw_p(__CR6_LT, (vector int)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector unsigned int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT, (vector unsigned int)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector bool int __a, vector int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT,
+                                      (vector unsigned int)__b,
+                                      (vector unsigned int)__a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector bool int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT, __b, (vector unsigned int)__a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector bool int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT,
+                                      (vector unsigned int)__b,
+                                      (vector unsigned int)__a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpgtfp_p(__CR6_LT, __b, __a);
+}
+
+/* vec_all_nan */
+
+static int __attribute__((__always_inline__))
+vec_all_nan(vector float __a)
+{
+  return __builtin_altivec_vcmpeqfp_p(__CR6_EQ, __a, __a);
+}
+
+/* vec_all_ne */
+
+static int __ATTRS_o_ai
+vec_all_ne(vector signed char __a, vector signed char __b)
+{
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector signed char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector unsigned char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector unsigned char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector bool char __a, vector signed char __b)
+{
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector bool char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector bool char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector short __a, vector short __b)
+{
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ, __a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector unsigned short __a, vector unsigned short __b)
+{
+  return
+    __builtin_altivec_vcmpequh_p(__CR6_EQ, (vector short)__a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector unsigned short __a, vector bool short __b)
+{
+  return
+    __builtin_altivec_vcmpequh_p(__CR6_EQ, (vector short)__a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector bool short __a, vector short __b)
+{
+  return
+    __builtin_altivec_vcmpequh_p(__CR6_EQ, (vector short)__a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector bool short __a, vector unsigned short __b)
+{
+  return
+    __builtin_altivec_vcmpequh_p(__CR6_EQ, (vector short)__a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector bool short __a, vector bool short __b)
+{
+  return
+    __builtin_altivec_vcmpequh_p(__CR6_EQ, (vector short)__a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector pixel __a, vector pixel __b)
+{
+  return
+    __builtin_altivec_vcmpequh_p(__CR6_EQ, (vector short)__a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector int __a, vector int __b)
+{
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ, __a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ, (vector int)__a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector unsigned int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ, (vector int)__a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector bool int __a, vector int __b)
+{
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ, (vector int)__a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector bool int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ, (vector int)__a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector bool int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ, (vector int)__a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpeqfp_p(__CR6_EQ, __a, __b);
+}
+
+/* vec_all_nge */
+
+static int __attribute__((__always_inline__))
+vec_all_nge(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpgefp_p(__CR6_EQ, __a, __b);
+}
+
+/* vec_all_ngt */
+
+static int __attribute__((__always_inline__))
+vec_all_ngt(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpgtfp_p(__CR6_EQ, __a, __b);
+}
+
+/* vec_all_nle */
+
+static int __attribute__((__always_inline__))
+vec_all_nle(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpgefp_p(__CR6_EQ, __b, __a);
+}
+
+/* vec_all_nlt */
+
+static int __attribute__((__always_inline__))
+vec_all_nlt(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpgtfp_p(__CR6_EQ, __b, __a);
+}
+
+/* vec_all_numeric */
+
+static int __attribute__((__always_inline__))
+vec_all_numeric(vector float __a)
+{
+  return __builtin_altivec_vcmpeqfp_p(__CR6_LT, __a, __a);
+}
+
+/* vec_any_eq */
+
+static int __ATTRS_o_ai
+vec_any_eq(vector signed char __a, vector signed char __b)
+{
+  return
+    __builtin_altivec_vcmpequb_p(__CR6_EQ_REV, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector signed char __a, vector bool char __b)
+{
+  return
+    __builtin_altivec_vcmpequb_p(__CR6_EQ_REV, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector unsigned char __a, vector unsigned char __b)
+{
+  return
+    __builtin_altivec_vcmpequb_p(__CR6_EQ_REV, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector unsigned char __a, vector bool char __b)
+{
+  return
+    __builtin_altivec_vcmpequb_p(__CR6_EQ_REV, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector bool char __a, vector signed char __b)
+{
+  return
+    __builtin_altivec_vcmpequb_p(__CR6_EQ_REV, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector bool char __a, vector unsigned char __b)
+{
+  return
+    __builtin_altivec_vcmpequb_p(__CR6_EQ_REV, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector bool char __a, vector bool char __b)
+{
+  return
+    __builtin_altivec_vcmpequb_p(__CR6_EQ_REV, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector short __a, vector short __b)
+{
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ_REV, __a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector unsigned short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ_REV, 
+                                      (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector unsigned short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ_REV, 
+                                      (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector bool short __a, vector short __b)
+{
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ_REV,
+                                      (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector bool short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ_REV,
+                                      (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector bool short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ_REV,
+                                      (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector pixel __a, vector pixel __b)
+{
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ_REV, 
+                                      (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector int __a, vector int __b)
+{
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ_REV, __a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector unsigned int __a, vector unsigned int __b)
+{
+  return
+    __builtin_altivec_vcmpequw_p(__CR6_EQ_REV, (vector int)__a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector unsigned int __a, vector bool int __b)
+{
+  return
+    __builtin_altivec_vcmpequw_p(__CR6_EQ_REV, (vector int)__a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector bool int __a, vector int __b)
+{
+  return
+    __builtin_altivec_vcmpequw_p(__CR6_EQ_REV, (vector int)__a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector bool int __a, vector unsigned int __b)
+{
+  return
+    __builtin_altivec_vcmpequw_p(__CR6_EQ_REV, (vector int)__a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector bool int __a, vector bool int __b)
+{
+  return
+    __builtin_altivec_vcmpequw_p(__CR6_EQ_REV, (vector int)__a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpeqfp_p(__CR6_EQ_REV, __a, __b);
+}
+
+/* vec_any_ge */
+
+static int __ATTRS_o_ai
+vec_any_ge(vector signed char __a, vector signed char __b)
+{
+  return __builtin_altivec_vcmpgtsb_p(__CR6_LT_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector signed char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpgtsb_p(__CR6_LT_REV, (vector signed char)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector unsigned char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector unsigned char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV, (vector unsigned char)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector bool char __a, vector signed char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV,
+                                      (vector unsigned char)__b,
+                                      (vector unsigned char)__a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector bool char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV, __b, (vector unsigned char)__a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector bool char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV,
+                                      (vector unsigned char)__b,
+                                      (vector unsigned char)__a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector short __a, vector short __b)
+{
+  return __builtin_altivec_vcmpgtsh_p(__CR6_LT_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpgtsh_p(__CR6_LT_REV, (vector short)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector unsigned short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector unsigned short __a, vector bool short __b)
+{
+  return
+    __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV, (vector unsigned short)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector bool short __a, vector short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV,
+                                      (vector unsigned short)__b,
+                                      (vector unsigned short)__a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector bool short __a, vector unsigned short __b)
+{
+  return 
+    __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV, __b, (vector unsigned short)__a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector bool short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV,
+                                      (vector unsigned short)__b,
+                                      (vector unsigned short)__a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector int __a, vector int __b)
+{
+  return __builtin_altivec_vcmpgtsw_p(__CR6_LT_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtsw_p(__CR6_LT_REV, (vector int)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector unsigned int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV, (vector unsigned int)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector bool int __a, vector int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV,
+                                      (vector unsigned int)__b,
+                                      (vector unsigned int)__a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector bool int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV, __b, (vector unsigned int)__a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector bool int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV,
+                                      (vector unsigned int)__b,
+                                      (vector unsigned int)__a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpgefp_p(__CR6_EQ_REV, __a, __b);
+}
+
+/* vec_any_gt */
+
+static int __ATTRS_o_ai
+vec_any_gt(vector signed char __a, vector signed char __b)
+{
+  return __builtin_altivec_vcmpgtsb_p(__CR6_EQ_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector signed char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpgtsb_p(__CR6_EQ_REV, __a, (vector signed char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector unsigned char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector unsigned char __a, vector bool char __b)
+{
+  return 
+    __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV, __a, (vector unsigned char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector bool char __a, vector signed char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV,
+                                      (vector unsigned char)__a,
+                                      (vector unsigned char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector bool char __a, vector unsigned char __b)
+{
+  return 
+    __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV, (vector unsigned char)__a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector bool char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV,
+                                      (vector unsigned char)__a,
+                                      (vector unsigned char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector short __a, vector short __b)
+{
+  return __builtin_altivec_vcmpgtsh_p(__CR6_EQ_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpgtsh_p(__CR6_EQ_REV, __a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector unsigned short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector unsigned short __a, vector bool short __b)
+{
+  return 
+    __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV, __a, (vector unsigned short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector bool short __a, vector short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV,
+                                      (vector unsigned short)__a,
+                                      (vector unsigned short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector bool short __a, vector unsigned short __b)
+{
+  return
+    __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV, (vector unsigned short)__a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector bool short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV,
+                                      (vector unsigned short)__a,
+                                      (vector unsigned short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector int __a, vector int __b)
+{
+  return __builtin_altivec_vcmpgtsw_p(__CR6_EQ_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtsw_p(__CR6_EQ_REV, __a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector unsigned int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV, __a, (vector unsigned int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector bool int __a, vector int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV,
+                                      (vector unsigned int)__a,
+                                      (vector unsigned int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector bool int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV, (vector unsigned int)__a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector bool int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV,
+                                      (vector unsigned int)__a,
+                                      (vector unsigned int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpgtfp_p(__CR6_EQ_REV, __a, __b);
+}
+
+/* vec_any_le */
+
+static int __ATTRS_o_ai
+vec_any_le(vector signed char __a, vector signed char __b)
+{
+  return __builtin_altivec_vcmpgtsb_p(__CR6_LT_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector signed char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpgtsb_p(__CR6_LT_REV, __a, (vector signed char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector unsigned char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector unsigned char __a, vector bool char __b)
+{
+  return 
+    __builtin_altivec_vcmpgtub_p(__CR6_LT_REV, __a, (vector unsigned char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector bool char __a, vector signed char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV,
+                                      (vector unsigned char)__a,
+                                      (vector unsigned char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector bool char __a, vector unsigned char __b)
+{
+  return 
+    __builtin_altivec_vcmpgtub_p(__CR6_LT_REV, (vector unsigned char)__a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector bool char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV,
+                                      (vector unsigned char)__a,
+                                      (vector unsigned char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector short __a, vector short __b)
+{
+  return __builtin_altivec_vcmpgtsh_p(__CR6_LT_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpgtsh_p(__CR6_LT_REV, __a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector unsigned short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector unsigned short __a, vector bool short __b)
+{
+  return 
+    __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV, __a, (vector unsigned short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector bool short __a, vector short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV,
+                                      (vector unsigned short)__a,
+                                      (vector unsigned short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector bool short __a, vector unsigned short __b)
+{
+  return 
+    __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV, (vector unsigned short)__a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector bool short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV,
+                                      (vector unsigned short)__a,
+                                      (vector unsigned short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector int __a, vector int __b)
+{
+  return __builtin_altivec_vcmpgtsw_p(__CR6_LT_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtsw_p(__CR6_LT_REV, __a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector unsigned int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV, __a, (vector unsigned int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector bool int __a, vector int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV,
+                                      (vector unsigned int)__a,
+                                      (vector unsigned int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector bool int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV, (vector unsigned int)__a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector bool int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV,
+                                      (vector unsigned int)__a,
+                                      (vector unsigned int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpgefp_p(__CR6_EQ_REV, __b, __a);
+}
+
+/* vec_any_lt */
+
+static int __ATTRS_o_ai
+vec_any_lt(vector signed char __a, vector signed char __b)
+{
+  return __builtin_altivec_vcmpgtsb_p(__CR6_EQ_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector signed char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpgtsb_p(__CR6_EQ_REV, (vector signed char)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector unsigned char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector unsigned char __a, vector bool char __b)
+{
+  return 
+    __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV, (vector unsigned char)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector bool char __a, vector signed char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV,
+                                      (vector unsigned char)__b,
+                                      (vector unsigned char)__a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector bool char __a, vector unsigned char __b)
+{
+  return 
+    __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV, __b, (vector unsigned char)__a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector bool char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV,
+                                      (vector unsigned char)__b,
+                                      (vector unsigned char)__a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector short __a, vector short __b)
+{
+  return __builtin_altivec_vcmpgtsh_p(__CR6_EQ_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpgtsh_p(__CR6_EQ_REV, (vector short)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector unsigned short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector unsigned short __a, vector bool short __b)
+{
+  return 
+    __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV, (vector unsigned short)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector bool short __a, vector short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV,
+                                      (vector unsigned short)__b,
+                                      (vector unsigned short)__a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector bool short __a, vector unsigned short __b)
+{
+  return 
+    __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV, __b, (vector unsigned short)__a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector bool short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV,
+                                      (vector unsigned short)__b,
+                                      (vector unsigned short)__a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector int __a, vector int __b)
+{
+  return __builtin_altivec_vcmpgtsw_p(__CR6_EQ_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtsw_p(__CR6_EQ_REV, (vector int)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector unsigned int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV, (vector unsigned int)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector bool int __a, vector int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV,
+                                      (vector unsigned int)__b,
+                                      (vector unsigned int)__a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector bool int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV, __b, (vector unsigned int)__a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector bool int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV,
+                                      (vector unsigned int)__b,
+                                      (vector unsigned int)__a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpgtfp_p(__CR6_EQ_REV, __b, __a);
+}
+
+/* vec_any_nan */
+
+static int __attribute__((__always_inline__))
+vec_any_nan(vector float __a)
+{
+  return __builtin_altivec_vcmpeqfp_p(__CR6_LT_REV, __a, __a);
+}
+
+/* vec_any_ne */
+
+static int __ATTRS_o_ai
+vec_any_ne(vector signed char __a, vector signed char __b)
+{
+  return
+    __builtin_altivec_vcmpequb_p(__CR6_LT_REV, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector signed char __a, vector bool char __b)
+{
+  return
+    __builtin_altivec_vcmpequb_p(__CR6_LT_REV, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector unsigned char __a, vector unsigned char __b)
+{
+  return
+    __builtin_altivec_vcmpequb_p(__CR6_LT_REV, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector unsigned char __a, vector bool char __b)
+{
+  return
+    __builtin_altivec_vcmpequb_p(__CR6_LT_REV, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector bool char __a, vector signed char __b)
+{
+  return
+    __builtin_altivec_vcmpequb_p(__CR6_LT_REV, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector bool char __a, vector unsigned char __b)
+{
+  return
+    __builtin_altivec_vcmpequb_p(__CR6_LT_REV, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector bool char __a, vector bool char __b)
+{
+  return
+    __builtin_altivec_vcmpequb_p(__CR6_LT_REV, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector short __a, vector short __b)
+{
+  return __builtin_altivec_vcmpequh_p(__CR6_LT_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpequh_p(__CR6_LT_REV, __a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector unsigned short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vcmpequh_p(__CR6_LT_REV, 
+                                      (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector unsigned short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpequh_p(__CR6_LT_REV,
+                                      (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector bool short __a, vector short __b)
+{
+  return __builtin_altivec_vcmpequh_p(__CR6_LT_REV,
+                                      (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector bool short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vcmpequh_p(__CR6_LT_REV,
+                                      (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector bool short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpequh_p(__CR6_LT_REV,
+                                      (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector pixel __a, vector pixel __b)
+{
+  return __builtin_altivec_vcmpequh_p(__CR6_LT_REV,
+                                      (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector int __a, vector int __b)
+{
+  return __builtin_altivec_vcmpequw_p(__CR6_LT_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpequw_p(__CR6_LT_REV, __a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector unsigned int __a, vector unsigned int __b)
+{
+  return
+    __builtin_altivec_vcmpequw_p(__CR6_LT_REV, (vector int)__a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector unsigned int __a, vector bool int __b)
+{
+  return
+    __builtin_altivec_vcmpequw_p(__CR6_LT_REV, (vector int)__a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector bool int __a, vector int __b)
+{
+  return
+    __builtin_altivec_vcmpequw_p(__CR6_LT_REV, (vector int)__a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector bool int __a, vector unsigned int __b)
+{
+  return
+    __builtin_altivec_vcmpequw_p(__CR6_LT_REV, (vector int)__a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector bool int __a, vector bool int __b)
+{
+  return
+    __builtin_altivec_vcmpequw_p(__CR6_LT_REV, (vector int)__a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpeqfp_p(__CR6_LT_REV, __a, __b);
+}
+
+/* vec_any_nge */
+
+static int __attribute__((__always_inline__))
+vec_any_nge(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpgefp_p(__CR6_LT_REV, __a, __b);
+}
+
+/* vec_any_ngt */
+
+static int __attribute__((__always_inline__))
+vec_any_ngt(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpgtfp_p(__CR6_LT_REV, __a, __b);
+}
+
+/* vec_any_nle */
+
+static int __attribute__((__always_inline__))
+vec_any_nle(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpgefp_p(__CR6_LT_REV, __b, __a);
+}
+
+/* vec_any_nlt */
+
+static int __attribute__((__always_inline__))
+vec_any_nlt(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpgtfp_p(__CR6_LT_REV, __b, __a);
+}
+
+/* vec_any_numeric */
+
+static int __attribute__((__always_inline__))
+vec_any_numeric(vector float __a)
+{
+  return __builtin_altivec_vcmpeqfp_p(__CR6_EQ_REV, __a, __a);
+}
+
+/* vec_any_out */
+
+static int __attribute__((__always_inline__))
+vec_any_out(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpbfp_p(__CR6_EQ_REV, __a, __b);
+}
+
+#undef __ATTRS_o_ai
+
+#endif /* __ALTIVEC_H */
diff --git a/21.1.2/clang-include/ammintrin.h b/21.1.2/clang-include/ammintrin.h
new file mode 100644
index 0000000..d87b9cd
--- /dev/null
+++ b/21.1.2/clang-include/ammintrin.h
@@ -0,0 +1,68 @@
+/*===---- ammintrin.h - SSE4a intrinsics -----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __AMMINTRIN_H
+#define __AMMINTRIN_H
+
+#ifndef __SSE4A__
+#error "SSE4A instruction set not enabled"
+#else
+
+#include <pmmintrin.h>
+
+#define _mm_extracti_si64(x, len, idx) \
+  ((__m128i)__builtin_ia32_extrqi((__v2di)(__m128i)(x), \
+                                  (char)(len), (char)(idx)))
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_extract_si64(__m128i __x, __m128i __y)
+{
+  return (__m128i)__builtin_ia32_extrq((__v2di)__x, (__v16qi)__y);
+}
+
+#define _mm_inserti_si64(x, y, len, idx) \
+  ((__m128i)__builtin_ia32_insertqi((__v2di)(__m128i)(x), \
+                                    (__v2di)(__m128i)(y), \
+                                    (char)(len), (char)(idx)))
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_insert_si64(__m128i __x, __m128i __y)
+{
+  return (__m128i)__builtin_ia32_insertq((__v2di)__x, (__v2di)__y);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_stream_sd(double *__p, __m128d __a)
+{
+  __builtin_ia32_movntsd(__p, (__v2df)__a);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_stream_ss(float *__p, __m128 __a)
+{
+  __builtin_ia32_movntss(__p, (__v4sf)__a);
+}
+
+#endif /* __SSE4A__ */
+
+#endif /* __AMMINTRIN_H */
diff --git a/21.1.2/clang-include/arm_acle.h b/21.1.2/clang-include/arm_acle.h
new file mode 100644
index 0000000..d706745
--- /dev/null
+++ b/21.1.2/clang-include/arm_acle.h
@@ -0,0 +1,151 @@
+/*===---- arm_acle.h - ARM Non-Neon intrinsics -----------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __ARM_ACLE_H
+#define __ARM_ACLE_H
+
+#ifndef __ARM_ACLE
+#error "ACLE intrinsics support not enabled."
+#endif
+
+#include <stdint.h>
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/* Miscellaneous data-processing intrinsics */
+
+static __inline__ uint32_t __attribute__((always_inline, nodebug))
+  __clz(uint32_t t) {
+  return __builtin_clz(t);
+}
+
+static __inline__ unsigned long __attribute__((always_inline, nodebug))
+  __clzl(unsigned long t) {
+  return __builtin_clzl(t);
+}
+
+static __inline__ uint64_t __attribute__((always_inline, nodebug))
+  __clzll(uint64_t t) {
+#if __SIZEOF_LONG_LONG__ == 8
+  return __builtin_clzll(t);
+#else
+  return __builtin_clzl(t);
+#endif
+}
+
+static __inline__ uint32_t __attribute__((always_inline, nodebug))
+  __rev(uint32_t t) {
+  return __builtin_bswap32(t);
+}
+
+static __inline__ unsigned long __attribute__((always_inline, nodebug))
+  __revl(unsigned long t) {
+#if __SIZEOF_LONG__ == 4
+  return __builtin_bswap32(t);
+#else
+  return __builtin_bswap64(t);
+#endif
+}
+
+static __inline__ uint64_t __attribute__((always_inline, nodebug))
+  __revll(uint64_t t) {
+  return __builtin_bswap64(t);
+}
+
+
+/*
+ * Saturating intrinsics
+ *
+ * FIXME: Change guard to their corresponding __ARM_FEATURE flag when Q flag
+ * intrinsics are implemented and the flag is enabled.
+ */
+#if __ARM_32BIT_STATE
+#define __ssat(x, y) __builtin_arm_ssat(x, y)
+#define __usat(x, y) __builtin_arm_usat(x, y)
+
+static __inline__ int32_t __attribute__((always_inline, nodebug))
+  __qadd(int32_t t, int32_t v) {
+  return __builtin_arm_qadd(t, v);
+}
+
+static __inline__ int32_t __attribute__((always_inline, nodebug))
+  __qsub(int32_t t, int32_t v) {
+  return __builtin_arm_qsub(t, v);
+}
+
+static __inline__ int32_t __attribute__((always_inline, nodebug))
+__qdbl(int32_t t) {
+  return __builtin_arm_qadd(t, t);
+}
+#endif
+
+/* CRC32 intrinsics */
+#if __ARM_FEATURE_CRC32
+static __inline__ uint32_t __attribute__((always_inline, nodebug))
+  __crc32b(uint32_t a, uint8_t b) {
+  return __builtin_arm_crc32b(a, b);
+}
+
+static __inline__ uint32_t __attribute__((always_inline, nodebug))
+  __crc32h(uint32_t a, uint16_t b) {
+  return __builtin_arm_crc32h(a, b);
+}
+
+static __inline__ uint32_t __attribute__((always_inline, nodebug))
+  __crc32w(uint32_t a, uint32_t b) {
+  return __builtin_arm_crc32w(a, b);
+}
+
+static __inline__ uint32_t __attribute__((always_inline, nodebug))
+  __crc32d(uint32_t a, uint64_t b) {
+  return __builtin_arm_crc32d(a, b);
+}
+
+static __inline__ uint32_t __attribute__((always_inline, nodebug))
+  __crc32cb(uint32_t a, uint8_t b) {
+  return __builtin_arm_crc32cb(a, b);
+}
+
+static __inline__ uint32_t __attribute__((always_inline, nodebug))
+  __crc32ch(uint32_t a, uint16_t b) {
+  return __builtin_arm_crc32ch(a, b);
+}
+
+static __inline__ uint32_t __attribute__((always_inline, nodebug))
+  __crc32cw(uint32_t a, uint32_t b) {
+  return __builtin_arm_crc32cw(a, b);
+}
+
+static __inline__ uint32_t __attribute__((always_inline, nodebug))
+  __crc32cd(uint32_t a, uint64_t b) {
+  return __builtin_arm_crc32cd(a, b);
+}
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /* __ARM_ACLE_H */
diff --git a/21.1.2/clang-include/avx2intrin.h b/21.1.2/clang-include/avx2intrin.h
new file mode 100644
index 0000000..394fdfe
--- /dev/null
+++ b/21.1.2/clang-include/avx2intrin.h
@@ -0,0 +1,1234 @@
+/*===---- avx2intrin.h - AVX2 intrinsics -----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <avx2intrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX2INTRIN_H
+#define __AVX2INTRIN_H
+
+/* SSE4 Multiple Packed Sums of Absolute Difference.  */
+#define _mm256_mpsadbw_epu8(X, Y, M) __builtin_ia32_mpsadbw256((X), (Y), (M))
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_abs_epi8(__m256i __a)
+{
+    return (__m256i)__builtin_ia32_pabsb256((__v32qi)__a);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_abs_epi16(__m256i __a)
+{
+    return (__m256i)__builtin_ia32_pabsw256((__v16hi)__a);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_abs_epi32(__m256i __a)
+{
+    return (__m256i)__builtin_ia32_pabsd256((__v8si)__a);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_packs_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_packsswb256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_packs_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_packssdw256((__v8si)__a, (__v8si)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_packus_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_packuswb256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_packus_epi32(__m256i __V1, __m256i __V2)
+{
+  return (__m256i) __builtin_ia32_packusdw256((__v8si)__V1, (__v8si)__V2);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_add_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v32qi)__a + (__v32qi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_add_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v16hi)__a + (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_add_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v8si)__a + (__v8si)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_add_epi64(__m256i __a, __m256i __b)
+{
+  return __a + __b;
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_adds_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_paddsb256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_adds_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_paddsw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_adds_epu8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_paddusb256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_adds_epu16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_paddusw256((__v16hi)__a, (__v16hi)__b);
+}
+
+#define _mm256_alignr_epi8(a, b, n) __extension__ ({ \
+  __m256i __a = (a); \
+  __m256i __b = (b); \
+  (__m256i)__builtin_ia32_palignr256((__v32qi)__a, (__v32qi)__b, (n)); })
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_and_si256(__m256i __a, __m256i __b)
+{
+  return __a & __b;
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_andnot_si256(__m256i __a, __m256i __b)
+{
+  return ~__a & __b;
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_avg_epu8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pavgb256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_avg_epu16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pavgw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M)
+{
+  return (__m256i)__builtin_ia32_pblendvb256((__v32qi)__V1, (__v32qi)__V2,
+                                              (__v32qi)__M);
+}
+
+#define _mm256_blend_epi16(V1, V2, M) __extension__ ({ \
+  __m256i __V1 = (V1); \
+  __m256i __V2 = (V2); \
+  (__m256i)__builtin_shufflevector((__v16hi)__V1, (__v16hi)__V2, \
+                                   (((M) & 0x01) ? 16 : 0), \
+                                   (((M) & 0x02) ? 17 : 1), \
+                                   (((M) & 0x04) ? 18 : 2), \
+                                   (((M) & 0x08) ? 19 : 3), \
+                                   (((M) & 0x10) ? 20 : 4), \
+                                   (((M) & 0x20) ? 21 : 5), \
+                                   (((M) & 0x40) ? 22 : 6), \
+                                   (((M) & 0x80) ? 23 : 7), \
+                                   (((M) & 0x01) ? 24 : 8), \
+                                   (((M) & 0x02) ? 25 : 9), \
+                                   (((M) & 0x04) ? 26 : 10), \
+                                   (((M) & 0x08) ? 27 : 11), \
+                                   (((M) & 0x10) ? 28 : 12), \
+                                   (((M) & 0x20) ? 29 : 13), \
+                                   (((M) & 0x40) ? 30 : 14), \
+                                   (((M) & 0x80) ? 31 : 15)); })
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpeq_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v32qi)__a == (__v32qi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpeq_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v16hi)__a == (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpeq_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v8si)__a == (__v8si)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpeq_epi64(__m256i __a, __m256i __b)
+{
+  return (__m256i)(__a == __b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpgt_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v32qi)__a > (__v32qi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpgt_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v16hi)__a > (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpgt_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v8si)__a > (__v8si)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpgt_epi64(__m256i __a, __m256i __b)
+{
+  return (__m256i)(__a > __b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_hadd_epi16(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_phaddw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_hadd_epi32(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_hadds_epi16(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_hsub_epi16(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_phsubw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_hsub_epi32(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_hsubs_epi16(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_maddubs_epi16(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_pmaddubsw256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_madd_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmaddwd256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_max_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmaxsb256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_max_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmaxsw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_max_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmaxsd256((__v8si)__a, (__v8si)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_max_epu8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmaxub256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_max_epu16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmaxuw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_max_epu32(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmaxud256((__v8si)__a, (__v8si)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_min_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pminsb256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_min_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pminsw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_min_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pminsd256((__v8si)__a, (__v8si)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_min_epu8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pminub256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_min_epu16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pminuw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_min_epu32(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pminud256((__v8si)__a, (__v8si)__b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm256_movemask_epi8(__m256i __a)
+{
+  return __builtin_ia32_pmovmskb256((__v32qi)__a);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cvtepi8_epi16(__m128i __V)
+{
+  return (__m256i)__builtin_ia32_pmovsxbw256((__v16qi)__V);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cvtepi8_epi32(__m128i __V)
+{
+  return (__m256i)__builtin_ia32_pmovsxbd256((__v16qi)__V);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cvtepi8_epi64(__m128i __V)
+{
+  return (__m256i)__builtin_ia32_pmovsxbq256((__v16qi)__V);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cvtepi16_epi32(__m128i __V)
+{
+  return (__m256i)__builtin_ia32_pmovsxwd256((__v8hi)__V);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cvtepi16_epi64(__m128i __V)
+{
+  return (__m256i)__builtin_ia32_pmovsxwq256((__v8hi)__V);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cvtepi32_epi64(__m128i __V)
+{
+  return (__m256i)__builtin_ia32_pmovsxdq256((__v4si)__V);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cvtepu8_epi16(__m128i __V)
+{
+  return (__m256i)__builtin_ia32_pmovzxbw256((__v16qi)__V);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cvtepu8_epi32(__m128i __V)
+{
+  return (__m256i)__builtin_ia32_pmovzxbd256((__v16qi)__V);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cvtepu8_epi64(__m128i __V)
+{
+  return (__m256i)__builtin_ia32_pmovzxbq256((__v16qi)__V);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cvtepu16_epi32(__m128i __V)
+{
+  return (__m256i)__builtin_ia32_pmovzxwd256((__v8hi)__V);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cvtepu16_epi64(__m128i __V)
+{
+  return (__m256i)__builtin_ia32_pmovzxwq256((__v8hi)__V);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cvtepu32_epi64(__m128i __V)
+{
+  return (__m256i)__builtin_ia32_pmovzxdq256((__v4si)__V);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_mul_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmuldq256((__v8si)__a, (__v8si)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_mulhrs_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmulhrsw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_mulhi_epu16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmulhuw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_mulhi_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmulhw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_mullo_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v16hi)__a * (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_mullo_epi32 (__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v8si)__a * (__v8si)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_mul_epu32(__m256i __a, __m256i __b)
+{
+  return __builtin_ia32_pmuludq256((__v8si)__a, (__v8si)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_or_si256(__m256i __a, __m256i __b)
+{
+  return __a | __b;
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_sad_epu8(__m256i __a, __m256i __b)
+{
+  return __builtin_ia32_psadbw256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_shuffle_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pshufb256((__v32qi)__a, (__v32qi)__b);
+}
+
+#define _mm256_shuffle_epi32(a, imm) __extension__ ({ \
+  __m256i __a = (a); \
+  (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)_mm256_set1_epi32(0), \
+                                   (imm) & 0x3, ((imm) & 0xc) >> 2, \
+                                   ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \
+                                   4 + (((imm) & 0x03) >> 0), \
+                                   4 + (((imm) & 0x0c) >> 2), \
+                                   4 + (((imm) & 0x30) >> 4), \
+                                   4 + (((imm) & 0xc0) >> 6)); })
+
+#define _mm256_shufflehi_epi16(a, imm) __extension__ ({ \
+  __m256i __a = (a); \
+  (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)_mm256_set1_epi16(0), \
+                                   0, 1, 2, 3, \
+                                   4 + (((imm) & 0x03) >> 0), \
+                                   4 + (((imm) & 0x0c) >> 2), \
+                                   4 + (((imm) & 0x30) >> 4), \
+                                   4 + (((imm) & 0xc0) >> 6), \
+                                   8, 9, 10, 11, \
+                                   12 + (((imm) & 0x03) >> 0), \
+                                   12 + (((imm) & 0x0c) >> 2), \
+                                   12 + (((imm) & 0x30) >> 4), \
+                                   12 + (((imm) & 0xc0) >> 6)); })
+
+#define _mm256_shufflelo_epi16(a, imm) __extension__ ({ \
+  __m256i __a = (a); \
+  (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)_mm256_set1_epi16(0), \
+                                   (imm) & 0x3,((imm) & 0xc) >> 2, \
+                                   ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \
+                                   4, 5, 6, 7, \
+                                   8 + (((imm) & 0x03) >> 0), \
+                                   8 + (((imm) & 0x0c) >> 2), \
+                                   8 + (((imm) & 0x30) >> 4), \
+                                   8 + (((imm) & 0xc0) >> 6), \
+                                   12, 13, 14, 15); })
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_sign_epi8(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_psignb256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_sign_epi16(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_psignw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_sign_epi32(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_psignd256((__v8si)__a, (__v8si)__b);
+}
+
+#define _mm256_slli_si256(a, count) __extension__ ({ \
+  __m256i __a = (a); \
+  (__m256i)__builtin_ia32_pslldqi256(__a, (count)*8); })
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_slli_epi16(__m256i __a, int __count)
+{
+  return (__m256i)__builtin_ia32_psllwi256((__v16hi)__a, __count);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_sll_epi16(__m256i __a, __m128i __count)
+{
+  return (__m256i)__builtin_ia32_psllw256((__v16hi)__a, (__v8hi)__count);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_slli_epi32(__m256i __a, int __count)
+{
+  return (__m256i)__builtin_ia32_pslldi256((__v8si)__a, __count);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_sll_epi32(__m256i __a, __m128i __count)
+{
+  return (__m256i)__builtin_ia32_pslld256((__v8si)__a, (__v4si)__count);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_slli_epi64(__m256i __a, int __count)
+{
+  return __builtin_ia32_psllqi256(__a, __count);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_sll_epi64(__m256i __a, __m128i __count)
+{
+  return __builtin_ia32_psllq256(__a, __count);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_srai_epi16(__m256i __a, int __count)
+{
+  return (__m256i)__builtin_ia32_psrawi256((__v16hi)__a, __count);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_sra_epi16(__m256i __a, __m128i __count)
+{
+  return (__m256i)__builtin_ia32_psraw256((__v16hi)__a, (__v8hi)__count);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_srai_epi32(__m256i __a, int __count)
+{
+  return (__m256i)__builtin_ia32_psradi256((__v8si)__a, __count);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_sra_epi32(__m256i __a, __m128i __count)
+{
+  return (__m256i)__builtin_ia32_psrad256((__v8si)__a, (__v4si)__count);
+}
+
+#define _mm256_srli_si256(a, count) __extension__ ({ \
+  __m256i __a = (a); \
+  (__m256i)__builtin_ia32_psrldqi256(__a, (count)*8); })
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_srli_epi16(__m256i __a, int __count)
+{
+  return (__m256i)__builtin_ia32_psrlwi256((__v16hi)__a, __count);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_srl_epi16(__m256i __a, __m128i __count)
+{
+  return (__m256i)__builtin_ia32_psrlw256((__v16hi)__a, (__v8hi)__count);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_srli_epi32(__m256i __a, int __count)
+{
+  return (__m256i)__builtin_ia32_psrldi256((__v8si)__a, __count);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_srl_epi32(__m256i __a, __m128i __count)
+{
+  return (__m256i)__builtin_ia32_psrld256((__v8si)__a, (__v4si)__count);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_srli_epi64(__m256i __a, int __count)
+{
+  return __builtin_ia32_psrlqi256(__a, __count);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_srl_epi64(__m256i __a, __m128i __count)
+{
+  return __builtin_ia32_psrlq256(__a, __count);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_sub_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v32qi)__a - (__v32qi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_sub_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v16hi)__a - (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_sub_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v8si)__a - (__v8si)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_sub_epi64(__m256i __a, __m256i __b)
+{
+  return __a - __b;
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_subs_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_psubsb256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_subs_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_psubsw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_subs_epu8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_psubusb256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_subs_epu16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_psubusw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_unpackhi_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 8, 32+8, 9, 32+9, 10, 32+10, 11, 32+11, 12, 32+12, 13, 32+13, 14, 32+14, 15, 32+15, 24, 32+24, 25, 32+25, 26, 32+26, 27, 32+27, 28, 32+28, 29, 32+29, 30, 32+30, 31, 32+31);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_unpackhi_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_unpackhi_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 2, 8+2, 3, 8+3, 6, 8+6, 7, 8+7);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_unpackhi_epi64(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_shufflevector(__a, __b, 1, 4+1, 3, 4+3);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_unpacklo_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 0, 32+0, 1, 32+1, 2, 32+2, 3, 32+3, 4, 32+4, 5, 32+5, 6, 32+6, 7, 32+7, 16, 32+16, 17, 32+17, 18, 32+18, 19, 32+19, 20, 32+20, 21, 32+21, 22, 32+22, 23, 32+23);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_unpacklo_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_unpacklo_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 0, 8+0, 1, 8+1, 4, 8+4, 5, 8+5);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_unpacklo_epi64(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_shufflevector(__a, __b, 0, 4+0, 2, 4+2);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_xor_si256(__m256i __a, __m256i __b)
+{
+  return __a ^ __b;
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_stream_load_si256(__m256i *__V)
+{
+  return (__m256i)__builtin_ia32_movntdqa256((__v4di *)__V);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_broadcastss_ps(__m128 __X)
+{
+  return (__m128)__builtin_ia32_vbroadcastss_ps((__v4sf)__X);
+}
+
+static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_broadcastss_ps(__m128 __X)
+{
+  return (__m256)__builtin_ia32_vbroadcastss_ps256((__v4sf)__X);
+}
+
+static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_broadcastsd_pd(__m128d __X)
+{
+  return (__m256d)__builtin_ia32_vbroadcastsd_pd256((__v2df)__X);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_broadcastsi128_si256(__m128i __X)
+{
+  return (__m256i)__builtin_ia32_vbroadcastsi256(__X);
+}
+
+#define _mm_blend_epi32(V1, V2, M) __extension__ ({ \
+  __m128i __V1 = (V1); \
+  __m128i __V2 = (V2); \
+  (__m128i)__builtin_shufflevector((__v4si)__V1, (__v4si)__V2, \
+                                   (((M) & 0x01) ? 4 : 0), \
+                                   (((M) & 0x02) ? 5 : 1), \
+                                   (((M) & 0x04) ? 6 : 2), \
+                                   (((M) & 0x08) ? 7 : 3)); })
+
+#define _mm256_blend_epi32(V1, V2, M) __extension__ ({ \
+  __m256i __V1 = (V1); \
+  __m256i __V2 = (V2); \
+  (__m256i)__builtin_shufflevector((__v8si)__V1, (__v8si)__V2, \
+                                   (((M) & 0x01) ?  8 : 0), \
+                                   (((M) & 0x02) ?  9 : 1), \
+                                   (((M) & 0x04) ? 10 : 2), \
+                                   (((M) & 0x08) ? 11 : 3), \
+                                   (((M) & 0x10) ? 12 : 4), \
+                                   (((M) & 0x20) ? 13 : 5), \
+                                   (((M) & 0x40) ? 14 : 6), \
+                                   (((M) & 0x80) ? 15 : 7)); })
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_broadcastb_epi8(__m128i __X)
+{
+  return (__m256i)__builtin_ia32_pbroadcastb256((__v16qi)__X);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_broadcastw_epi16(__m128i __X)
+{
+  return (__m256i)__builtin_ia32_pbroadcastw256((__v8hi)__X);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_broadcastd_epi32(__m128i __X)
+{
+  return (__m256i)__builtin_ia32_pbroadcastd256((__v4si)__X);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_broadcastq_epi64(__m128i __X)
+{
+  return (__m256i)__builtin_ia32_pbroadcastq256(__X);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_broadcastb_epi8(__m128i __X)
+{
+  return (__m128i)__builtin_ia32_pbroadcastb128((__v16qi)__X);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_broadcastw_epi16(__m128i __X)
+{
+  return (__m128i)__builtin_ia32_pbroadcastw128((__v8hi)__X);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_broadcastd_epi32(__m128i __X)
+{
+  return (__m128i)__builtin_ia32_pbroadcastd128((__v4si)__X);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_broadcastq_epi64(__m128i __X)
+{
+  return (__m128i)__builtin_ia32_pbroadcastq128(__X);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_permutevar8x32_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_permvarsi256((__v8si)__a, (__v8si)__b);
+}
+
+#define _mm256_permute4x64_pd(V, M) __extension__ ({ \
+  __m256d __V = (V); \
+  (__m256d)__builtin_shufflevector((__v4df)__V, (__v4df) _mm256_setzero_pd(), \
+                                   (M) & 0x3, ((M) & 0xc) >> 2, \
+                                   ((M) & 0x30) >> 4, ((M) & 0xc0) >> 6); })
+
+static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_permutevar8x32_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)__builtin_ia32_permvarsf256((__v8sf)__a, (__v8sf)__b);
+}
+
+#define _mm256_permute4x64_epi64(V, M) __extension__ ({ \
+  __m256i __V = (V); \
+  (__m256i)__builtin_shufflevector((__v4di)__V, (__v4di) _mm256_setzero_si256(), \
+                                   (M) & 0x3, ((M) & 0xc) >> 2, \
+                                   ((M) & 0x30) >> 4, ((M) & 0xc0) >> 6); })
+
+#define _mm256_permute2x128_si256(V1, V2, M) __extension__ ({ \
+  __m256i __V1 = (V1); \
+  __m256i __V2 = (V2); \
+  (__m256i)__builtin_ia32_permti256(__V1, __V2, (M)); })
+
+#define _mm256_extracti128_si256(A, O) __extension__ ({ \
+  __m256i __A = (A); \
+  (__m128i)__builtin_ia32_extract128i256(__A, (O)); })
+
+#define _mm256_inserti128_si256(V1, V2, O) __extension__ ({ \
+  __m256i __V1 = (V1); \
+  __m128i __V2 = (V2); \
+  (__m256i)__builtin_ia32_insert128i256(__V1, __V2, (O)); })
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_maskload_epi32(int const *__X, __m256i __M)
+{
+  return (__m256i)__builtin_ia32_maskloadd256((const __v8si *)__X, (__v8si)__M);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_maskload_epi64(long long const *__X, __m256i __M)
+{
+  return (__m256i)__builtin_ia32_maskloadq256((const __v4di *)__X, __M);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_maskload_epi32(int const *__X, __m128i __M)
+{
+  return (__m128i)__builtin_ia32_maskloadd((const __v4si *)__X, (__v4si)__M);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_maskload_epi64(long long const *__X, __m128i __M)
+{
+  return (__m128i)__builtin_ia32_maskloadq((const __v2di *)__X, (__v2di)__M);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm256_maskstore_epi32(int *__X, __m256i __M, __m256i __Y)
+{
+  __builtin_ia32_maskstored256((__v8si *)__X, (__v8si)__M, (__v8si)__Y);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm256_maskstore_epi64(long long *__X, __m256i __M, __m256i __Y)
+{
+  __builtin_ia32_maskstoreq256((__v4di *)__X, __M, __Y);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_maskstore_epi32(int *__X, __m128i __M, __m128i __Y)
+{
+  __builtin_ia32_maskstored((__v4si *)__X, (__v4si)__M, (__v4si)__Y);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y)
+{
+  __builtin_ia32_maskstoreq((__v2di *)__X, __M, __Y);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_sllv_epi32(__m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_psllv8si((__v8si)__X, (__v8si)__Y);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sllv_epi32(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_psllv4si((__v4si)__X, (__v4si)__Y);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_sllv_epi64(__m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_psllv4di(__X, __Y);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sllv_epi64(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_psllv2di(__X, __Y);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_srav_epi32(__m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_psrav8si((__v8si)__X, (__v8si)__Y);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_srav_epi32(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_psrav4si((__v4si)__X, (__v4si)__Y);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_srlv_epi32(__m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_psrlv8si((__v8si)__X, (__v8si)__Y);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_srlv_epi32(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_psrlv4si((__v4si)__X, (__v4si)__Y);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_srlv_epi64(__m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_psrlv4di(__X, __Y);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_srlv_epi64(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_psrlv2di(__X, __Y);
+}
+
+#define _mm_mask_i32gather_pd(a, m, i, mask, s) __extension__ ({ \
+  __m128d __a = (a); \
+  double const *__m = (m); \
+  __m128i __i = (i); \
+  __m128d __mask = (mask); \
+  (__m128d)__builtin_ia32_gatherd_pd((__v2df)__a, (const __v2df *)__m, \
+             (__v4si)__i, (__v2df)__mask, (s)); })
+
+#define _mm256_mask_i32gather_pd(a, m, i, mask, s) __extension__ ({ \
+  __m256d __a = (a); \
+  double const *__m = (m); \
+  __m128i __i = (i); \
+  __m256d __mask = (mask); \
+  (__m256d)__builtin_ia32_gatherd_pd256((__v4df)__a, (const __v4df *)__m, \
+             (__v4si)__i, (__v4df)__mask, (s)); })
+
+#define _mm_mask_i64gather_pd(a, m, i, mask, s) __extension__ ({ \
+  __m128d __a = (a); \
+  double const *__m = (m); \
+  __m128i __i = (i); \
+  __m128d __mask = (mask); \
+  (__m128d)__builtin_ia32_gatherq_pd((__v2df)__a, (const __v2df *)__m, \
+             (__v2di)__i, (__v2df)__mask, (s)); })
+
+#define _mm256_mask_i64gather_pd(a, m, i, mask, s) __extension__ ({ \
+  __m256d __a = (a); \
+  double const *__m = (m); \
+  __m256i __i = (i); \
+  __m256d __mask = (mask); \
+  (__m256d)__builtin_ia32_gatherq_pd256((__v4df)__a, (const __v4df *)__m, \
+             (__v4di)__i, (__v4df)__mask, (s)); })
+
+#define _mm_mask_i32gather_ps(a, m, i, mask, s) __extension__ ({ \
+  __m128 __a = (a); \
+  float const *__m = (m); \
+  __m128i __i = (i); \
+  __m128 __mask = (mask); \
+  (__m128)__builtin_ia32_gatherd_ps((__v4sf)__a, (const __v4sf *)__m, \
+            (__v4si)__i, (__v4sf)__mask, (s)); })
+
+#define _mm256_mask_i32gather_ps(a, m, i, mask, s) __extension__ ({ \
+  __m256 __a = (a); \
+  float const *__m = (m); \
+  __m256i __i = (i); \
+  __m256 __mask = (mask); \
+  (__m256)__builtin_ia32_gatherd_ps256((__v8sf)__a, (const __v8sf *)__m, \
+            (__v8si)__i, (__v8sf)__mask, (s)); })
+
+#define _mm_mask_i64gather_ps(a, m, i, mask, s) __extension__ ({ \
+  __m128 __a = (a); \
+  float const *__m = (m); \
+  __m128i __i = (i); \
+  __m128 __mask = (mask); \
+  (__m128)__builtin_ia32_gatherq_ps((__v4sf)__a, (const __v4sf *)__m, \
+            (__v2di)__i, (__v4sf)__mask, (s)); })
+
+#define _mm256_mask_i64gather_ps(a, m, i, mask, s) __extension__ ({ \
+  __m128 __a = (a); \
+  float const *__m = (m); \
+  __m256i __i = (i); \
+  __m128 __mask = (mask); \
+  (__m128)__builtin_ia32_gatherq_ps256((__v4sf)__a, (const __v4sf *)__m, \
+            (__v4di)__i, (__v4sf)__mask, (s)); })
+
+#define _mm_mask_i32gather_epi32(a, m, i, mask, s) __extension__ ({ \
+  __m128i __a = (a); \
+  int const *__m = (m); \
+  __m128i __i = (i); \
+  __m128i __mask = (mask); \
+  (__m128i)__builtin_ia32_gatherd_d((__v4si)__a, (const __v4si *)__m, \
+            (__v4si)__i, (__v4si)__mask, (s)); })
+
+#define _mm256_mask_i32gather_epi32(a, m, i, mask, s) __extension__ ({ \
+  __m256i __a = (a); \
+  int const *__m = (m); \
+  __m256i __i = (i); \
+  __m256i __mask = (mask); \
+  (__m256i)__builtin_ia32_gatherd_d256((__v8si)__a, (const __v8si *)__m, \
+            (__v8si)__i, (__v8si)__mask, (s)); })
+
+#define _mm_mask_i64gather_epi32(a, m, i, mask, s) __extension__ ({ \
+  __m128i __a = (a); \
+  int const *__m = (m); \
+  __m128i __i = (i); \
+  __m128i __mask = (mask); \
+  (__m128i)__builtin_ia32_gatherq_d((__v4si)__a, (const __v4si *)__m, \
+            (__v2di)__i, (__v4si)__mask, (s)); })
+
+#define _mm256_mask_i64gather_epi32(a, m, i, mask, s) __extension__ ({ \
+  __m128i __a = (a); \
+  int const *__m = (m); \
+  __m256i __i = (i); \
+  __m128i __mask = (mask); \
+  (__m128i)__builtin_ia32_gatherq_d256((__v4si)__a, (const __v4si *)__m, \
+            (__v4di)__i, (__v4si)__mask, (s)); })
+
+#define _mm_mask_i32gather_epi64(a, m, i, mask, s) __extension__ ({ \
+  __m128i __a = (a); \
+  long long const *__m = (m); \
+  __m128i __i = (i); \
+  __m128i __mask = (mask); \
+  (__m128i)__builtin_ia32_gatherd_q((__v2di)__a, (const __v2di *)__m, \
+             (__v4si)__i, (__v2di)__mask, (s)); })
+
+#define _mm256_mask_i32gather_epi64(a, m, i, mask, s) __extension__ ({ \
+  __m256i __a = (a); \
+  long long const *__m = (m); \
+  __m128i __i = (i); \
+  __m256i __mask = (mask); \
+  (__m256i)__builtin_ia32_gatherd_q256((__v4di)__a, (const __v4di *)__m, \
+             (__v4si)__i, (__v4di)__mask, (s)); })
+
+#define _mm_mask_i64gather_epi64(a, m, i, mask, s) __extension__ ({ \
+  __m128i __a = (a); \
+  long long const *__m = (m); \
+  __m128i __i = (i); \
+  __m128i __mask = (mask); \
+  (__m128i)__builtin_ia32_gatherq_q((__v2di)__a, (const __v2di *)__m, \
+             (__v2di)__i, (__v2di)__mask, (s)); })
+
+#define _mm256_mask_i64gather_epi64(a, m, i, mask, s) __extension__ ({ \
+  __m256i __a = (a); \
+  long long const *__m = (m); \
+  __m256i __i = (i); \
+  __m256i __mask = (mask); \
+  (__m256i)__builtin_ia32_gatherq_q256((__v4di)__a, (const __v4di *)__m, \
+             (__v4di)__i, (__v4di)__mask, (s)); })
+
+#define _mm_i32gather_pd(m, i, s) __extension__ ({ \
+  double const *__m = (m); \
+  __m128i __i = (i); \
+  (__m128d)__builtin_ia32_gatherd_pd((__v2df)_mm_setzero_pd(), \
+             (const __v2df *)__m, (__v4si)__i, \
+             (__v2df)_mm_set1_pd((double)(long long int)-1), (s)); })
+
+#define _mm256_i32gather_pd(m, i, s) __extension__ ({ \
+  double const *__m = (m); \
+  __m128i __i = (i); \
+  (__m256d)__builtin_ia32_gatherd_pd256((__v4df)_mm256_setzero_pd(), \
+             (const __v4df *)__m, (__v4si)__i, \
+             (__v4df)_mm256_set1_pd((double)(long long int)-1), (s)); })
+
+#define _mm_i64gather_pd(m, i, s) __extension__ ({ \
+  double const *__m = (m); \
+  __m128i __i = (i); \
+  (__m128d)__builtin_ia32_gatherq_pd((__v2df)_mm_setzero_pd(), \
+             (const __v2df *)__m, (__v2di)__i, \
+             (__v2df)_mm_set1_pd((double)(long long int)-1), (s)); })
+
+#define _mm256_i64gather_pd(m, i, s) __extension__ ({ \
+  double const *__m = (m); \
+  __m256i __i = (i); \
+  (__m256d)__builtin_ia32_gatherq_pd256((__v4df)_mm256_setzero_pd(), \
+             (const __v4df *)__m, (__v4di)__i, \
+             (__v4df)_mm256_set1_pd((double)(long long int)-1), (s)); })
+
+#define _mm_i32gather_ps(m, i, s) __extension__ ({ \
+  float const *__m = (m); \
+  __m128i __i = (i); \
+  (__m128)__builtin_ia32_gatherd_ps((__v4sf)_mm_setzero_ps(), \
+             (const __v4sf *)__m, (__v4si)__i, \
+             (__v4sf)_mm_set1_ps((float)(int)-1), (s)); })
+
+#define _mm256_i32gather_ps(m, i, s) __extension__ ({ \
+  float const *__m = (m); \
+  __m256i __i = (i); \
+  (__m256)__builtin_ia32_gatherd_ps256((__v8sf)_mm256_setzero_ps(), \
+             (const __v8sf *)__m, (__v8si)__i, \
+             (__v8sf)_mm256_set1_ps((float)(int)-1), (s)); })
+
+#define _mm_i64gather_ps(m, i, s) __extension__ ({ \
+  float const *__m = (m); \
+  __m128i __i = (i); \
+  (__m128)__builtin_ia32_gatherq_ps((__v4sf)_mm_setzero_ps(), \
+             (const __v4sf *)__m, (__v2di)__i, \
+             (__v4sf)_mm_set1_ps((float)(int)-1), (s)); })
+
+#define _mm256_i64gather_ps(m, i, s) __extension__ ({ \
+  float const *__m = (m); \
+  __m256i __i = (i); \
+  (__m128)__builtin_ia32_gatherq_ps256((__v4sf)_mm_setzero_ps(), \
+             (const __v4sf *)__m, (__v4di)__i, \
+             (__v4sf)_mm_set1_ps((float)(int)-1), (s)); })
+
+#define _mm_i32gather_epi32(m, i, s) __extension__ ({ \
+  int const *__m = (m); \
+  __m128i __i = (i); \
+  (__m128i)__builtin_ia32_gatherd_d((__v4si)_mm_setzero_si128(), \
+            (const __v4si *)__m, (__v4si)__i, \
+            (__v4si)_mm_set1_epi32(-1), (s)); })
+
+#define _mm256_i32gather_epi32(m, i, s) __extension__ ({ \
+  int const *__m = (m); \
+  __m256i __i = (i); \
+  (__m256i)__builtin_ia32_gatherd_d256((__v8si)_mm256_setzero_si256(), \
+            (const __v8si *)__m, (__v8si)__i, \
+            (__v8si)_mm256_set1_epi32(-1), (s)); })
+
+#define _mm_i64gather_epi32(m, i, s) __extension__ ({ \
+  int const *__m = (m); \
+  __m128i __i = (i); \
+  (__m128i)__builtin_ia32_gatherq_d((__v4si)_mm_setzero_si128(), \
+            (const __v4si *)__m, (__v2di)__i, \
+            (__v4si)_mm_set1_epi32(-1), (s)); })
+
+#define _mm256_i64gather_epi32(m, i, s) __extension__ ({ \
+  int const *__m = (m); \
+  __m256i __i = (i); \
+  (__m128i)__builtin_ia32_gatherq_d256((__v4si)_mm_setzero_si128(), \
+            (const __v4si *)__m, (__v4di)__i, \
+            (__v4si)_mm_set1_epi32(-1), (s)); })
+
+#define _mm_i32gather_epi64(m, i, s) __extension__ ({ \
+  long long const *__m = (m); \
+  __m128i __i = (i); \
+  (__m128i)__builtin_ia32_gatherd_q((__v2di)_mm_setzero_si128(), \
+             (const __v2di *)__m, (__v4si)__i, \
+             (__v2di)_mm_set1_epi64x(-1), (s)); })
+
+#define _mm256_i32gather_epi64(m, i, s) __extension__ ({ \
+  long long const *__m = (m); \
+  __m128i __i = (i); \
+  (__m256i)__builtin_ia32_gatherd_q256((__v4di)_mm256_setzero_si256(), \
+             (const __v4di *)__m, (__v4si)__i, \
+             (__v4di)_mm256_set1_epi64x(-1), (s)); })
+
+#define _mm_i64gather_epi64(m, i, s) __extension__ ({ \
+  long long const *__m = (m); \
+  __m128i __i = (i); \
+  (__m128i)__builtin_ia32_gatherq_q((__v2di)_mm_setzero_si128(), \
+             (const __v2di *)__m, (__v2di)__i, \
+             (__v2di)_mm_set1_epi64x(-1), (s)); })
+
+#define _mm256_i64gather_epi64(m, i, s) __extension__ ({ \
+  long long const *__m = (m); \
+  __m256i __i = (i); \
+  (__m256i)__builtin_ia32_gatherq_q256((__v4di)_mm256_setzero_si256(), \
+             (const __v4di *)__m, (__v4di)__i, \
+             (__v4di)_mm256_set1_epi64x(-1), (s)); })
+
+#endif /* __AVX2INTRIN_H */
diff --git a/21.1.2/clang-include/avxintrin.h b/21.1.2/clang-include/avxintrin.h
new file mode 100644
index 0000000..4e1044a
--- /dev/null
+++ b/21.1.2/clang-include/avxintrin.h
@@ -0,0 +1,1239 @@
+/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVXINTRIN_H
+#define __AVXINTRIN_H
+
+typedef double __v4df __attribute__ ((__vector_size__ (32)));
+typedef float __v8sf __attribute__ ((__vector_size__ (32)));
+typedef long long __v4di __attribute__ ((__vector_size__ (32)));
+typedef int __v8si __attribute__ ((__vector_size__ (32)));
+typedef short __v16hi __attribute__ ((__vector_size__ (32)));
+typedef char __v32qi __attribute__ ((__vector_size__ (32)));
+
+typedef float __m256 __attribute__ ((__vector_size__ (32)));
+typedef double __m256d __attribute__((__vector_size__(32)));
+typedef long long __m256i __attribute__((__vector_size__(32)));
+
+/* Arithmetic */
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_add_pd(__m256d __a, __m256d __b)
+{
+  return __a+__b;
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_add_ps(__m256 __a, __m256 __b)
+{
+  return __a+__b;
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_sub_pd(__m256d __a, __m256d __b)
+{
+  return __a-__b;
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_sub_ps(__m256 __a, __m256 __b)
+{
+  return __a-__b;
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_addsub_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b);
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_addsub_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b);
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_div_pd(__m256d __a, __m256d __b)
+{
+  return __a / __b;
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_div_ps(__m256 __a, __m256 __b)
+{
+  return __a / __b;
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_max_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b);
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_max_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b);
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_min_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b);
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_min_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b);
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_mul_pd(__m256d __a, __m256d __b)
+{
+  return __a * __b;
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_mul_ps(__m256 __a, __m256 __b)
+{
+  return __a * __b;
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_sqrt_pd(__m256d __a)
+{
+  return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_sqrt_ps(__m256 __a)
+{
+  return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_rsqrt_ps(__m256 __a)
+{
+  return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a);
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_rcp_ps(__m256 __a)
+{
+  return (__m256)__builtin_ia32_rcpps256((__v8sf)__a);
+}
+
+#define _mm256_round_pd(V, M) __extension__ ({ \
+    __m256d __V = (V); \
+    (__m256d)__builtin_ia32_roundpd256((__v4df)__V, (M)); })
+
+#define _mm256_round_ps(V, M) __extension__ ({ \
+  __m256 __V = (V); \
+  (__m256)__builtin_ia32_roundps256((__v8sf)__V, (M)); })
+
+#define _mm256_ceil_pd(V)  _mm256_round_pd((V), _MM_FROUND_CEIL)
+#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)
+#define _mm256_ceil_ps(V)  _mm256_round_ps((V), _MM_FROUND_CEIL)
+#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)
+
+/* Logical */
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_and_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)((__v4di)__a & (__v4di)__b);
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_and_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)((__v8si)__a & (__v8si)__b);
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_andnot_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)(~(__v4di)__a & (__v4di)__b);
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_andnot_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)(~(__v8si)__a & (__v8si)__b);
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_or_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)((__v4di)__a | (__v4di)__b);
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_or_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)((__v8si)__a | (__v8si)__b);
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_xor_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)((__v4di)__a ^ (__v4di)__b);
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_xor_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)((__v8si)__a ^ (__v8si)__b);
+}
+
+/* Horizontal arithmetic */
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_hadd_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b);
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_hadd_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_hsub_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_hsub_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b);
+}
+
+/* Vector permutations */
+static __inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_permutevar_pd(__m128d __a, __m128i __c)
+{
+  return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c);
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_permutevar_pd(__m256d __a, __m256i __c)
+{
+  return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c);
+}
+
+static __inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_permutevar_ps(__m128 __a, __m128i __c)
+{
+  return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c);
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_permutevar_ps(__m256 __a, __m256i __c)
+{
+  return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c);
+}
+
+#define _mm_permute_pd(A, C) __extension__ ({ \
+  __m128d __A = (A); \
+  (__m128d)__builtin_shufflevector((__v2df)__A, (__v2df) _mm_setzero_pd(), \
+                                   (C) & 0x1, ((C) & 0x2) >> 1); })
+
+#define _mm256_permute_pd(A, C) __extension__ ({ \
+  __m256d __A = (A); \
+  (__m256d)__builtin_shufflevector((__v4df)__A, (__v4df) _mm256_setzero_pd(), \
+                                   (C) & 0x1, ((C) & 0x2) >> 1, \
+                                   2 + (((C) & 0x4) >> 2), \
+                                   2 + (((C) & 0x8) >> 3)); })
+
+#define _mm_permute_ps(A, C) __extension__ ({ \
+  __m128 __A = (A); \
+  (__m128)__builtin_shufflevector((__v4sf)__A, (__v4sf) _mm_setzero_ps(), \
+                                   (C) & 0x3, ((C) & 0xc) >> 2, \
+                                   ((C) & 0x30) >> 4, ((C) & 0xc0) >> 6); })
+
+#define _mm256_permute_ps(A, C) __extension__ ({ \
+  __m256 __A = (A); \
+  (__m256)__builtin_shufflevector((__v8sf)__A, (__v8sf) _mm256_setzero_ps(), \
+                                  (C) & 0x3, ((C) & 0xc) >> 2, \
+                                  ((C) & 0x30) >> 4, ((C) & 0xc0) >> 6, \
+                                  4 + (((C) & 0x03) >> 0), \
+                                  4 + (((C) & 0x0c) >> 2), \
+                                  4 + (((C) & 0x30) >> 4), \
+                                  4 + (((C) & 0xc0) >> 6)); })
+
+#define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ \
+  __m256d __V1 = (V1); \
+  __m256d __V2 = (V2); \
+  (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)__V1, (__v4df)__V2, (M)); })
+
+#define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \
+  __m256 __V1 = (V1); \
+  __m256 __V2 = (V2); \
+  (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)__V1, (__v8sf)__V2, (M)); })
+
+#define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \
+  __m256i __V1 = (V1); \
+  __m256i __V2 = (V2); \
+  (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)__V1, (__v8si)__V2, (M)); })
+
+/* Vector Blend */
+#define _mm256_blend_pd(V1, V2, M) __extension__ ({ \
+  __m256d __V1 = (V1); \
+  __m256d __V2 = (V2); \
+  (__m256d)__builtin_shufflevector((__v4df)__V1, (__v4df)__V2, \
+                                   (((M) & 0x01) ? 4 : 0), \
+                                   (((M) & 0x02) ? 5 : 1), \
+                                   (((M) & 0x04) ? 6 : 2), \
+                                   (((M) & 0x08) ? 7 : 3)); })
+
+#define _mm256_blend_ps(V1, V2, M) __extension__ ({ \
+  __m256 __V1 = (V1); \
+  __m256 __V2 = (V2); \
+  (__m256)__builtin_shufflevector((__v8sf)__V1, (__v8sf)__V2, \
+                                  (((M) & 0x01) ?  8 : 0), \
+                                  (((M) & 0x02) ?  9 : 1), \
+                                  (((M) & 0x04) ? 10 : 2), \
+                                  (((M) & 0x08) ? 11 : 3), \
+                                  (((M) & 0x10) ? 12 : 4), \
+                                  (((M) & 0x20) ? 13 : 5), \
+                                  (((M) & 0x40) ? 14 : 6), \
+                                  (((M) & 0x80) ? 15 : 7)); })
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
+{
+  return (__m256d)__builtin_ia32_blendvpd256(
+    (__v4df)__a, (__v4df)__b, (__v4df)__c);
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
+{
+  return (__m256)__builtin_ia32_blendvps256(
+    (__v8sf)__a, (__v8sf)__b, (__v8sf)__c);
+}
+
+/* Vector Dot Product */
+#define _mm256_dp_ps(V1, V2, M) __extension__ ({ \
+  __m256 __V1 = (V1); \
+  __m256 __V2 = (V2); \
+  (__m256)__builtin_ia32_dpps256((__v8sf)__V1, (__v8sf)__V2, (M)); })
+
+/* Vector shuffle */
+#define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \
+        __m256 __a = (a); \
+        __m256 __b = (b); \
+        (__m256)__builtin_shufflevector((__v8sf)__a, (__v8sf)__b, \
+        (mask) & 0x3,                ((mask) & 0xc) >> 2, \
+        (((mask) & 0x30) >> 4) + 8,  (((mask) & 0xc0) >> 6) + 8, \
+        ((mask) & 0x3) + 4,          (((mask) & 0xc) >> 2) + 4, \
+        (((mask) & 0x30) >> 4) + 12, (((mask) & 0xc0) >> 6) + 12); })
+
+#define _mm256_shuffle_pd(a, b, mask) __extension__ ({ \
+        __m256d __a = (a); \
+        __m256d __b = (b); \
+        (__m256d)__builtin_shufflevector((__v4df)__a, (__v4df)__b, \
+        (mask) & 0x1, \
+        (((mask) & 0x2) >> 1) + 4, \
+        (((mask) & 0x4) >> 2) + 2, \
+        (((mask) & 0x8) >> 3) + 6); })
+
+/* Compare */
+#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
+#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling)  */
+#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling)  */
+#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling)  */
+#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling)  */
+#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling)  */
+#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling)  */
+#define _CMP_ORD_Q    0x07 /* Ordered (nonsignaling)   */
+#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling)  */
+#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unord, signaling)  */
+#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling)  */
+#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling)  */
+#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling)  */
+#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling)  */
+#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling)  */
+#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling)  */
+#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling)  */
+#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling)  */
+#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling)  */
+#define _CMP_UNORD_S  0x13 /* Unordered (signaling)  */
+#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling)  */
+#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling)  */
+#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unord, non-signaling)  */
+#define _CMP_ORD_S    0x17 /* Ordered (signaling)  */
+#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling)  */
+#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unord, non-sign)  */
+#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling)  */
+#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling)  */
+#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling)  */
+#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling)  */
+#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling)  */
+#define _CMP_TRUE_US  0x1f /* True (unordered, signaling)  */
+
+#define _mm_cmp_pd(a, b, c) __extension__ ({ \
+  __m128d __a = (a); \
+  __m128d __b = (b); \
+  (__m128d)__builtin_ia32_cmppd((__v2df)__a, (__v2df)__b, (c)); })
+
+#define _mm_cmp_ps(a, b, c) __extension__ ({ \
+  __m128 __a = (a); \
+  __m128 __b = (b); \
+  (__m128)__builtin_ia32_cmpps((__v4sf)__a, (__v4sf)__b, (c)); })
+
+#define _mm256_cmp_pd(a, b, c) __extension__ ({ \
+  __m256d __a = (a); \
+  __m256d __b = (b); \
+  (__m256d)__builtin_ia32_cmppd256((__v4df)__a, (__v4df)__b, (c)); })
+
+#define _mm256_cmp_ps(a, b, c) __extension__ ({ \
+  __m256 __a = (a); \
+  __m256 __b = (b); \
+  (__m256)__builtin_ia32_cmpps256((__v8sf)__a, (__v8sf)__b, (c)); })
+
+#define _mm_cmp_sd(a, b, c) __extension__ ({ \
+  __m128d __a = (a); \
+  __m128d __b = (b); \
+  (__m128d)__builtin_ia32_cmpsd((__v2df)__a, (__v2df)__b, (c)); })
+
+#define _mm_cmp_ss(a, b, c) __extension__ ({ \
+  __m128 __a = (a); \
+  __m128 __b = (b); \
+  (__m128)__builtin_ia32_cmpss((__v4sf)__a, (__v4sf)__b, (c)); })
+
+/* Vector extract */
+#define _mm256_extractf128_pd(A, O) __extension__ ({ \
+  __m256d __A = (A); \
+  (__m128d)__builtin_ia32_vextractf128_pd256((__v4df)__A, (O)); })
+
+#define _mm256_extractf128_ps(A, O) __extension__ ({ \
+  __m256 __A = (A); \
+  (__m128)__builtin_ia32_vextractf128_ps256((__v8sf)__A, (O)); })
+
+#define _mm256_extractf128_si256(A, O) __extension__ ({ \
+  __m256i __A = (A); \
+  (__m128i)__builtin_ia32_vextractf128_si256((__v8si)__A, (O)); })
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+_mm256_extract_epi32(__m256i __a, int const __imm)
+{
+  __v8si __b = (__v8si)__a;
+  return __b[__imm & 7];
+}
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+_mm256_extract_epi16(__m256i __a, int const __imm)
+{
+  __v16hi __b = (__v16hi)__a;
+  return __b[__imm & 15];
+}
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+_mm256_extract_epi8(__m256i __a, int const __imm)
+{
+  __v32qi __b = (__v32qi)__a;
+  return __b[__imm & 31];
+}
+
+#ifdef __x86_64__
+static __inline long long  __attribute__((__always_inline__, __nodebug__))
+_mm256_extract_epi64(__m256i __a, const int __imm)
+{
+  __v4di __b = (__v4di)__a;
+  return __b[__imm & 3];
+}
+#endif
+
+/* Vector insert */
+#define _mm256_insertf128_pd(V1, V2, O) __extension__ ({ \
+  __m256d __V1 = (V1); \
+  __m128d __V2 = (V2); \
+  (__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)__V1, (__v2df)__V2, (O)); })
+
+#define _mm256_insertf128_ps(V1, V2, O) __extension__ ({ \
+  __m256 __V1 = (V1); \
+  __m128 __V2 = (V2); \
+  (__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)__V1, (__v4sf)__V2, (O)); })
+
+#define _mm256_insertf128_si256(V1, V2, O) __extension__ ({ \
+  __m256i __V1 = (V1); \
+  __m128i __V2 = (V2); \
+  (__m256i)__builtin_ia32_vinsertf128_si256((__v8si)__V1, (__v4si)__V2, (O)); })
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_insert_epi32(__m256i __a, int __b, int const __imm)
+{
+  __v8si __c = (__v8si)__a;
+  __c[__imm & 7] = __b;
+  return (__m256i)__c;
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_insert_epi16(__m256i __a, int __b, int const __imm)
+{
+  __v16hi __c = (__v16hi)__a;
+  __c[__imm & 15] = __b;
+  return (__m256i)__c;
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_insert_epi8(__m256i __a, int __b, int const __imm)
+{
+  __v32qi __c = (__v32qi)__a;
+  __c[__imm & 31] = __b;
+  return (__m256i)__c;
+}
+
+#ifdef __x86_64__
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_insert_epi64(__m256i __a, int __b, int const __imm)
+{
+  __v4di __c = (__v4di)__a;
+  __c[__imm & 3] = __b;
+  return (__m256i)__c;
+}
+#endif
+
+/* Conversion */
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_cvtepi32_pd(__m128i __a)
+{
+  return (__m256d)__builtin_ia32_cvtdq2pd256((__v4si) __a);
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_cvtepi32_ps(__m256i __a)
+{
+  return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) __a);
+}
+
+static __inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm256_cvtpd_ps(__m256d __a)
+{
+  return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a);
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cvtps_epi32(__m256 __a)
+{
+  return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a);
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_cvtps_pd(__m128 __a)
+{
+  return (__m256d)__builtin_ia32_cvtps2pd256((__v4sf) __a);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm256_cvttpd_epi32(__m256d __a)
+{
+  return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm256_cvtpd_epi32(__m256d __a)
+{
+  return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cvttps_epi32(__m256 __a)
+{
+  return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a);
+}
+
+/* Vector replicate */
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_movehdup_ps(__m256 __a)
+{
+  return __builtin_shufflevector(__a, __a, 1, 1, 3, 3, 5, 5, 7, 7);
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_moveldup_ps(__m256 __a)
+{
+  return __builtin_shufflevector(__a, __a, 0, 0, 2, 2, 4, 4, 6, 6);
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_movedup_pd(__m256d __a)
+{
+  return __builtin_shufflevector(__a, __a, 0, 0, 2, 2);
+}
+
+/* Unpack and Interleave */
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_unpackhi_pd(__m256d __a, __m256d __b)
+{
+  return __builtin_shufflevector(__a, __b, 1, 5, 1+2, 5+2);
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_unpacklo_pd(__m256d __a, __m256d __b)
+{
+  return __builtin_shufflevector(__a, __b, 0, 4, 0+2, 4+2);
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_unpackhi_ps(__m256 __a, __m256 __b)
+{
+  return __builtin_shufflevector(__a, __b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_unpacklo_ps(__m256 __a, __m256 __b)
+{
+  return __builtin_shufflevector(__a, __b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
+}
+
+/* Bit Test */
+static __inline int __attribute__((__always_inline__, __nodebug__))
+_mm_testz_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b);
+}
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+_mm_testc_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b);
+}
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+_mm_testnzc_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b);
+}
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+_mm_testz_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b);
+}
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+_mm_testc_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b);
+}
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+_mm_testnzc_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b);
+}
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+_mm256_testz_pd(__m256d __a, __m256d __b)
+{
+  return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b);
+}
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+_mm256_testc_pd(__m256d __a, __m256d __b)
+{
+  return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b);
+}
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+_mm256_testnzc_pd(__m256d __a, __m256d __b)
+{
+  return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b);
+}
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+_mm256_testz_ps(__m256 __a, __m256 __b)
+{
+  return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b);
+}
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+_mm256_testc_ps(__m256 __a, __m256 __b)
+{
+  return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b);
+}
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+_mm256_testnzc_ps(__m256 __a, __m256 __b)
+{
+  return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b);
+}
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+_mm256_testz_si256(__m256i __a, __m256i __b)
+{
+  return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b);
+}
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+_mm256_testc_si256(__m256i __a, __m256i __b)
+{
+  return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b);
+}
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+_mm256_testnzc_si256(__m256i __a, __m256i __b)
+{
+  return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b);
+}
+
+/* Vector extract sign mask */
+static __inline int __attribute__((__always_inline__, __nodebug__))
+_mm256_movemask_pd(__m256d __a)
+{
+  return __builtin_ia32_movmskpd256((__v4df)__a);
+}
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+_mm256_movemask_ps(__m256 __a)
+{
+  return __builtin_ia32_movmskps256((__v8sf)__a);
+}
+
+/* Vector zero */
+static __inline void __attribute__((__always_inline__, __nodebug__))
+_mm256_zeroall(void)
+{
+  __builtin_ia32_vzeroall();
+}
+
+static __inline void __attribute__((__always_inline__, __nodebug__))
+_mm256_zeroupper(void)
+{
+  __builtin_ia32_vzeroupper();
+}
+
+/* Vector load with broadcast */
+static __inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_broadcast_ss(float const *__a)
+{
+  float __f = *__a;
+  return (__m128)(__v4sf){ __f, __f, __f, __f };
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_broadcast_sd(double const *__a)
+{
+  double __d = *__a;
+  return (__m256d)(__v4df){ __d, __d, __d, __d };
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_broadcast_ss(float const *__a)
+{
+  float __f = *__a;
+  return (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_broadcast_pd(__m128d const *__a)
+{
+  return (__m256d)__builtin_ia32_vbroadcastf128_pd256(__a);
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_broadcast_ps(__m128 const *__a)
+{
+  return (__m256)__builtin_ia32_vbroadcastf128_ps256(__a);
+}
+
+/* SIMD load ops */
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_load_pd(double const *__p)
+{
+  return *(__m256d *)__p;
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_load_ps(float const *__p)
+{
+  return *(__m256 *)__p;
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_loadu_pd(double const *__p)
+{
+  struct __loadu_pd {
+    __m256d __v;
+  } __attribute__((packed, may_alias));
+  return ((struct __loadu_pd*)__p)->__v;
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_loadu_ps(float const *__p)
+{
+  struct __loadu_ps {
+    __m256 __v;
+  } __attribute__((packed, may_alias));
+  return ((struct __loadu_ps*)__p)->__v;
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_load_si256(__m256i const *__p)
+{
+  return *__p;
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_loadu_si256(__m256i const *__p)
+{
+  struct __loadu_si256 {
+    __m256i __v;
+  } __attribute__((packed, may_alias));
+  return ((struct __loadu_si256*)__p)->__v;
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_lddqu_si256(__m256i const *__p)
+{
+  return (__m256i)__builtin_ia32_lddqu256((char const *)__p);
+}
+
+/* SIMD store ops */
+static __inline void __attribute__((__always_inline__, __nodebug__))
+_mm256_store_pd(double *__p, __m256d __a)
+{
+  *(__m256d *)__p = __a;
+}
+
+static __inline void __attribute__((__always_inline__, __nodebug__))
+_mm256_store_ps(float *__p, __m256 __a)
+{
+  *(__m256 *)__p = __a;
+}
+
+static __inline void __attribute__((__always_inline__, __nodebug__))
+_mm256_storeu_pd(double *__p, __m256d __a)
+{
+  __builtin_ia32_storeupd256(__p, (__v4df)__a);
+}
+
+static __inline void __attribute__((__always_inline__, __nodebug__))
+_mm256_storeu_ps(float *__p, __m256 __a)
+{
+  __builtin_ia32_storeups256(__p, (__v8sf)__a);
+}
+
+static __inline void __attribute__((__always_inline__, __nodebug__))
+_mm256_store_si256(__m256i *__p, __m256i __a)
+{
+  *__p = __a;
+}
+
+static __inline void __attribute__((__always_inline__, __nodebug__))
+_mm256_storeu_si256(__m256i *__p, __m256i __a)
+{
+  __builtin_ia32_storedqu256((char *)__p, (__v32qi)__a);
+}
+
+/* Conditional load ops */
+static __inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_maskload_pd(double const *__p, __m128d __m)
+{
+  return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2df)__m);
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_maskload_pd(double const *__p, __m256d __m)
+{
+  return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p,
+                                               (__v4df)__m);
+}
+
+static __inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_maskload_ps(float const *__p, __m128 __m)
+{
+  return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4sf)__m);
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_maskload_ps(float const *__p, __m256 __m)
+{
+  return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8sf)__m);
+}
+
+/* Conditional store ops */
+static __inline void __attribute__((__always_inline__, __nodebug__))
+_mm256_maskstore_ps(float *__p, __m256 __m, __m256 __a)
+{
+  __builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8sf)__m, (__v8sf)__a);
+}
+
+static __inline void __attribute__((__always_inline__, __nodebug__))
+_mm_maskstore_pd(double *__p, __m128d __m, __m128d __a)
+{
+  __builtin_ia32_maskstorepd((__v2df *)__p, (__v2df)__m, (__v2df)__a);
+}
+
+static __inline void __attribute__((__always_inline__, __nodebug__))
+_mm256_maskstore_pd(double *__p, __m256d __m, __m256d __a)
+{
+  __builtin_ia32_maskstorepd256((__v4df *)__p, (__v4df)__m, (__v4df)__a);
+}
+
+static __inline void __attribute__((__always_inline__, __nodebug__))
+_mm_maskstore_ps(float *__p, __m128 __m, __m128 __a)
+{
+  __builtin_ia32_maskstoreps((__v4sf *)__p, (__v4sf)__m, (__v4sf)__a);
+}
+
+/* Cacheability support ops */
+static __inline void __attribute__((__always_inline__, __nodebug__))
+_mm256_stream_si256(__m256i *__a, __m256i __b)
+{
+  __builtin_ia32_movntdq256((__v4di *)__a, (__v4di)__b);
+}
+
+static __inline void __attribute__((__always_inline__, __nodebug__))
+_mm256_stream_pd(double *__a, __m256d __b)
+{
+  __builtin_ia32_movntpd256(__a, (__v4df)__b);
+}
+
+static __inline void __attribute__((__always_inline__, __nodebug__))
+_mm256_stream_ps(float *__p, __m256 __a)
+{
+  __builtin_ia32_movntps256(__p, (__v8sf)__a);
+}
+
+/* Create vectors */
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_set_pd(double __a, double __b, double __c, double __d)
+{
+  return (__m256d){ __d, __c, __b, __a };
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_set_ps(float __a, float __b, float __c, float __d,
+	            float __e, float __f, float __g, float __h)
+{
+  return (__m256){ __h, __g, __f, __e, __d, __c, __b, __a };
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
+		             int __i4, int __i5, int __i6, int __i7)
+{
+  return (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 };
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
+		             short __w11, short __w10, short __w09, short __w08,
+		             short __w07, short __w06, short __w05, short __w04,
+		             short __w03, short __w02, short __w01, short __w00)
+{
+  return (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06,
+    __w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 };
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
+		            char __b27, char __b26, char __b25, char __b24,
+		            char __b23, char __b22, char __b21, char __b20,
+		            char __b19, char __b18, char __b17, char __b16,
+		            char __b15, char __b14, char __b13, char __b12,
+		            char __b11, char __b10, char __b09, char __b08,
+		            char __b07, char __b06, char __b05, char __b04,
+		            char __b03, char __b02, char __b01, char __b00)
+{
+  return (__m256i)(__v32qi){
+    __b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
+    __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
+    __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
+    __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31
+  };
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
+{
+  return (__m256i)(__v4di){ __d, __c, __b, __a };
+}
+
+/* Create vectors with elements in reverse order */
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_setr_pd(double __a, double __b, double __c, double __d)
+{
+  return (__m256d){ __a, __b, __c, __d };
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_setr_ps(float __a, float __b, float __c, float __d,
+		           float __e, float __f, float __g, float __h)
+{
+  return (__m256){ __a, __b, __c, __d, __e, __f, __g, __h };
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
+		              int __i4, int __i5, int __i6, int __i7)
+{
+  return (__m256i)(__v8si){ __i0, __i1, __i2, __i3, __i4, __i5, __i6, __i7 };
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
+		   short __w11, short __w10, short __w09, short __w08,
+		   short __w07, short __w06, short __w05, short __w04,
+		   short __w03, short __w02, short __w01, short __w00)
+{
+  return (__m256i)(__v16hi){ __w15, __w14, __w13, __w12, __w11, __w10, __w09,
+    __w08, __w07, __w06, __w05, __w04, __w03, __w02, __w01, __w00 };
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
+		             char __b27, char __b26, char __b25, char __b24,
+		             char __b23, char __b22, char __b21, char __b20,
+		             char __b19, char __b18, char __b17, char __b16,
+		             char __b15, char __b14, char __b13, char __b12,
+		             char __b11, char __b10, char __b09, char __b08,
+		             char __b07, char __b06, char __b05, char __b04,
+		             char __b03, char __b02, char __b01, char __b00)
+{
+  return (__m256i)(__v32qi){
+    __b31, __b30, __b29, __b28, __b27, __b26, __b25, __b24,
+		__b23, __b22, __b21, __b20, __b19, __b18, __b17, __b16,
+		__b15, __b14, __b13, __b12, __b11, __b10, __b09, __b08,
+		__b07, __b06, __b05, __b04, __b03, __b02, __b01, __b00 };
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
+{
+  return (__m256i)(__v4di){ __a, __b, __c, __d };
+}
+
+/* Create vectors with repeated elements */
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_set1_pd(double __w)
+{
+  return (__m256d){ __w, __w, __w, __w };
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_set1_ps(float __w)
+{
+  return (__m256){ __w, __w, __w, __w, __w, __w, __w, __w };
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_set1_epi32(int __i)
+{
+  return (__m256i)(__v8si){ __i, __i, __i, __i, __i, __i, __i, __i };
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_set1_epi16(short __w)
+{
+  return (__m256i)(__v16hi){ __w, __w, __w, __w, __w, __w, __w, __w, __w, __w,
+    __w, __w, __w, __w, __w, __w };
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_set1_epi8(char __b)
+{
+  return (__m256i)(__v32qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
+    __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
+    __b, __b, __b, __b, __b, __b, __b };
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_set1_epi64x(long long __q)
+{
+  return (__m256i)(__v4di){ __q, __q, __q, __q };
+}
+
+/* Create zeroed vectors */
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_setzero_pd(void)
+{
+  return (__m256d){ 0, 0, 0, 0 };
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_setzero_ps(void)
+{
+  return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_setzero_si256(void)
+{
+  return (__m256i){ 0LL, 0LL, 0LL, 0LL };
+}
+
+/* Cast between vector types */
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_castpd_ps(__m256d __a)
+{
+  return (__m256)__a;
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_castpd_si256(__m256d __a)
+{
+  return (__m256i)__a;
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_castps_pd(__m256 __a)
+{
+  return (__m256d)__a;
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_castps_si256(__m256 __a)
+{
+  return (__m256i)__a;
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_castsi256_ps(__m256i __a)
+{
+  return (__m256)__a;
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_castsi256_pd(__m256i __a)
+{
+  return (__m256d)__a;
+}
+
+static __inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm256_castpd256_pd128(__m256d __a)
+{
+  return __builtin_shufflevector(__a, __a, 0, 1);
+}
+
+static __inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm256_castps256_ps128(__m256 __a)
+{
+  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm256_castsi256_si128(__m256i __a)
+{
+  return __builtin_shufflevector(__a, __a, 0, 1);
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_castpd128_pd256(__m128d __a)
+{
+  return __builtin_shufflevector(__a, __a, 0, 1, -1, -1);
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_castps128_ps256(__m128 __a)
+{
+  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, -1, -1, -1, -1);
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_castsi128_si256(__m128i __a)
+{
+  return __builtin_shufflevector(__a, __a, 0, 1, -1, -1);
+}
+
+/* SIMD load ops (unaligned) */
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
+{
+  struct __loadu_ps {
+    __m128 __v;
+  } __attribute__((__packed__, __may_alias__));
+
+  __m256 __v256 = _mm256_castps128_ps256(((struct __loadu_ps*)__addr_lo)->__v);
+  return _mm256_insertf128_ps(__v256, ((struct __loadu_ps*)__addr_hi)->__v, 1);
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
+{
+  struct __loadu_pd {
+    __m128d __v;
+  } __attribute__((__packed__, __may_alias__));
+  
+  __m256d __v256 = _mm256_castpd128_pd256(((struct __loadu_pd*)__addr_lo)->__v);
+  return _mm256_insertf128_pd(__v256, ((struct __loadu_pd*)__addr_hi)->__v, 1);
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_loadu2_m128i(__m128i const *__addr_hi, __m128i const *__addr_lo)
+{
+  struct __loadu_si128 {
+    __m128i __v;
+  } __attribute__((packed, may_alias));
+  __m256i __v256 = _mm256_castsi128_si256(
+    ((struct __loadu_si128*)__addr_lo)->__v);
+  return _mm256_insertf128_si256(__v256,
+                                 ((struct __loadu_si128*)__addr_hi)->__v, 1);
+}
+
+/* SIMD store ops (unaligned) */
+static __inline void __attribute__((__always_inline__, __nodebug__))
+_mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
+{
+  __m128 __v128;
+
+  __v128 = _mm256_castps256_ps128(__a);
+  __builtin_ia32_storeups(__addr_lo, __v128);
+  __v128 = _mm256_extractf128_ps(__a, 1);
+  __builtin_ia32_storeups(__addr_hi, __v128);
+}
+
+static __inline void __attribute__((__always_inline__, __nodebug__))
+_mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
+{
+  __m128d __v128;
+
+  __v128 = _mm256_castpd256_pd128(__a);
+  __builtin_ia32_storeupd(__addr_lo, __v128);
+  __v128 = _mm256_extractf128_pd(__a, 1);
+  __builtin_ia32_storeupd(__addr_hi, __v128);
+}
+
+static __inline void __attribute__((__always_inline__, __nodebug__))
+_mm256_storeu2_m128i(__m128i *__addr_hi, __m128i *__addr_lo, __m256i __a)
+{
+  __m128i __v128;
+
+  __v128 = _mm256_castsi256_si128(__a);
+  __builtin_ia32_storedqu((char *)__addr_lo, (__v16qi)__v128);
+  __v128 = _mm256_extractf128_si256(__a, 1);
+  __builtin_ia32_storedqu((char *)__addr_hi, (__v16qi)__v128);
+}
+
+#endif /* __AVXINTRIN_H */
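Editorial aside (not part of the prebuilt header): a minimal sketch of how the unaligned load/store and zeroupper intrinsics defined above compose, assuming a host toolchain building with -mavx; the helper name and buffers are hypothetical.

#include <immintrin.h>

/* Illustrative only: add two unaligned arrays of eight floats using the
 * _mm256_loadu_ps/_mm256_storeu_ps definitions above (compile with -mavx). */
static void add8_floats(const float *a, const float *b, float *out)
{
  __m256 va = _mm256_loadu_ps(a);                /* unaligned 256-bit load */
  __m256 vb = _mm256_loadu_ps(b);
  _mm256_storeu_ps(out, _mm256_add_ps(va, vb));  /* unaligned 256-bit store */
  _mm256_zeroupper();                            /* clear upper lanes before legacy SSE code */
}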
diff --git a/21.1.2/clang-include/bmi2intrin.h b/21.1.2/clang-include/bmi2intrin.h
new file mode 100644
index 0000000..a05cfad
--- /dev/null
+++ b/21.1.2/clang-include/bmi2intrin.h
@@ -0,0 +1,94 @@
+/*===---- bmi2intrin.h - BMI2 intrinsics -----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
+#error "Never use <bmi2intrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __BMI2__
+# error "BMI2 instruction set not enabled"
+#endif /* __BMI2__ */
+
+#ifndef __BMI2INTRIN_H
+#define __BMI2INTRIN_H
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+_bzhi_u32(unsigned int __X, unsigned int __Y)
+{
+  return __builtin_ia32_bzhi_si(__X, __Y);
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+_pdep_u32(unsigned int __X, unsigned int __Y)
+{
+  return __builtin_ia32_pdep_si(__X, __Y);
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+_pext_u32(unsigned int __X, unsigned int __Y)
+{
+  return __builtin_ia32_pext_si(__X, __Y);
+}
+
+#ifdef  __x86_64__
+
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+_bzhi_u64(unsigned long long __X, unsigned long long __Y)
+{
+  return __builtin_ia32_bzhi_di(__X, __Y);
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+_pdep_u64(unsigned long long __X, unsigned long long __Y)
+{
+  return __builtin_ia32_pdep_di(__X, __Y);
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+_pext_u64(unsigned long long __X, unsigned long long __Y)
+{
+  return __builtin_ia32_pext_di(__X, __Y);
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+_mulx_u64 (unsigned long long __X, unsigned long long __Y,
+	   unsigned long long *__P)
+{
+  unsigned __int128 __res = (unsigned __int128) __X * __Y;
+  *__P = (unsigned long long) (__res >> 64);
+  return (unsigned long long) __res;
+}
+
+#else /* !__x86_64__ */
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+_mulx_u32 (unsigned int __X, unsigned int __Y, unsigned int *__P)
+{
+  unsigned long long __res = (unsigned long long) __X * __Y;
+  *__P = (unsigned int) (__res >> 32);
+  return (unsigned int) __res;
+}
+
+#endif /* !__x86_64__  */
+
+#endif /* __BMI2INTRIN_H */
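Editorial aside: a small usage sketch for the PDEP/PEXT wrappers above, assuming a build with -mbmi2; the function names and the bit mask are hypothetical.

#include <x86intrin.h>

/* Illustrative only (compile with -mbmi2): gather the even-indexed bits of
 * x into the contiguous low bits with PEXT, then scatter them back with PDEP. */
static unsigned int gather_even_bits(unsigned int x)
{
  return _pext_u32(x, 0x55555555u);   /* extract bits selected by the mask */
}

static unsigned int scatter_even_bits(unsigned int x)
{
  return _pdep_u32(x, 0x55555555u);   /* deposit low bits into mask positions */
}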
diff --git a/21.1.2/clang-include/bmiintrin.h b/21.1.2/clang-include/bmiintrin.h
new file mode 100644
index 0000000..43c4a5e
--- /dev/null
+++ b/21.1.2/clang-include/bmiintrin.h
@@ -0,0 +1,148 @@
+/*===---- bmiintrin.h - BMI intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
+#error "Never use <bmiintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __BMI__
+# error "BMI instruction set not enabled"
+#endif /* __BMI__ */
+
+#ifndef __BMIINTRIN_H
+#define __BMIINTRIN_H
+
+#define _tzcnt_u16(a)     (__tzcnt_u16((a)))
+#define _andn_u32(a, b)   (__andn_u32((a), (b)))
+/* _bextr_u32 != __bextr_u32 */
+#define _blsi_u32(a)      (__blsi_u32((a)))
+#define _blsmsk_u32(a)    (__blsmsk_u32((a)))
+#define _blsr_u32(a)      (__blsr_u32((a)))
+#define _tzcnt_u32(a)     (__tzcnt_u32((a)))
+
+static __inline__ unsigned short __attribute__((__always_inline__, __nodebug__))
+__tzcnt_u16(unsigned short __X)
+{
+  return __builtin_ctzs(__X);
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+__andn_u32(unsigned int __X, unsigned int __Y)
+{
+  return ~__X & __Y;
+}
+
+/* AMD-specified, double-leading-underscore version of BEXTR */
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+__bextr_u32(unsigned int __X, unsigned int __Y)
+{
+  return __builtin_ia32_bextr_u32(__X, __Y);
+}
+
+/* Intel-specified, single-leading-underscore version of BEXTR */
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+_bextr_u32(unsigned int __X, unsigned int __Y, unsigned int __Z)
+{
+  return __builtin_ia32_bextr_u32 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+__blsi_u32(unsigned int __X)
+{
+  return __X & -__X;
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+__blsmsk_u32(unsigned int __X)
+{
+  return __X ^ (__X - 1);
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+__blsr_u32(unsigned int __X)
+{
+  return __X & (__X - 1);
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+__tzcnt_u32(unsigned int __X)
+{
+  return __builtin_ctz(__X);
+}
+
+#ifdef __x86_64__
+
+#define _andn_u64(a, b)   (__andn_u64((a), (b)))
+/* _bextr_u64 != __bextr_u64 */
+#define _blsi_u64(a)      (__blsi_u64((a)))
+#define _blsmsk_u64(a)    (__blsmsk_u64((a)))
+#define _blsr_u64(a)      (__blsr_u64((a)))
+#define _tzcnt_u64(a)     (__tzcnt_u64((a)))
+
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+__andn_u64 (unsigned long long __X, unsigned long long __Y)
+{
+  return ~__X & __Y;
+}
+
+/* AMD-specified, double-leading-underscore version of BEXTR */
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+__bextr_u64(unsigned long long __X, unsigned long long __Y)
+{
+  return __builtin_ia32_bextr_u64(__X, __Y);
+}
+
+/* Intel-specified, single-leading-underscore version of BEXTR */
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+_bextr_u64(unsigned long long __X, unsigned int __Y, unsigned int __Z)
+{
+  return __builtin_ia32_bextr_u64 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+__blsi_u64(unsigned long long __X)
+{
+  return __X & -__X;
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+__blsmsk_u64(unsigned long long __X)
+{
+  return __X ^ (__X - 1);
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+__blsr_u64(unsigned long long __X)
+{
+  return __X & (__X - 1);
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+__tzcnt_u64(unsigned long long __X)
+{
+  return __builtin_ctzll(__X);
+}
+
+#endif /* __x86_64__ */
+
+#endif /* __BMIINTRIN_H */
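Editorial aside: the comments above note that _bextr_u32 and __bextr_u32 take different operands; the sketch below shows the equivalence, assuming a build with -mbmi. The helper is hypothetical.

#include <x86intrin.h>

/* Illustrative only (compile with -mbmi): both calls extract an 8-bit field
 * starting at bit 4. The Intel-style _bextr_u32 packs start/length itself;
 * the AMD-style __bextr_u32 expects the already-packed control word. */
static unsigned int extract_byte_at_4(unsigned int x)
{
  unsigned int intel = _bextr_u32(x, 4, 8);
  unsigned int amd   = __bextr_u32(x, 4u | (8u << 8));
  return intel == amd ? intel : 0u;   /* the two forms agree */
}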
diff --git a/21.1.2/clang-include/cpuid.h b/21.1.2/clang-include/cpuid.h
new file mode 100644
index 0000000..f9254e9
--- /dev/null
+++ b/21.1.2/clang-include/cpuid.h
@@ -0,0 +1,157 @@
+/*===---- cpuid.h - X86 cpu model detection --------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !(__x86_64__ || __i386__)
+#error this header is for x86 only
+#endif
+
+/* Features in %ecx for level 1 */
+#define bit_SSE3        0x00000001
+#define bit_PCLMULQDQ   0x00000002
+#define bit_DTES64      0x00000004
+#define bit_MONITOR     0x00000008
+#define bit_DSCPL       0x00000010
+#define bit_VMX         0x00000020
+#define bit_SMX         0x00000040
+#define bit_EIST        0x00000080
+#define bit_TM2         0x00000100
+#define bit_SSSE3       0x00000200
+#define bit_CNXTID      0x00000400
+#define bit_FMA         0x00001000
+#define bit_CMPXCHG16B  0x00002000
+#define bit_xTPR        0x00004000
+#define bit_PDCM        0x00008000
+#define bit_PCID        0x00020000
+#define bit_DCA         0x00040000
+#define bit_SSE41       0x00080000
+#define bit_SSE42       0x00100000
+#define bit_x2APIC      0x00200000
+#define bit_MOVBE       0x00400000
+#define bit_POPCNT      0x00800000
+#define bit_TSCDeadline 0x01000000
+#define bit_AESNI       0x02000000
+#define bit_XSAVE       0x04000000
+#define bit_OSXSAVE     0x08000000
+#define bit_AVX         0x10000000
+#define bit_RDRAND      0x40000000
+
+/* Features in %edx for level 1 */
+#define bit_FPU         0x00000001
+#define bit_VME         0x00000002
+#define bit_DE          0x00000004
+#define bit_PSE         0x00000008
+#define bit_TSC         0x00000010
+#define bit_MSR         0x00000020
+#define bit_PAE         0x00000040
+#define bit_MCE         0x00000080
+#define bit_CX8         0x00000100
+#define bit_APIC        0x00000200
+#define bit_SEP         0x00000800
+#define bit_MTRR        0x00001000
+#define bit_PGE         0x00002000
+#define bit_MCA         0x00004000
+#define bit_CMOV        0x00008000
+#define bit_PAT         0x00010000
+#define bit_PSE36       0x00020000
+#define bit_PSN         0x00040000
+#define bit_CLFSH       0x00080000
+#define bit_DS          0x00200000
+#define bit_ACPI        0x00400000
+#define bit_MMX         0x00800000
+#define bit_FXSR        0x01000000
+#define bit_FXSAVE      bit_FXSR    /* for gcc compat */
+#define bit_SSE         0x02000000
+#define bit_SSE2        0x04000000
+#define bit_SS          0x08000000
+#define bit_HTT         0x10000000
+#define bit_TM          0x20000000
+#define bit_PBE         0x80000000
+
+/* Features in %ebx for level 7 sub-leaf 0 */
+#define bit_FSGSBASE    0x00000001
+#define bit_SMEP        0x00000080
+#define bit_ENH_MOVSB   0x00000200
+
+/* PIC on i386 uses %ebx, so preserve it. */
+#if __i386__
+#define __cpuid(__level, __eax, __ebx, __ecx, __edx) \
+    __asm("  pushl  %%ebx\n" \
+          "  cpuid\n" \
+          "  mov    %%ebx,%1\n" \
+          "  popl   %%ebx" \
+        : "=a"(__eax), "=r" (__ebx), "=c"(__ecx), "=d"(__edx) \
+        : "0"(__level))
+
+#define __cpuid_count(__level, __count, __eax, __ebx, __ecx, __edx) \
+    __asm("  pushl  %%ebx\n" \
+          "  cpuid\n" \
+          "  mov    %%ebx,%1\n" \
+          "  popl   %%ebx" \
+        : "=a"(__eax), "=r" (__ebx), "=c"(__ecx), "=d"(__edx) \
+        : "0"(__level), "2"(__count))
+#else
+#define __cpuid(__level, __eax, __ebx, __ecx, __edx) \
+    __asm("cpuid" : "=a"(__eax), "=b" (__ebx), "=c"(__ecx), "=d"(__edx) \
+                  : "0"(__level))
+
+#define __cpuid_count(__level, __count, __eax, __ebx, __ecx, __edx) \
+    __asm("cpuid" : "=a"(__eax), "=b" (__ebx), "=c"(__ecx), "=d"(__edx) \
+                  : "0"(__level), "2"(__count))
+#endif
+
+static __inline int __get_cpuid (unsigned int __level, unsigned int *__eax,
+                                 unsigned int *__ebx, unsigned int *__ecx,
+                                 unsigned int *__edx) {
+    __cpuid(__level, *__eax, *__ebx, *__ecx, *__edx);
+    return 1;
+}
+
+static __inline int __get_cpuid_max (unsigned int __level, unsigned int *__sig)
+{
+    unsigned int __eax, __ebx, __ecx, __edx;
+#if __i386__
+    int __cpuid_supported;
+
+    __asm("  pushfl\n"
+          "  popl   %%eax\n"
+          "  movl   %%eax,%%ecx\n"
+          "  xorl   $0x00200000,%%eax\n"
+          "  pushl  %%eax\n"
+          "  popfl\n"
+          "  pushfl\n"
+          "  popl   %%eax\n"
+          "  movl   $0,%0\n"
+          "  cmpl   %%eax,%%ecx\n"
+          "  je     1f\n"
+          "  movl   $1,%0\n"
+          "1:"
+        : "=r" (__cpuid_supported) : : "eax", "ecx");
+    if (!__cpuid_supported)
+        return 0;
+#endif
+
+    __cpuid(__level, __eax, __ebx, __ecx, __edx);
+    if (__sig)
+        *__sig = __ebx;
+    return __eax;
+}
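Editorial aside: a minimal sketch combining the __get_cpuid helper above with the bit_AVX feature mask from the same header; the function name is hypothetical.

#include <cpuid.h>

/* Illustrative only: query CPUID leaf 1 and test the AVX bit in %ecx.
 * __get_cpuid (defined above) returns nonzero when CPUID is available. */
static int cpu_has_avx(void)
{
  unsigned int eax, ebx, ecx, edx;
  if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
    return 0;
  return (ecx & bit_AVX) != 0;
}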
diff --git a/21.1.2/clang-include/emmintrin.h b/21.1.2/clang-include/emmintrin.h
new file mode 100644
index 0000000..b3f8569
--- /dev/null
+++ b/21.1.2/clang-include/emmintrin.h
@@ -0,0 +1,1451 @@
+/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __EMMINTRIN_H
+#define __EMMINTRIN_H
+
+#ifndef __SSE2__
+#error "SSE2 instruction set not enabled"
+#else
+
+#include <xmmintrin.h>
+
+typedef double __m128d __attribute__((__vector_size__(16)));
+typedef long long __m128i __attribute__((__vector_size__(16)));
+
+/* Type defines.  */
+typedef double __v2df __attribute__ ((__vector_size__ (16)));
+typedef long long __v2di __attribute__ ((__vector_size__ (16)));
+typedef short __v8hi __attribute__((__vector_size__(16)));
+typedef char __v16qi __attribute__((__vector_size__(16)));
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_add_sd(__m128d __a, __m128d __b)
+{
+  __a[0] += __b[0];
+  return __a;
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_add_pd(__m128d __a, __m128d __b)
+{
+  return __a + __b;
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_sub_sd(__m128d __a, __m128d __b)
+{
+  __a[0] -= __b[0];
+  return __a;
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_sub_pd(__m128d __a, __m128d __b)
+{
+  return __a - __b;
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_mul_sd(__m128d __a, __m128d __b)
+{
+  __a[0] *= __b[0];
+  return __a;
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_mul_pd(__m128d __a, __m128d __b)
+{
+  return __a * __b;
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_div_sd(__m128d __a, __m128d __b)
+{
+  __a[0] /= __b[0];
+  return __a;
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_div_pd(__m128d __a, __m128d __b)
+{
+  return __a / __b;
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_sqrt_sd(__m128d __a, __m128d __b)
+{
+  __m128d __c = __builtin_ia32_sqrtsd(__b);
+  return (__m128d) { __c[0], __a[1] };
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_sqrt_pd(__m128d __a)
+{
+  return __builtin_ia32_sqrtpd(__a);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_min_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_minsd(__a, __b);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_min_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_minpd(__a, __b);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_max_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_maxsd(__a, __b);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_max_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_maxpd(__a, __b);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_and_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)((__v4si)__a & (__v4si)__b);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_andnot_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)(~(__v4si)__a & (__v4si)__b);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_or_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)((__v4si)__a | (__v4si)__b);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_xor_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)((__v4si)__a ^ (__v4si)__b);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpeq_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmppd(__a, __b, 0);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmplt_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmppd(__a, __b, 1);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmple_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmppd(__a, __b, 2);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpgt_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmppd(__b, __a, 1);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpge_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmppd(__b, __a, 2);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpord_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmppd(__a, __b, 7);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpunord_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmppd(__a, __b, 3);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpneq_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmppd(__a, __b, 4);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpnlt_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmppd(__a, __b, 5);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpnle_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmppd(__a, __b, 6);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpngt_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmppd(__b, __a, 5);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpnge_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmppd(__b, __a, 6);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpeq_sd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpsd(__a, __b, 0);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmplt_sd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpsd(__a, __b, 1);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmple_sd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpsd(__a, __b, 2);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpgt_sd(__m128d __a, __m128d __b)
+{
+  __m128d __c = __builtin_ia32_cmpsd(__b, __a, 1);
+  return (__m128d) { __c[0], __a[1] };
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpge_sd(__m128d __a, __m128d __b)
+{
+  __m128d __c = __builtin_ia32_cmpsd(__b, __a, 2);
+  return (__m128d) { __c[0], __a[1] };
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpord_sd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpsd(__a, __b, 7);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpunord_sd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpsd(__a, __b, 3);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpneq_sd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpsd(__a, __b, 4);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpnlt_sd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpsd(__a, __b, 5);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpnle_sd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpsd(__a, __b, 6);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpngt_sd(__m128d __a, __m128d __b)
+{
+  __m128d __c = __builtin_ia32_cmpsd(__b, __a, 5);
+  return (__m128d) { __c[0], __a[1] };
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpnge_sd(__m128d __a, __m128d __b)
+{
+  __m128d __c = __builtin_ia32_cmpsd(__b, __a, 6);
+  return (__m128d) { __c[0], __a[1] };
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_comieq_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_comisdeq(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_comilt_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_comisdlt(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_comile_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_comisdle(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_comigt_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_comisdgt(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_comige_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_comisdge(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_comineq_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_comisdneq(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_ucomieq_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_ucomisdeq(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_ucomilt_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_ucomisdlt(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_ucomile_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_ucomisdle(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_ucomigt_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_ucomisdgt(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_ucomige_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_ucomisdge(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_ucomineq_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_ucomisdneq(__a, __b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtpd_ps(__m128d __a)
+{
+  return __builtin_ia32_cvtpd2ps(__a);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cvtps_pd(__m128 __a)
+{
+  return __builtin_ia32_cvtps2pd(__a);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cvtepi32_pd(__m128i __a)
+{
+  return __builtin_ia32_cvtdq2pd((__v4si)__a);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cvtpd_epi32(__m128d __a)
+{
+  return __builtin_ia32_cvtpd2dq(__a);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_cvtsd_si32(__m128d __a)
+{
+  return __builtin_ia32_cvtsd2si(__a);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtsd_ss(__m128 __a, __m128d __b)
+{
+  __a[0] = __b[0];
+  return __a;
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cvtsi32_sd(__m128d __a, int __b)
+{
+  __a[0] = __b;
+  return __a;
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cvtss_sd(__m128d __a, __m128 __b)
+{
+  __a[0] = __b[0];
+  return __a;
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cvttpd_epi32(__m128d __a)
+{
+  return (__m128i)__builtin_ia32_cvttpd2dq(__a);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_cvttsd_si32(__m128d __a)
+{
+  return __a[0];
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtpd_pi32(__m128d __a)
+{
+  return (__m64)__builtin_ia32_cvtpd2pi(__a);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_cvttpd_pi32(__m128d __a)
+{
+  return (__m64)__builtin_ia32_cvttpd2pi(__a);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cvtpi32_pd(__m64 __a)
+{
+  return __builtin_ia32_cvtpi2pd((__v2si)__a);
+}
+
+static __inline__ double __attribute__((__always_inline__, __nodebug__))
+_mm_cvtsd_f64(__m128d __a)
+{
+  return __a[0];
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_load_pd(double const *__dp)
+{
+  return *(__m128d*)__dp;
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_load1_pd(double const *__dp)
+{
+  struct __mm_load1_pd_struct {
+    double __u;
+  } __attribute__((__packed__, __may_alias__));
+  double __u = ((struct __mm_load1_pd_struct*)__dp)->__u;
+  return (__m128d){ __u, __u };
+}
+
+#define        _mm_load_pd1(dp)        _mm_load1_pd(dp)
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_loadr_pd(double const *__dp)
+{
+  __m128d __u = *(__m128d*)__dp;
+  return __builtin_shufflevector(__u, __u, 1, 0);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_loadu_pd(double const *__dp)
+{
+  struct __loadu_pd {
+    __m128d __v;
+  } __attribute__((packed, may_alias));
+  return ((struct __loadu_pd*)__dp)->__v;
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_load_sd(double const *__dp)
+{
+  struct __mm_load_sd_struct {
+    double __u;
+  } __attribute__((__packed__, __may_alias__));
+  double __u = ((struct __mm_load_sd_struct*)__dp)->__u;
+  return (__m128d){ __u, 0 };
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_loadh_pd(__m128d __a, double const *__dp)
+{
+  struct __mm_loadh_pd_struct {
+    double __u;
+  } __attribute__((__packed__, __may_alias__));
+  double __u = ((struct __mm_loadh_pd_struct*)__dp)->__u;
+  return (__m128d){ __a[0], __u };
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_loadl_pd(__m128d __a, double const *__dp)
+{
+  struct __mm_loadl_pd_struct {
+    double __u;
+  } __attribute__((__packed__, __may_alias__));
+  double __u = ((struct __mm_loadl_pd_struct*)__dp)->__u;
+  return (__m128d){ __u, __a[1] };
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_set_sd(double __w)
+{
+  return (__m128d){ __w, 0 };
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_set1_pd(double __w)
+{
+  return (__m128d){ __w, __w };
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_set_pd(double __w, double __x)
+{
+  return (__m128d){ __x, __w };
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_setr_pd(double __w, double __x)
+{
+  return (__m128d){ __w, __x };
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_setzero_pd(void)
+{
+  return (__m128d){ 0, 0 };
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_move_sd(__m128d __a, __m128d __b)
+{
+  return (__m128d){ __b[0], __a[1] };
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_store_sd(double *__dp, __m128d __a)
+{
+  struct __mm_store_sd_struct {
+    double __u;
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __mm_store_sd_struct*)__dp)->__u = __a[0];
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_store1_pd(double *__dp, __m128d __a)
+{
+  struct __mm_store1_pd_struct {
+    double __u[2];
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __mm_store1_pd_struct*)__dp)->__u[0] = __a[0];
+  ((struct __mm_store1_pd_struct*)__dp)->__u[1] = __a[0];
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_store_pd(double *__dp, __m128d __a)
+{
+  *(__m128d *)__dp = __a;
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_storeu_pd(double *__dp, __m128d __a)
+{
+  __builtin_ia32_storeupd(__dp, __a);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_storer_pd(double *__dp, __m128d __a)
+{
+  __a = __builtin_shufflevector(__a, __a, 1, 0);
+  *(__m128d *)__dp = __a;
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_storeh_pd(double *__dp, __m128d __a)
+{
+  struct __mm_storeh_pd_struct {
+    double __u;
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1];
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_storel_pd(double *__dp, __m128d __a)
+{
+  struct __mm_storeh_pd_struct {
+    double __u;
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0];
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_add_epi8(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v16qi)__a + (__v16qi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_add_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v8hi)__a + (__v8hi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_add_epi32(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v4si)__a + (__v4si)__b);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_add_si64(__m64 __a, __m64 __b)
+{
+  return __a + __b;
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_add_epi64(__m128i __a, __m128i __b)
+{
+  return __a + __b;
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_adds_epi8(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_adds_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_adds_epu8(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_adds_epu16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_avg_epu8(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_avg_epu16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_madd_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_max_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_max_epu8(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_min_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_min_epu8(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_mulhi_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_mulhi_epu16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_mullo_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v8hi)__a * (__v8hi)__b);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_mul_su32(__m64 __a, __m64 __b)
+{
+  return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_mul_epu32(__m128i __a, __m128i __b)
+{
+  return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sad_epu8(__m128i __a, __m128i __b)
+{
+  return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sub_epi8(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v16qi)__a - (__v16qi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sub_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v8hi)__a - (__v8hi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sub_epi32(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v4si)__a - (__v4si)__b);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_sub_si64(__m64 __a, __m64 __b)
+{
+  return __a - __b;
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sub_epi64(__m128i __a, __m128i __b)
+{
+  return __a - __b;
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_subs_epi8(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_psubsb128((__v16qi)__a, (__v16qi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_subs_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_psubsw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_subs_epu8(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_psubusb128((__v16qi)__a, (__v16qi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_subs_epu16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_psubusw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_and_si128(__m128i __a, __m128i __b)
+{
+  return __a & __b;
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_andnot_si128(__m128i __a, __m128i __b)
+{
+  return ~__a & __b;
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_or_si128(__m128i __a, __m128i __b)
+{
+  return __a | __b;
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_xor_si128(__m128i __a, __m128i __b)
+{
+  return __a ^ __b;
+}
+
+#define _mm_slli_si128(a, count) __extension__ ({ \
+  _Pragma("clang diagnostic push") _Pragma("clang diagnostic ignored \"-Wshadow\""); \
+  __m128i __a = (a); \
+   _Pragma("clang diagnostic pop"); \
+  (__m128i)__builtin_ia32_pslldqi128(__a, (count)*8); })
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_slli_epi16(__m128i __a, int __count)
+{
+  return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sll_epi16(__m128i __a, __m128i __count)
+{
+  return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_slli_epi32(__m128i __a, int __count)
+{
+  return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sll_epi32(__m128i __a, __m128i __count)
+{
+  return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_slli_epi64(__m128i __a, int __count)
+{
+  return __builtin_ia32_psllqi128(__a, __count);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sll_epi64(__m128i __a, __m128i __count)
+{
+  return __builtin_ia32_psllq128(__a, __count);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_srai_epi16(__m128i __a, int __count)
+{
+  return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sra_epi16(__m128i __a, __m128i __count)
+{
+  return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_srai_epi32(__m128i __a, int __count)
+{
+  return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sra_epi32(__m128i __a, __m128i __count)
+{
+  return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
+}
+
+
+#define _mm_srli_si128(a, count) __extension__ ({ \
+  _Pragma("clang diagnostic push") _Pragma("clang diagnostic ignored \"-Wshadow\""); \
+  __m128i __a = (a); \
+  _Pragma("clang diagnostic pop"); \
+  (__m128i)__builtin_ia32_psrldqi128(__a, (count)*8); })
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_srli_epi16(__m128i __a, int __count)
+{
+  return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_srl_epi16(__m128i __a, __m128i __count)
+{
+  return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_srli_epi32(__m128i __a, int __count)
+{
+  return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_srl_epi32(__m128i __a, __m128i __count)
+{
+  return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_srli_epi64(__m128i __a, int __count)
+{
+  return __builtin_ia32_psrlqi128(__a, __count);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_srl_epi64(__m128i __a, __m128i __count)
+{
+  return __builtin_ia32_psrlq128(__a, __count);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cmpeq_epi8(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v16qi)__a == (__v16qi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cmpeq_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v8hi)__a == (__v8hi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cmpeq_epi32(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v4si)__a == (__v4si)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cmpgt_epi8(__m128i __a, __m128i __b)
+{
+  /* This function always performs a signed comparison, but __v16qi is a vector
+     of char, which may be signed or unsigned, so use an explicitly signed type. */
+  typedef signed char __v16qs __attribute__((__vector_size__(16)));
+  return (__m128i)((__v16qs)__a > (__v16qs)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cmpgt_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v8hi)__a > (__v8hi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cmpgt_epi32(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v4si)__a > (__v4si)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cmplt_epi8(__m128i __a, __m128i __b)
+{
+  return _mm_cmpgt_epi8(__b, __a);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cmplt_epi16(__m128i __a, __m128i __b)
+{
+  return _mm_cmpgt_epi16(__b, __a);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cmplt_epi32(__m128i __a, __m128i __b)
+{
+  return _mm_cmpgt_epi32(__b, __a);
+}
+
+#ifdef __x86_64__
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cvtsi64_sd(__m128d __a, long long __b)
+{
+  __a[0] = __b;
+  return __a;
+}
+
+static __inline__ long long __attribute__((__always_inline__, __nodebug__))
+_mm_cvtsd_si64(__m128d __a)
+{
+  return __builtin_ia32_cvtsd2si64(__a);
+}
+
+static __inline__ long long __attribute__((__always_inline__, __nodebug__))
+_mm_cvttsd_si64(__m128d __a)
+{
+  return __a[0];
+}
+#endif
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtepi32_ps(__m128i __a)
+{
+  return __builtin_ia32_cvtdq2ps((__v4si)__a);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cvtps_epi32(__m128 __a)
+{
+  return (__m128i)__builtin_ia32_cvtps2dq(__a);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cvttps_epi32(__m128 __a)
+{
+  return (__m128i)__builtin_ia32_cvttps2dq(__a);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cvtsi32_si128(int __a)
+{
+  return (__m128i)(__v4si){ __a, 0, 0, 0 };
+}
+
+#ifdef __x86_64__
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cvtsi64_si128(long long __a)
+{
+  return (__m128i){ __a, 0 };
+}
+#endif
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_cvtsi128_si32(__m128i __a)
+{
+  __v4si __b = (__v4si)__a;
+  return __b[0];
+}
+
+#ifdef __x86_64__
+static __inline__ long long __attribute__((__always_inline__, __nodebug__))
+_mm_cvtsi128_si64(__m128i __a)
+{
+  return __a[0];
+}
+#endif
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_load_si128(__m128i const *__p)
+{
+  return *__p;
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_loadu_si128(__m128i const *__p)
+{
+  struct __loadu_si128 {
+    __m128i __v;
+  } __attribute__((packed, may_alias));
+  return ((struct __loadu_si128*)__p)->__v;
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_loadl_epi64(__m128i const *__p)
+{
+  struct __mm_loadl_epi64_struct {
+    long long __u;
+  } __attribute__((__packed__, __may_alias__));
+  return (__m128i) { ((struct __mm_loadl_epi64_struct*)__p)->__u, 0};
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_set_epi64x(long long q1, long long q0)
+{
+  return (__m128i){ q0, q1 };
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_set_epi64(__m64 q1, __m64 q0)
+{
+  return (__m128i){ (long long)q0, (long long)q1 };
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_set_epi32(int i3, int i2, int i1, int i0)
+{
+  return (__m128i)(__v4si){ i0, i1, i2, i3};
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_set_epi16(short w7, short w6, short w5, short w4, short w3, short w2, short w1, short w0)
+{
+  return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0)
+{
+  return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_set1_epi64x(long long __q)
+{
+  return (__m128i){ __q, __q };
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_set1_epi64(__m64 __q)
+{
+  return (__m128i){ (long long)__q, (long long)__q };
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_set1_epi32(int __i)
+{
+  return (__m128i)(__v4si){ __i, __i, __i, __i };
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_set1_epi16(short __w)
+{
+  return (__m128i)(__v8hi){ __w, __w, __w, __w, __w, __w, __w, __w };
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_set1_epi8(char __b)
+{
+  return (__m128i)(__v16qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b };
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_setr_epi64(__m64 q0, __m64 q1)
+{
+  return (__m128i){ (long long)q0, (long long)q1 };
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_setr_epi32(int i0, int i1, int i2, int i3)
+{
+  return (__m128i)(__v4si){ i0, i1, i2, i3};
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7)
+{
+  return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_setr_epi8(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15)
+{
+  return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_setzero_si128(void)
+{
+  return (__m128i){ 0LL, 0LL };
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_store_si128(__m128i *__p, __m128i __b)
+{
+  *__p = __b;
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_storeu_si128(__m128i *__p, __m128i __b)
+{
+  __builtin_ia32_storedqu((char *)__p, (__v16qi)__b);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
+{
+  __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_storel_epi64(__m128i *__p, __m128i __a)
+{
+  struct __mm_storel_epi64_struct {
+    long long __u;
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __mm_storel_epi64_struct*)__p)->__u = __a[0];
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_stream_pd(double *__p, __m128d __a)
+{
+  __builtin_ia32_movntpd(__p, __a);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_stream_si128(__m128i *__p, __m128i __a)
+{
+  __builtin_ia32_movntdq(__p, __a);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_stream_si32(int *__p, int __a)
+{
+  __builtin_ia32_movnti(__p, __a);
+}
+
+#ifdef __x86_64__
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_stream_si64(long long *__p, long long __a)
+{
+  __builtin_ia32_movnti64(__p, __a);
+}
+#endif
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_clflush(void const *__p)
+{
+  __builtin_ia32_clflush(__p);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_lfence(void)
+{
+  __builtin_ia32_lfence();
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_mfence(void)
+{
+  __builtin_ia32_mfence();
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_packs_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_packs_epi32(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_packus_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_extract_epi16(__m128i __a, int __imm)
+{
+  __v8hi __b = (__v8hi)__a;
+  return (unsigned short)__b[__imm & 7];
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_insert_epi16(__m128i __a, int __b, int __imm)
+{
+  __v8hi __c = (__v8hi)__a;
+  __c[__imm & 7] = __b;
+  return (__m128i)__c;
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_movemask_epi8(__m128i __a)
+{
+  return __builtin_ia32_pmovmskb128((__v16qi)__a);
+}
+
+#define _mm_shuffle_epi32(a, imm) __extension__ ({ \
+  _Pragma("clang diagnostic push") _Pragma("clang diagnostic ignored \"-Wshadow\""); \
+  __m128i __a = (a); \
+  _Pragma("clang diagnostic pop"); \
+  (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si) _mm_set1_epi32(0), \
+                                   (imm) & 0x3, ((imm) & 0xc) >> 2, \
+                                   ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6); })
+
+#define _mm_shufflelo_epi16(a, imm) __extension__ ({ \
+  _Pragma("clang diagnostic push") _Pragma("clang diagnostic ignored \"-Wshadow\""); \
+  __m128i __a = (a); \
+  _Pragma("clang diagnostic pop"); \
+  (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi) _mm_set1_epi16(0), \
+                                   (imm) & 0x3, ((imm) & 0xc) >> 2, \
+                                   ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \
+                                   4, 5, 6, 7); })
+
+#define _mm_shufflehi_epi16(a, imm) __extension__ ({ \
+  _Pragma("clang diagnostic push") _Pragma("clang diagnostic ignored \"-Wshadow\""); \
+  __m128i __a = (a); \
+  _Pragma("clang diagnostic pop"); \
+  (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi) _mm_set1_epi16(0), \
+                                   0, 1, 2, 3, \
+                                   4 + (((imm) & 0x03) >> 0), \
+                                   4 + (((imm) & 0x0c) >> 2), \
+                                   4 + (((imm) & 0x30) >> 4), \
+                                   4 + (((imm) & 0xc0) >> 6)); })
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_unpackhi_epi8(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_unpackhi_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_unpackhi_epi32(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_unpackhi_epi64(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_shufflevector(__a, __b, 1, 2+1);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_unpacklo_epi8(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_unpacklo_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_unpacklo_epi32(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_unpacklo_epi64(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_shufflevector(__a, __b, 0, 2+0);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_movepi64_pi64(__m128i __a)
+{
+  return (__m64)__a[0];
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_movpi64_epi64(__m64 __a)
+{
+  return (__m128i){ (long long)__a, 0 };
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_move_epi64(__m128i __a)
+{
+  return __builtin_shufflevector(__a, (__m128i){ 0 }, 0, 2);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_unpackhi_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_shufflevector(__a, __b, 1, 2+1);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_unpacklo_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_shufflevector(__a, __b, 0, 2+0);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_movemask_pd(__m128d __a)
+{
+  return __builtin_ia32_movmskpd(__a);
+}
+
+#define _mm_shuffle_pd(a, b, i) __extension__ ({ \
+  _Pragma("clang diagnostic push") _Pragma("clang diagnostic ignored \"-Wshadow\""); \
+  __m128d __a = (a); \
+  __m128d __b = (b); \
+  _Pragma("clang diagnostic pop"); \
+  __builtin_shufflevector(__a, __b, (i) & 1, (((i) & 2) >> 1) + 2); })
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_castpd_ps(__m128d __a)
+{
+  return (__m128)__a;
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_castpd_si128(__m128d __a)
+{
+  return (__m128i)__a;
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_castps_pd(__m128 __a)
+{
+  return (__m128d)__a;
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_castps_si128(__m128 __a)
+{
+  return (__m128i)__a;
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_castsi128_ps(__m128i __a)
+{
+  return (__m128)__a;
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_castsi128_pd(__m128i __a)
+{
+  return (__m128d)__a;
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_pause(void)
+{
+  __asm__ volatile ("pause");
+}
+
+#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
+
+#endif /* __SSE2__ */
+
+#endif /* __EMMINTRIN_H */
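
A minimal usage sketch for the SSE2 helpers above (hypothetical, not part of the
prebuilts; assumes a build with -msse2). It composes the unaligned load, byte
broadcast, byte compare and movemask intrinsics defined in this header:

  #include <emmintrin.h>

  /* Hypothetical helper: index of the first byte equal to c in a 16-byte
   * block, or -1 if it does not occur. */
  static int find_byte16(const unsigned char *p, unsigned char c)
  {
    __m128i block  = _mm_loadu_si128((const __m128i *)p); /* unaligned 16-byte load */
    __m128i needle = _mm_set1_epi8((char)c);              /* broadcast the byte */
    __m128i eq     = _mm_cmpeq_epi8(block, needle);       /* 0xFF in matching lanes */
    int mask = _mm_movemask_epi8(eq);                     /* one bit per byte lane */
    return mask ? __builtin_ctz(mask) : -1;               /* lowest set bit = first match */
  }
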
diff --git a/21.1.2/clang-include/f16cintrin.h b/21.1.2/clang-include/f16cintrin.h
new file mode 100644
index 0000000..f3614c0
--- /dev/null
+++ b/21.1.2/clang-include/f16cintrin.h
@@ -0,0 +1,58 @@
+/*===---- f16cintrin.h - F16C intrinsics -----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
+#error "Never use <f16cintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __F16C__
+# error "F16C instruction is not enabled"
+#endif /* __F16C__ */
+
+#ifndef __F16CINTRIN_H
+#define __F16CINTRIN_H
+
+typedef float __v8sf __attribute__ ((__vector_size__ (32)));
+typedef float __m256 __attribute__ ((__vector_size__ (32)));
+
+#define _mm_cvtps_ph(a, imm) __extension__ ({ \
+  __m128 __a = (a); \
+  (__m128i)__builtin_ia32_vcvtps2ph((__v4sf)__a, (imm)); })
+
+#define _mm256_cvtps_ph(a, imm) __extension__ ({ \
+  __m256 __a = (a); \
+  (__m128i)__builtin_ia32_vcvtps2ph256((__v8sf)__a, (imm)); })
+
+static __inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtph_ps(__m128i __a)
+{
+  return (__m128)__builtin_ia32_vcvtph2ps((__v8hi)__a);
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_cvtph_ps(__m128i __a)
+{
+  return (__m256)__builtin_ia32_vcvtph2ps256((__v8hi)__a);
+}
+
+#endif /* __F16CINTRIN_H */
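
An illustrative round trip of four floats through half precision using the
conversions above (hypothetical; per the header's own guard it is reached via
<x86intrin.h>, and the build must enable F16C, e.g. -mf16c):

  #include <x86intrin.h>

  static __m128 roundtrip_half(__m128 v)
  {
    __m128i h = _mm_cvtps_ph(v, 0);  /* 0 selects round-to-nearest-even */
    return _mm_cvtph_ps(h);          /* widen the four halfs back to float */
  }
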
diff --git a/21.1.2/clang-include/float.h b/21.1.2/clang-include/float.h
new file mode 100644
index 0000000..02ef6bf
--- /dev/null
+++ b/21.1.2/clang-include/float.h
@@ -0,0 +1,124 @@
+/*===---- float.h - Characteristics of floating point types ----------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __FLOAT_H
+#define __FLOAT_H
+
+/* If we're on MinGW, fall back to the system's float.h, which might have
+ * additional definitions provided for Windows.
+ * For more details see http://msdn.microsoft.com/en-us/library/y0ybw9fy.aspx
+ */
+#if (defined(__MINGW32__) || defined(_MSC_VER)) && \
+    __has_include_next(<float.h>)
+#  include_next <float.h>
+
+/* Undefine anything that we'll be redefining below. */
+#  undef FLT_EVAL_METHOD
+#  undef FLT_ROUNDS
+#  undef FLT_RADIX
+#  undef FLT_MANT_DIG
+#  undef DBL_MANT_DIG
+#  undef LDBL_MANT_DIG
+#  undef DECIMAL_DIG
+#  undef FLT_DIG
+#  undef DBL_DIG
+#  undef LDBL_DIG
+#  undef FLT_MIN_EXP
+#  undef DBL_MIN_EXP
+#  undef LDBL_MIN_EXP
+#  undef FLT_MIN_10_EXP
+#  undef DBL_MIN_10_EXP
+#  undef LDBL_MIN_10_EXP
+#  undef FLT_MAX_EXP
+#  undef DBL_MAX_EXP
+#  undef LDBL_MAX_EXP
+#  undef FLT_MAX_10_EXP
+#  undef DBL_MAX_10_EXP
+#  undef LDBL_MAX_10_EXP
+#  undef FLT_MAX
+#  undef DBL_MAX
+#  undef LDBL_MAX
+#  undef FLT_EPSILON
+#  undef DBL_EPSILON
+#  undef LDBL_EPSILON
+#  undef FLT_MIN
+#  undef DBL_MIN
+#  undef LDBL_MIN
+#  if __STDC_VERSION__ >= 201112L || !defined(__STRICT_ANSI__)
+#    undef FLT_TRUE_MIN
+#    undef DBL_TRUE_MIN
+#    undef LDBL_TRUE_MIN
+#  endif
+#endif
+
+/* Characteristics of floating point types, C99 5.2.4.2.2 */
+
+#define FLT_EVAL_METHOD __FLT_EVAL_METHOD__
+#define FLT_ROUNDS (__builtin_flt_rounds())
+#define FLT_RADIX __FLT_RADIX__
+
+#define FLT_MANT_DIG __FLT_MANT_DIG__
+#define DBL_MANT_DIG __DBL_MANT_DIG__
+#define LDBL_MANT_DIG __LDBL_MANT_DIG__
+
+#define DECIMAL_DIG __DECIMAL_DIG__
+
+#define FLT_DIG __FLT_DIG__
+#define DBL_DIG __DBL_DIG__
+#define LDBL_DIG __LDBL_DIG__
+
+#define FLT_MIN_EXP __FLT_MIN_EXP__
+#define DBL_MIN_EXP __DBL_MIN_EXP__
+#define LDBL_MIN_EXP __LDBL_MIN_EXP__
+
+#define FLT_MIN_10_EXP __FLT_MIN_10_EXP__
+#define DBL_MIN_10_EXP __DBL_MIN_10_EXP__
+#define LDBL_MIN_10_EXP __LDBL_MIN_10_EXP__
+
+#define FLT_MAX_EXP __FLT_MAX_EXP__
+#define DBL_MAX_EXP __DBL_MAX_EXP__
+#define LDBL_MAX_EXP __LDBL_MAX_EXP__
+
+#define FLT_MAX_10_EXP __FLT_MAX_10_EXP__
+#define DBL_MAX_10_EXP __DBL_MAX_10_EXP__
+#define LDBL_MAX_10_EXP __LDBL_MAX_10_EXP__
+
+#define FLT_MAX __FLT_MAX__
+#define DBL_MAX __DBL_MAX__
+#define LDBL_MAX __LDBL_MAX__
+
+#define FLT_EPSILON __FLT_EPSILON__
+#define DBL_EPSILON __DBL_EPSILON__
+#define LDBL_EPSILON __LDBL_EPSILON__
+
+#define FLT_MIN __FLT_MIN__
+#define DBL_MIN __DBL_MIN__
+#define LDBL_MIN __LDBL_MIN__
+
+#if __STDC_VERSION__ >= 201112L || !defined(__STRICT_ANSI__)
+#  define FLT_TRUE_MIN __FLT_DENORM_MIN__
+#  define DBL_TRUE_MIN __DBL_DENORM_MIN__
+#  define LDBL_TRUE_MIN __LDBL_DENORM_MIN__
+#endif
+
+#endif /* __FLOAT_H */
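
Illustrative use of one of the constants above (a sketch only; the tolerance
factor is an arbitrary choice, not something this header prescribes):

  #include <float.h>
  #include <math.h>

  /* Relative comparison built on FLT_EPSILON. */
  static int nearly_equal(float a, float b)
  {
    float diff  = fabsf(a - b);
    float scale = fmaxf(fabsf(a), fabsf(b));
    return diff <= 4.0f * FLT_EPSILON * scale;
  }
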
diff --git a/21.1.2/clang-include/fma4intrin.h b/21.1.2/clang-include/fma4intrin.h
new file mode 100644
index 0000000..c30920d
--- /dev/null
+++ b/21.1.2/clang-include/fma4intrin.h
@@ -0,0 +1,231 @@
+/*===---- fma4intrin.h - FMA4 intrinsics -----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __X86INTRIN_H
+#error "Never use <fma4intrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __FMA4INTRIN_H
+#define __FMA4INTRIN_H
+
+#ifndef __FMA4__
+# error "FMA4 instruction set is not enabled"
+#else
+
+#include <pmmintrin.h>
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_macc_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddps(__A, __B, __C);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_macc_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddpd(__A, __B, __C);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_macc_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddss(__A, __B, __C);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_macc_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddsd(__A, __B, __C);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_msub_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmsubps(__A, __B, __C);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_msub_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmsubpd(__A, __B, __C);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_msub_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmsubss(__A, __B, __C);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_msub_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmsubsd(__A, __B, __C);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_nmacc_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfnmaddps(__A, __B, __C);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_nmacc_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfnmaddpd(__A, __B, __C);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_nmacc_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfnmaddss(__A, __B, __C);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_nmacc_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfnmaddsd(__A, __B, __C);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_nmsub_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfnmsubps(__A, __B, __C);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_nmsub_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfnmsubpd(__A, __B, __C);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_nmsub_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfnmsubss(__A, __B, __C);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_nmsub_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfnmsubsd(__A, __B, __C);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_maddsub_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddsubps(__A, __B, __C);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_maddsub_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddsubpd(__A, __B, __C);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_msubadd_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmsubaddps(__A, __B, __C);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_msubadd_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmsubaddpd(__A, __B, __C);
+}
+
+static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_macc_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmaddps256(__A, __B, __C);
+}
+
+static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_macc_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmaddpd256(__A, __B, __C);
+}
+
+static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_msub_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmsubps256(__A, __B, __C);
+}
+
+static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_msub_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmsubpd256(__A, __B, __C);
+}
+
+static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_nmacc_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfnmaddps256(__A, __B, __C);
+}
+
+static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_nmacc_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfnmaddpd256(__A, __B, __C);
+}
+
+static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_nmsub_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfnmsubps256(__A, __B, __C);
+}
+
+static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_nmsub_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfnmsubpd256(__A, __B, __C);
+}
+
+static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_maddsub_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmaddsubps256(__A, __B, __C);
+}
+
+static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_maddsub_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmaddsubpd256(__A, __B, __C);
+}
+
+static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_msubadd_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmsubaddps256(__A, __B, __C);
+}
+
+static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_msubadd_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmsubaddpd256(__A, __B, __C);
+}
+
+#endif /* __FMA4__ */
+
+#endif /* __FMA4INTRIN_H */
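
A hypothetical axpy step in the FMA4 spelling defined above (reached through
<x86intrin.h> per the header's guard; requires -mfma4):

  #include <x86intrin.h>

  static __m128 axpy4(__m128 a, __m128 x, __m128 y)
  {
    return _mm_macc_ps(a, x, y);   /* a*x + y with a single rounding */
  }
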
diff --git a/21.1.2/clang-include/fmaintrin.h b/21.1.2/clang-include/fmaintrin.h
new file mode 100644
index 0000000..6bfd5a8
--- /dev/null
+++ b/21.1.2/clang-include/fmaintrin.h
@@ -0,0 +1,229 @@
+/*===---- fmaintrin.h - FMA intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <fmaintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __FMAINTRIN_H
+#define __FMAINTRIN_H
+
+#ifndef __FMA__
+# error "FMA instruction set is not enabled"
+#else
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddps(__A, __B, __C);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddpd(__A, __B, __C);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddss(__A, __B, __C);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddsd(__A, __B, __C);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmsubps(__A, __B, __C);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmsubpd(__A, __B, __C);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmsubss(__A, __B, __C);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmsubsd(__A, __B, __C);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfnmaddps(__A, __B, __C);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfnmaddpd(__A, __B, __C);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfnmaddss(__A, __B, __C);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfnmaddsd(__A, __B, __C);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfnmsubps(__A, __B, __C);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfnmsubpd(__A, __B, __C);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfnmsubss(__A, __B, __C);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfnmsubsd(__A, __B, __C);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddsubps(__A, __B, __C);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddsubpd(__A, __B, __C);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmsubaddps(__A, __B, __C);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmsubaddpd(__A, __B, __C);
+}
+
+static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmaddps256(__A, __B, __C);
+}
+
+static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmaddpd256(__A, __B, __C);
+}
+
+static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmsubps256(__A, __B, __C);
+}
+
+static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmsubpd256(__A, __B, __C);
+}
+
+static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfnmaddps256(__A, __B, __C);
+}
+
+static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfnmaddpd256(__A, __B, __C);
+}
+
+static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfnmsubps256(__A, __B, __C);
+}
+
+static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfnmsubpd256(__A, __B, __C);
+}
+
+static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmaddsubps256(__A, __B, __C);
+}
+
+static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmaddsubpd256(__A, __B, __C);
+}
+
+static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmsubaddps256(__A, __B, __C);
+}
+
+static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmsubaddpd256(__A, __B, __C);
+}
+
+#endif /* __FMA__ */
+
+#endif /* __FMAINTRIN_H */
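
The same operation in the FMA3 spelling from this header, here on eight lanes
(hypothetical sketch; requires FMA and AVX enabled, e.g. -mfma). Note that in
this header version both the FMA and FMA4 intrinsics lower to the same
__builtin_ia32_vfmadd* builtins:

  #include <immintrin.h>

  static __m256 fmadd8(__m256 a, __m256 b, __m256 c)
  {
    return _mm256_fmadd_ps(a, b, c);   /* a*b + c across eight float lanes */
  }
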
diff --git a/21.1.2/clang-include/ia32intrin.h b/21.1.2/clang-include/ia32intrin.h
new file mode 100644
index 0000000..5adf3f1
--- /dev/null
+++ b/21.1.2/clang-include/ia32intrin.h
@@ -0,0 +1,101 @@
+/* ===-------- ia32intrin.h ---------------------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __X86INTRIN_H
+#error "Never use <ia32intrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __IA32INTRIN_H
+#define __IA32INTRIN_H
+
+#ifdef __x86_64__
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+__readeflags(void)
+{
+  unsigned long long __res = 0;
+  __asm__ __volatile__ ("pushf\n\t"
+                        "popq %0\n"
+                        :"=r"(__res)
+                        :
+                        :
+                       );
+  return __res;
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+__writeeflags(unsigned long long __f)
+{
+  __asm__ __volatile__ ("pushq %0\n\t"
+                        "popf\n"
+                        :
+                        :"r"(__f)
+                        :"flags"
+                       );
+}
+
+#else /* !__x86_64__ */
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+__readeflags(void)
+{
+  unsigned int __res = 0;
+  __asm__ __volatile__ ("pushf\n\t"
+                        "popl %0\n"
+                        :"=r"(__res)
+                        :
+                        :
+                       );
+  return __res;
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+__writeeflags(unsigned int __f)
+{
+  __asm__ __volatile__ ("pushl %0\n\t"
+                        "popf\n"
+                        :
+                        :"r"(__f)
+                        :"flags"
+                       );
+}
+#endif /* !__x86_64__ */
+
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+__rdpmc(int __A) {
+  return __builtin_ia32_rdpmc(__A);
+}
+
+/* __rdtsc */
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+__rdtsc(void) {
+  return __builtin_ia32_rdtsc();
+}
+
+/* __rdtscp */
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+__rdtscp(unsigned int *__A) {
+  return __builtin_ia32_rdtscp(__A);
+}
+
+#define _rdtsc() __rdtsc()
+
+#endif /* __IA32INTRIN_H */
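
Rough cycle counting with __rdtsc from above (illustrative only; it assumes an
invariant TSC, and the work callback is a placeholder, not a real symbol):

  #include <x86intrin.h>
  #include <stdio.h>

  static void time_it(void (*do_work)(void))
  {
    unsigned long long t0 = __rdtsc();
    do_work();
    unsigned long long t1 = __rdtsc();
    printf("~%llu cycles\n", t1 - t0);
  }
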
diff --git a/21.1.2/clang-include/immintrin.h b/21.1.2/clang-include/immintrin.h
new file mode 100644
index 0000000..df4bea8
--- /dev/null
+++ b/21.1.2/clang-include/immintrin.h
@@ -0,0 +1,118 @@
+/*===---- immintrin.h - Intel intrinsics -----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#define __IMMINTRIN_H
+
+#ifdef __MMX__
+#include <mmintrin.h>
+#endif
+
+#ifdef __SSE__
+#include <xmmintrin.h>
+#endif
+
+#ifdef __SSE2__
+#include <emmintrin.h>
+#endif
+
+#ifdef __SSE3__
+#include <pmmintrin.h>
+#endif
+
+#ifdef __SSSE3__
+#include <tmmintrin.h>
+#endif
+
+#if defined (__SSE4_2__) || defined (__SSE4_1__)
+#include <smmintrin.h>
+#endif
+
+#if defined (__AES__) || defined (__PCLMUL__)
+#include <wmmintrin.h>
+#endif
+
+#ifdef __AVX__
+#include <avxintrin.h>
+#endif
+
+#ifdef __AVX2__
+#include <avx2intrin.h>
+#endif
+
+#ifdef __BMI__
+#include <bmiintrin.h>
+#endif
+
+#ifdef __BMI2__
+#include <bmi2intrin.h>
+#endif
+
+#ifdef __LZCNT__
+#include <lzcntintrin.h>
+#endif
+
+#ifdef __FMA__
+#include <fmaintrin.h>
+#endif
+
+#ifdef __RDRND__
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_rdrand16_step(unsigned short *__p)
+{
+  return __builtin_ia32_rdrand16_step(__p);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_rdrand32_step(unsigned int *__p)
+{
+  return __builtin_ia32_rdrand32_step(__p);
+}
+
+#ifdef __x86_64__
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_rdrand64_step(unsigned long long *__p)
+{
+  return __builtin_ia32_rdrand64_step(__p);
+}
+#endif
+#endif /* __RDRND__ */
+
+#ifdef __RTM__
+#include <rtmintrin.h>
+#endif
+
+/* FIXME: check __HLE__ as well when HLE is supported. */
+#if defined (__RTM__)
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_xtest(void)
+{
+  return __builtin_ia32_xtest();
+}
+#endif
+
+#ifdef __SHA__
+#include <shaintrin.h>
+#endif
+
+#endif /* __IMMINTRIN_H */
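
A bounded retry loop around _rdrand32_step from above (hypothetical; requires
-mrdrnd). The intrinsic returns 1 on success and 0 when the hardware has no
entropy available, so callers are expected to retry:

  #include <immintrin.h>

  static int random_u32(unsigned int *out)
  {
    for (int i = 0; i < 10; ++i)   /* bounded retries */
      if (_rdrand32_step(out))
        return 1;
    return 0;
  }
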
diff --git a/21.1.2/clang-include/iso646.h b/21.1.2/clang-include/iso646.h
new file mode 100644
index 0000000..dca13c5
--- /dev/null
+++ b/21.1.2/clang-include/iso646.h
@@ -0,0 +1,43 @@
+/*===---- iso646.h - Standard header for alternate spellings of operators---===
+ *
+ * Copyright (c) 2008 Eli Friedman
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __ISO646_H
+#define __ISO646_H
+
+#ifndef __cplusplus
+#define and    &&
+#define and_eq &=
+#define bitand &
+#define bitor  |
+#define compl  ~
+#define not    !
+#define not_eq !=
+#define or     ||
+#define or_eq  |=
+#define xor    ^
+#define xor_eq ^=
+#endif
+
+#endif /* __ISO646_H */
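
Illustrative only: with the alternate spellings above, the following is the
same expression as (a && !b) | c, token for token:

  #include <iso646.h>

  static int f(int a, int b, int c)
  {
    return (a and not b) bitor c;
  }
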
diff --git a/21.1.2/clang-include/limits.h b/21.1.2/clang-include/limits.h
new file mode 100644
index 0000000..f04187c
--- /dev/null
+++ b/21.1.2/clang-include/limits.h
@@ -0,0 +1,118 @@
+/*===---- limits.h - Standard header for integer sizes --------------------===*\
+ *
+ * Copyright (c) 2009 Chris Lattner
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+\*===----------------------------------------------------------------------===*/
+
+#ifndef __CLANG_LIMITS_H
+#define __CLANG_LIMITS_H
+
+/* The system's limits.h may, in turn, try to #include_next GCC's limits.h.
+   Avert this #include_next madness. */
+#if defined __GNUC__ && !defined _GCC_LIMITS_H_
+#define _GCC_LIMITS_H_
+#endif
+
+/* System headers include a number of constants from POSIX in <limits.h>.
+   Include it if we're hosted. */
+#if __STDC_HOSTED__ && __has_include_next(<limits.h>)
+#include_next <limits.h>
+#endif
+
+/* Many system headers try to "help us out" by defining these.  No really, we
+   know how big each datatype is. */
+#undef  SCHAR_MIN
+#undef  SCHAR_MAX
+#undef  UCHAR_MAX
+#undef  SHRT_MIN
+#undef  SHRT_MAX
+#undef  USHRT_MAX
+#undef  INT_MIN
+#undef  INT_MAX
+#undef  UINT_MAX
+#undef  LONG_MIN
+#undef  LONG_MAX
+#undef  ULONG_MAX
+
+#undef  CHAR_BIT
+#undef  CHAR_MIN
+#undef  CHAR_MAX
+
+/* C90/99 5.2.4.2.1 */
+#define SCHAR_MAX __SCHAR_MAX__
+#define SHRT_MAX  __SHRT_MAX__
+#define INT_MAX   __INT_MAX__
+#define LONG_MAX  __LONG_MAX__
+
+#define SCHAR_MIN (-__SCHAR_MAX__-1)
+#define SHRT_MIN  (-__SHRT_MAX__ -1)
+#define INT_MIN   (-__INT_MAX__  -1)
+#define LONG_MIN  (-__LONG_MAX__ -1L)
+
+#define UCHAR_MAX (__SCHAR_MAX__*2  +1)
+#define USHRT_MAX (__SHRT_MAX__ *2  +1)
+#define UINT_MAX  (__INT_MAX__  *2U +1U)
+#define ULONG_MAX (__LONG_MAX__ *2UL+1UL)
+
+#ifndef MB_LEN_MAX
+#define MB_LEN_MAX 1
+#endif
+
+#define CHAR_BIT  __CHAR_BIT__
+
+#ifdef __CHAR_UNSIGNED__  /* -funsigned-char */
+#define CHAR_MIN 0
+#define CHAR_MAX UCHAR_MAX
+#else
+#define CHAR_MIN SCHAR_MIN
+#define CHAR_MAX __SCHAR_MAX__
+#endif
+
+/* C99 5.2.4.2.1: Added long long.
+   C++11 18.3.3.2: same contents as the Standard C Library header <limits.h>.
+ */
+#if __STDC_VERSION__ >= 199901L || __cplusplus >= 201103L
+
+#undef  LLONG_MIN
+#undef  LLONG_MAX
+#undef  ULLONG_MAX
+
+#define LLONG_MAX  __LONG_LONG_MAX__
+#define LLONG_MIN  (-__LONG_LONG_MAX__-1LL)
+#define ULLONG_MAX (__LONG_LONG_MAX__*2ULL+1ULL)
+#endif
+
+/* LONG_LONG_MIN/LONG_LONG_MAX/ULONG_LONG_MAX are a GNU extension.  It's too bad
+   that we don't have something like #pragma poison that could be used to
+   deprecate a macro - the code should just use LLONG_MAX and friends.
+ */
+#if defined(__GNU_LIBRARY__) ? defined(__USE_GNU) : !defined(__STRICT_ANSI__)
+
+#undef   LONG_LONG_MIN
+#undef   LONG_LONG_MAX
+#undef   ULONG_LONG_MAX
+
+#define LONG_LONG_MAX  __LONG_LONG_MAX__
+#define LONG_LONG_MIN  (-__LONG_LONG_MAX__-1LL)
+#define ULONG_LONG_MAX (__LONG_LONG_MAX__*2ULL+1ULL)
+#endif
+
+#endif /* __CLANG_LIMITS_H */
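
A worked example of the arithmetic above: with the usual CHAR_BIT == 8,
SCHAR_MAX is 127, so UCHAR_MAX expands to 127*2+1 == 255, and the same doubling
pattern produces USHRT_MAX, UINT_MAX and ULONG_MAX (sketch only):

  #include <limits.h>
  #include <stdio.h>

  int main(void)
  {
    printf("%d %d %u %lu\n", CHAR_BIT, UCHAR_MAX, UINT_MAX, ULONG_MAX);
    return 0;
  }
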
diff --git a/21.1.2/clang-include/lzcntintrin.h b/21.1.2/clang-include/lzcntintrin.h
new file mode 100644
index 0000000..62ab5ca
--- /dev/null
+++ b/21.1.2/clang-include/lzcntintrin.h
@@ -0,0 +1,55 @@
+/*===---- lzcntintrin.h - LZCNT intrinsics ---------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
+#error "Never use <lzcntintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __LZCNT__
+# error "LZCNT instruction is not enabled"
+#endif /* __LZCNT__ */
+
+#ifndef __LZCNTINTRIN_H
+#define __LZCNTINTRIN_H
+
+static __inline__ unsigned short __attribute__((__always_inline__, __nodebug__))
+__lzcnt16(unsigned short __X)
+{
+  return __builtin_clzs(__X);
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+__lzcnt32(unsigned int __X)
+{
+  return __builtin_clz(__X);
+}
+
+#ifdef __x86_64__
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+__lzcnt64(unsigned long long __X)
+{
+  return __builtin_clzll(__X);
+}
+#endif
+
+#endif /* __LZCNTINTRIN_H */
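
An illustrative bit-width helper on top of __lzcnt32 from above (hypothetical;
requires -mlzcnt). The zero case is handled separately because the underlying
__builtin_clz is undefined for 0:

  #include <x86intrin.h>

  static unsigned bit_width(unsigned x)
  {
    return x ? 32u - __lzcnt32(x) : 0u;   /* bits needed to represent x */
  }
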
diff --git a/21.1.2/clang-include/mm3dnow.h b/21.1.2/clang-include/mm3dnow.h
new file mode 100644
index 0000000..5242d99
--- /dev/null
+++ b/21.1.2/clang-include/mm3dnow.h
@@ -0,0 +1,162 @@
+/*===---- mm3dnow.h - 3DNow! intrinsics ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef _MM3DNOW_H_INCLUDED
+#define _MM3DNOW_H_INCLUDED
+
+#include <mmintrin.h>
+#include <prfchwintrin.h>
+
+typedef float __v2sf __attribute__((__vector_size__(8)));
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_m_femms() {
+  __builtin_ia32_femms();
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pavgusb(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pavgusb((__v8qi)__m1, (__v8qi)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pf2id(__m64 __m) {
+  return (__m64)__builtin_ia32_pf2id((__v2sf)__m);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pfacc(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfacc((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pfadd(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfadd((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pfcmpeq(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfcmpeq((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pfcmpge(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfcmpge((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pfcmpgt(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfcmpgt((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pfmax(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfmax((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pfmin(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfmin((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pfmul(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfmul((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pfrcp(__m64 __m) {
+  return (__m64)__builtin_ia32_pfrcp((__v2sf)__m);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pfrcpit1(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfrcpit1((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pfrcpit2(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfrcpit2((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pfrsqrt(__m64 __m) {
+  return (__m64)__builtin_ia32_pfrsqrt((__v2sf)__m);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pfrsqrtit1(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfrsqit1((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pfsub(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfsub((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pfsubr(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfsubr((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pi2fd(__m64 __m) {
+  return (__m64)__builtin_ia32_pi2fd((__v2si)__m);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pmulhrw(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pmulhrw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pf2iw(__m64 __m) {
+  return (__m64)__builtin_ia32_pf2iw((__v2sf)__m);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pfnacc(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfnacc((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pfpnacc(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfpnacc((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pi2fw(__m64 __m) {
+  return (__m64)__builtin_ia32_pi2fw((__v2si)__m);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pswapdsf(__m64 __m) {
+  return (__m64)__builtin_ia32_pswapdsf((__v2sf)__m);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pswapdsi(__m64 __m) {
+  return (__m64)__builtin_ia32_pswapdsi((__v2si)__m);
+}
+
+#endif
diff --git a/21.1.2/clang-include/mm_malloc.h b/21.1.2/clang-include/mm_malloc.h
new file mode 100644
index 0000000..305afd3
--- /dev/null
+++ b/21.1.2/clang-include/mm_malloc.h
@@ -0,0 +1,75 @@
+/*===---- mm_malloc.h - Allocating and Freeing Aligned Memory Blocks -------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __MM_MALLOC_H
+#define __MM_MALLOC_H
+
+#include <stdlib.h>
+
+#ifdef _WIN32
+#include <malloc.h>
+#else
+#ifndef __cplusplus
+extern int posix_memalign(void **__memptr, size_t __alignment, size_t __size);
+#else
+// Some systems (e.g. those with GNU libc) declare posix_memalign with an
+// exception specifier. Via an "egregious workaround" in
+// Sema::CheckEquivalentExceptionSpec, Clang accepts the following as a valid
+// redeclaration of glibc's declaration.
+extern "C" int posix_memalign(void **__memptr, size_t __alignment, size_t __size);
+#endif
+#endif
+
+#if !(defined(_WIN32) && defined(_mm_malloc))
+static __inline__ void *__attribute__((__always_inline__, __nodebug__,
+                                       __malloc__))
+_mm_malloc(size_t __size, size_t __align)
+{
+  if (__align == 1) {
+    return malloc(__size);
+  }
+
+  if (!(__align & (__align - 1)) && __align < sizeof(void *))
+    __align = sizeof(void *);
+
+  void *__mallocedMemory;
+#if defined(__MINGW32__)
+  __mallocedMemory = __mingw_aligned_malloc(__size, __align);
+#elif defined(_WIN32)
+  __mallocedMemory = _aligned_malloc(__size, __align);
+#else
+  if (posix_memalign(&__mallocedMemory, __align, __size))
+    return 0;
+#endif
+
+  return __mallocedMemory;
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_free(void *__p)
+{
+  free(__p);
+}
+#endif
+
+#endif /* __MM_MALLOC_H */
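
Not part of the patch: a small sketch of how the _mm_malloc/_mm_free pair above is meant to be used, assuming a power-of-two (here 64-byte) alignment request.

/* Illustrative only -- not part of the prebuilt headers. */
#include <stdint.h>
#include <stdio.h>
#include <mm_malloc.h>

int main(void) {
    /* Alignment must be a power of two; NULL is returned on failure. */
    float *buf = (float *)_mm_malloc(1024, 64);
    if (!buf)
        return 1;
    printf("aligned to 64: %d\n", (int)((uintptr_t)buf % 64 == 0));
    _mm_free(buf);   /* memory from _mm_malloc must be released with _mm_free */
    return 0;
}
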
diff --git a/21.1.2/clang-include/mmintrin.h b/21.1.2/clang-include/mmintrin.h
new file mode 100644
index 0000000..986870a
--- /dev/null
+++ b/21.1.2/clang-include/mmintrin.h
@@ -0,0 +1,503 @@
+/*===---- mmintrin.h - MMX intrinsics --------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __MMINTRIN_H
+#define __MMINTRIN_H
+
+#ifndef __MMX__
+#error "MMX instruction set not enabled"
+#else
+
+typedef long long __m64 __attribute__((__vector_size__(8)));
+
+typedef int __v2si __attribute__((__vector_size__(8)));
+typedef short __v4hi __attribute__((__vector_size__(8)));
+typedef char __v8qi __attribute__((__vector_size__(8)));
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_empty(void)
+{
+    __builtin_ia32_emms();
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtsi32_si64(int __i)
+{
+    return (__m64)__builtin_ia32_vec_init_v2si(__i, 0);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_cvtsi64_si32(__m64 __m)
+{
+    return __builtin_ia32_vec_ext_v2si((__v2si)__m, 0);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtsi64_m64(long long __i)
+{
+    return (__m64)__i;
+}
+
+static __inline__ long long __attribute__((__always_inline__, __nodebug__))
+_mm_cvtm64_si64(__m64 __m)
+{
+    return (long long)__m;
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_packs_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_packsswb((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_packs_pi32(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_packssdw((__v2si)__m1, (__v2si)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_packs_pu16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_packuswb((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_unpackhi_pi8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_punpckhbw((__v8qi)__m1, (__v8qi)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_punpckhwd((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_unpackhi_pi32(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_punpckhdq((__v2si)__m1, (__v2si)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_punpcklbw((__v8qi)__m1, (__v8qi)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_punpcklwd((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_unpacklo_pi32(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_punpckldq((__v2si)__m1, (__v2si)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_add_pi8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_paddb((__v8qi)__m1, (__v8qi)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_add_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_paddw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_add_pi32(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_paddd((__v2si)__m1, (__v2si)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_adds_pi8(__m64 __m1, __m64 __m2) 
+{
+    return (__m64)__builtin_ia32_paddsb((__v8qi)__m1, (__v8qi)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_adds_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_paddsw((__v4hi)__m1, (__v4hi)__m2);    
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_adds_pu8(__m64 __m1, __m64 __m2) 
+{
+    return (__m64)__builtin_ia32_paddusb((__v8qi)__m1, (__v8qi)__m2);
+}
+ 
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_adds_pu16(__m64 __m1, __m64 __m2) 
+{
+    return (__m64)__builtin_ia32_paddusw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_sub_pi8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_psubb((__v8qi)__m1, (__v8qi)__m2);
+}
+ 
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_sub_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_psubw((__v4hi)__m1, (__v4hi)__m2);
+}
+ 
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_sub_pi32(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_psubd((__v2si)__m1, (__v2si)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_subs_pi8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_psubsb((__v8qi)__m1, (__v8qi)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_subs_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_psubsw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_subs_pu8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_psubusb((__v8qi)__m1, (__v8qi)__m2);
+}
+ 
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_subs_pu16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_psubusw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_madd_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_pmaddwd((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_mulhi_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_pmulhw((__v4hi)__m1, (__v4hi)__m2);
+}
+ 
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_mullo_pi16(__m64 __m1, __m64 __m2) 
+{
+    return (__m64)__builtin_ia32_pmullw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_sll_pi16(__m64 __m, __m64 __count)
+{
+    return (__m64)__builtin_ia32_psllw((__v4hi)__m, __count);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_slli_pi16(__m64 __m, int __count)
+{
+    return (__m64)__builtin_ia32_psllwi((__v4hi)__m, __count);    
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_sll_pi32(__m64 __m, __m64 __count)
+{
+    return (__m64)__builtin_ia32_pslld((__v2si)__m, __count);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_slli_pi32(__m64 __m, int __count)
+{
+    return (__m64)__builtin_ia32_pslldi((__v2si)__m, __count);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_sll_si64(__m64 __m, __m64 __count)
+{
+    return (__m64)__builtin_ia32_psllq(__m, __count);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_slli_si64(__m64 __m, int __count)
+{
+    return (__m64)__builtin_ia32_psllqi(__m, __count);    
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_sra_pi16(__m64 __m, __m64 __count)
+{
+    return (__m64)__builtin_ia32_psraw((__v4hi)__m, __count);    
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_srai_pi16(__m64 __m, int __count)
+{
+    return (__m64)__builtin_ia32_psrawi((__v4hi)__m, __count);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_sra_pi32(__m64 __m, __m64 __count)
+{
+    return (__m64)__builtin_ia32_psrad((__v2si)__m, __count);    
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_srai_pi32(__m64 __m, int __count)
+{
+    return (__m64)__builtin_ia32_psradi((__v2si)__m, __count);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_srl_pi16(__m64 __m, __m64 __count)
+{
+    return (__m64)__builtin_ia32_psrlw((__v4hi)__m, __count);    
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_srli_pi16(__m64 __m, int __count)
+{
+    return (__m64)__builtin_ia32_psrlwi((__v4hi)__m, __count);    
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_srl_pi32(__m64 __m, __m64 __count)
+{
+    return (__m64)__builtin_ia32_psrld((__v2si)__m, __count);       
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_srli_pi32(__m64 __m, int __count)
+{
+    return (__m64)__builtin_ia32_psrldi((__v2si)__m, __count);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_srl_si64(__m64 __m, __m64 __count)
+{
+    return (__m64)__builtin_ia32_psrlq(__m, __count);    
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_srli_si64(__m64 __m, int __count)
+{
+    return (__m64)__builtin_ia32_psrlqi(__m, __count);    
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_and_si64(__m64 __m1, __m64 __m2)
+{
+    return __builtin_ia32_pand(__m1, __m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_andnot_si64(__m64 __m1, __m64 __m2)
+{
+    return __builtin_ia32_pandn(__m1, __m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_or_si64(__m64 __m1, __m64 __m2)
+{
+    return __builtin_ia32_por(__m1, __m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_xor_si64(__m64 __m1, __m64 __m2)
+{
+    return __builtin_ia32_pxor(__m1, __m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_pcmpeqb((__v8qi)__m1, (__v8qi)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_pcmpeqw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_pcmpeqd((__v2si)__m1, (__v2si)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_pcmpgtb((__v8qi)__m1, (__v8qi)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_pcmpgtw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_pcmpgtd((__v2si)__m1, (__v2si)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_setzero_si64(void)
+{
+    return (__m64){ 0LL };
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_set_pi32(int __i1, int __i0)
+{
+    return (__m64)__builtin_ia32_vec_init_v2si(__i0, __i1);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_set_pi16(short __s3, short __s2, short __s1, short __s0)
+{
+    return (__m64)__builtin_ia32_vec_init_v4hi(__s0, __s1, __s2, __s3);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2,
+            char __b1, char __b0)
+{
+    return (__m64)__builtin_ia32_vec_init_v8qi(__b0, __b1, __b2, __b3,
+                                               __b4, __b5, __b6, __b7);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_set1_pi32(int __i)
+{
+    return _mm_set_pi32(__i, __i);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_set1_pi16(short __w)
+{
+    return _mm_set_pi16(__w, __w, __w, __w);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_set1_pi8(char __b)
+{
+    return _mm_set_pi8(__b, __b, __b, __b, __b, __b, __b, __b);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_setr_pi32(int __i0, int __i1)
+{
+    return _mm_set_pi32(__i1, __i0);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_setr_pi16(short __w0, short __w1, short __w2, short __w3)
+{
+    return _mm_set_pi16(__w3, __w2, __w1, __w0);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5,
+             char __b6, char __b7)
+{
+    return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
+}
+
+
+/* Aliases for compatibility. */
+#define _m_empty _mm_empty
+#define _m_from_int _mm_cvtsi32_si64
+#define _m_to_int _mm_cvtsi64_si32
+#define _m_packsswb _mm_packs_pi16
+#define _m_packssdw _mm_packs_pi32
+#define _m_packuswb _mm_packs_pu16
+#define _m_punpckhbw _mm_unpackhi_pi8
+#define _m_punpckhwd _mm_unpackhi_pi16
+#define _m_punpckhdq _mm_unpackhi_pi32
+#define _m_punpcklbw _mm_unpacklo_pi8
+#define _m_punpcklwd _mm_unpacklo_pi16
+#define _m_punpckldq _mm_unpacklo_pi32
+#define _m_paddb _mm_add_pi8
+#define _m_paddw _mm_add_pi16
+#define _m_paddd _mm_add_pi32
+#define _m_paddsb _mm_adds_pi8
+#define _m_paddsw _mm_adds_pi16
+#define _m_paddusb _mm_adds_pu8
+#define _m_paddusw _mm_adds_pu16
+#define _m_psubb _mm_sub_pi8
+#define _m_psubw _mm_sub_pi16
+#define _m_psubd _mm_sub_pi32
+#define _m_psubsb _mm_subs_pi8
+#define _m_psubsw _mm_subs_pi16
+#define _m_psubusb _mm_subs_pu8
+#define _m_psubusw _mm_subs_pu16
+#define _m_pmaddwd _mm_madd_pi16
+#define _m_pmulhw _mm_mulhi_pi16
+#define _m_pmullw _mm_mullo_pi16
+#define _m_psllw _mm_sll_pi16
+#define _m_psllwi _mm_slli_pi16
+#define _m_pslld _mm_sll_pi32
+#define _m_pslldi _mm_slli_pi32
+#define _m_psllq _mm_sll_si64
+#define _m_psllqi _mm_slli_si64
+#define _m_psraw _mm_sra_pi16
+#define _m_psrawi _mm_srai_pi16
+#define _m_psrad _mm_sra_pi32
+#define _m_psradi _mm_srai_pi32
+#define _m_psrlw _mm_srl_pi16
+#define _m_psrlwi _mm_srli_pi16
+#define _m_psrld _mm_srl_pi32
+#define _m_psrldi _mm_srli_pi32
+#define _m_psrlq _mm_srl_si64
+#define _m_psrlqi _mm_srli_si64
+#define _m_pand _mm_and_si64
+#define _m_pandn _mm_andnot_si64
+#define _m_por _mm_or_si64
+#define _m_pxor _mm_xor_si64
+#define _m_pcmpeqb _mm_cmpeq_pi8
+#define _m_pcmpeqw _mm_cmpeq_pi16
+#define _m_pcmpeqd _mm_cmpeq_pi32
+#define _m_pcmpgtb _mm_cmpgt_pi8
+#define _m_pcmpgtw _mm_cmpgt_pi16
+#define _m_pcmpgtd _mm_cmpgt_pi32
+
+#endif /* __MMX__ */
+
+#endif /* __MMINTRIN_H */
+
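
Not part of the patch: a minimal MMX sketch against the intrinsics above, assuming -mmmx; note the _mm_empty() call that clears the MMX state before any later x87 floating-point use.

/* Illustrative only -- not part of the prebuilt headers.
   Build with something like: clang -O2 -mmmx example.c */
#include <stdio.h>
#include <mmintrin.h>

int main(void) {
    __m64 a = _mm_set_pi16(4, 3, 2, 1);   /* arguments are listed high to low */
    __m64 b = _mm_set1_pi16(10);
    __m64 sum = _mm_add_pi16(a, b);       /* elements low to high: 11, 12, 13, 14 */
    /* The low 32 bits hold elements 0 and 1 (11 and 12), i.e. 0x000C000B. */
    printf("low dword = 0x%08X\n", (unsigned)_mm_cvtsi64_si32(sum));
    _mm_empty();                          /* leave MMX state before using x87 FP */
    return 0;
}
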
diff --git a/21.1.2/clang-include/module.modulemap b/21.1.2/clang-include/module.modulemap
new file mode 100644
index 0000000..9f7944d
--- /dev/null
+++ b/21.1.2/clang-include/module.modulemap
@@ -0,0 +1,156 @@
+module _Builtin_intrinsics [system] {
+  explicit module altivec {
+    requires altivec
+    header "altivec.h"
+  }
+
+  explicit module arm {
+    requires arm
+
+    explicit module neon {
+      requires neon
+      header "arm_neon.h"
+      export *
+    }
+  }
+
+  explicit module intel {
+    requires x86
+    export *
+
+    header "immintrin.h"
+    header "x86intrin.h"
+
+    explicit module mm_malloc {
+      header "mm_malloc.h"
+      export * // note: for <stdlib.h> dependency
+    }
+
+    explicit module cpuid {
+      requires x86
+      header "cpuid.h"
+    }
+
+    explicit module mmx {
+      requires mmx
+      header "mmintrin.h"
+    }
+
+    explicit module f16c {
+      requires f16c
+      header "f16cintrin.h"
+    }
+
+    explicit module sse {
+      requires sse
+      export mmx
+      export * // note: for hackish <emmintrin.h> dependency
+      header "xmmintrin.h"
+    }
+
+    explicit module sse2 {
+      requires sse2
+      export sse
+      header "emmintrin.h"
+    }
+
+    explicit module sse3 {
+      requires sse3
+      export sse2
+      header "pmmintrin.h"
+    }
+
+    explicit module ssse3 {
+      requires ssse3
+      export sse3
+      header "tmmintrin.h"
+    }
+
+    explicit module sse4_1 {
+      requires sse41
+      export ssse3
+      header "smmintrin.h"
+    }
+
+    explicit module sse4_2 {
+      requires sse42
+      export sse4_1
+      header "nmmintrin.h"
+    }
+
+    explicit module sse4a {
+      requires sse4a
+      export sse3
+      header "ammintrin.h"
+    }
+
+    explicit module avx {
+      requires avx
+      export sse4_2
+      header "avxintrin.h"
+    }
+
+    explicit module avx2 {
+      requires avx2
+      export avx
+      header "avx2intrin.h"
+    }
+
+    explicit module bmi {
+      requires bmi
+      header "bmiintrin.h"
+    }
+
+    explicit module bmi2 {
+      requires bmi2
+      header "bmi2intrin.h"
+    }
+
+    explicit module fma {
+      requires fma
+      header "fmaintrin.h"
+    }
+
+    explicit module fma4 {
+      requires fma4
+      export sse3
+      header "fma4intrin.h"
+    }
+
+    explicit module lzcnt {
+      requires lzcnt
+      header "lzcntintrin.h"
+    }
+
+    explicit module popcnt {
+      requires popcnt
+      header "popcntintrin.h"
+    }
+
+    explicit module mm3dnow {
+      requires mm3dnow
+      header "mm3dnow.h"
+    }
+
+    explicit module xop {
+      requires xop
+      export fma4
+      header "xopintrin.h"
+    }
+
+    explicit module aes_pclmul {
+      requires aes, pclmul
+      header "wmmintrin.h"
+    }
+
+    explicit module aes {
+      requires aes
+      header "__wmmintrin_aes.h"
+    }
+
+    explicit module pclmul {
+      requires pclmul
+      header "__wmmintrin_pclmul.h"
+    }
+  }
+}
diff --git a/21.1.2/clang-include/nmmintrin.h b/21.1.2/clang-include/nmmintrin.h
new file mode 100644
index 0000000..f12622d
--- /dev/null
+++ b/21.1.2/clang-include/nmmintrin.h
@@ -0,0 +1,35 @@
+/*===---- nmmintrin.h - SSE4 intrinsics ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef _NMMINTRIN_H
+#define _NMMINTRIN_H
+
+#ifndef __SSE4_2__
+#error "SSE4.2 instruction set not enabled"
+#else
+
+/* To match gcc's expectations, the SSE4.2 definitions live in smmintrin.h,
+   so we simply include it here.  */
+#include <smmintrin.h>
+#endif /* __SSE4_2__ */
+#endif /* _NMMINTRIN_H */
diff --git a/21.1.2/clang-include/pmmintrin.h b/21.1.2/clang-include/pmmintrin.h
new file mode 100644
index 0000000..6f1fc32
--- /dev/null
+++ b/21.1.2/clang-include/pmmintrin.h
@@ -0,0 +1,117 @@
+/*===---- pmmintrin.h - SSE3 intrinsics ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+ 
+#ifndef __PMMINTRIN_H
+#define __PMMINTRIN_H
+
+#ifndef __SSE3__
+#error "SSE3 instruction set not enabled"
+#else
+
+#include <emmintrin.h>
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_lddqu_si128(__m128i const *__p)
+{
+  return (__m128i)__builtin_ia32_lddqu((char const *)__p);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_addsub_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_addsubps(__a, __b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_hadd_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_haddps(__a, __b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_hsub_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_hsubps(__a, __b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_movehdup_ps(__m128 __a)
+{
+  return __builtin_shufflevector(__a, __a, 1, 1, 3, 3);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_moveldup_ps(__m128 __a)
+{
+  return __builtin_shufflevector(__a, __a, 0, 0, 2, 2);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_addsub_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_addsubpd(__a, __b);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_hadd_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_haddpd(__a, __b);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_hsub_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_hsubpd(__a, __b);
+}
+
+#define        _mm_loaddup_pd(dp)        _mm_load1_pd(dp)
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_movedup_pd(__m128d __a)
+{
+  return __builtin_shufflevector(__a, __a, 0, 0);
+}
+
+#define _MM_DENORMALS_ZERO_ON   (0x0040)
+#define _MM_DENORMALS_ZERO_OFF  (0x0000)
+
+#define _MM_DENORMALS_ZERO_MASK (0x0040)
+
+#define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)
+#define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_monitor(void const *__p, unsigned __extensions, unsigned __hints)
+{
+  __builtin_ia32_monitor((void *)__p, __extensions, __hints);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_mwait(unsigned __extensions, unsigned __hints)
+{
+  __builtin_ia32_mwait(__extensions, __hints);
+}
+
+#endif /* __SSE3__ */
+
+#endif /* __PMMINTRIN_H */
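
Not part of the patch: a short sketch of the SSE3 horizontal-add semantics defined above, assuming -msse3.

/* Illustrative only -- not part of the prebuilt headers.
   Build with something like: clang -O2 -msse3 example.c */
#include <stdio.h>
#include <pmmintrin.h>

int main(void) {
    __m128 a = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 b = _mm_setr_ps(10.0f, 20.0f, 30.0f, 40.0f);
    /* _mm_hadd_ps sums adjacent pairs: {1+2, 3+4, 10+20, 30+40}. */
    __m128 h = _mm_hadd_ps(a, b);
    float out[4];
    _mm_storeu_ps(out, h);
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);   /* 3 7 30 70 */
    return 0;
}
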
diff --git a/21.1.2/clang-include/popcntintrin.h b/21.1.2/clang-include/popcntintrin.h
new file mode 100644
index 0000000..d439daa
--- /dev/null
+++ b/21.1.2/clang-include/popcntintrin.h
@@ -0,0 +1,45 @@
+/*===---- popcntintrin.h - POPCNT intrinsics -------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __POPCNT__
+#error "POPCNT instruction set not enabled"
+#endif
+
+#ifndef _POPCNTINTRIN_H
+#define _POPCNTINTRIN_H
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_popcnt_u32(unsigned int __A)
+{
+  return __builtin_popcount(__A);
+}
+
+#ifdef __x86_64__
+static __inline__ long long __attribute__((__always_inline__, __nodebug__))
+_mm_popcnt_u64(unsigned long long __A)
+{
+  return __builtin_popcountll(__A);
+}
+#endif /* __x86_64__ */
+
+#endif /* _POPCNTINTRIN_H */
diff --git a/21.1.2/clang-include/prfchwintrin.h b/21.1.2/clang-include/prfchwintrin.h
new file mode 100644
index 0000000..9825bd8
--- /dev/null
+++ b/21.1.2/clang-include/prfchwintrin.h
@@ -0,0 +1,39 @@
+/*===---- prfchwintrin.h - PREFETCHW intrinsic -----------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined(__X86INTRIN_H) && !defined(_MM3DNOW_H_INCLUDED)
+#error "Never use <prfchwintrin.h> directly; include <x86intrin.h> or <mm3dnow.h> instead."
+#endif
+
+#ifndef __PRFCHWINTRIN_H
+#define __PRFCHWINTRIN_H
+
+#if defined(__PRFCHW__) || defined(__3dNOW__)
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_m_prefetchw(void *__P)
+{
+  __builtin_prefetch (__P, 1, 3 /* _MM_HINT_T0 */);
+}
+#endif
+
+#endif /* __PRFCHWINTRIN_H */
diff --git a/21.1.2/clang-include/rdseedintrin.h b/21.1.2/clang-include/rdseedintrin.h
new file mode 100644
index 0000000..0fef1fa
--- /dev/null
+++ b/21.1.2/clang-include/rdseedintrin.h
@@ -0,0 +1,52 @@
+/*===---- rdseedintrin.h - RDSEED intrinsics -------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __X86INTRIN_H
+#error "Never use <rdseedintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __RDSEEDINTRIN_H
+#define __RDSEEDINTRIN_H
+
+#ifdef __RDSEED__
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_rdseed16_step(unsigned short *__p)
+{
+  return __builtin_ia32_rdseed16_step(__p);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_rdseed32_step(unsigned int *__p)
+{
+  return __builtin_ia32_rdseed32_step(__p);
+}
+
+#ifdef __x86_64__
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_rdseed64_step(unsigned long long *__p)
+{
+  return __builtin_ia32_rdseed64_step(__p);
+}
+#endif
+#endif /* __RDSEED__ */
+#endif /* __RDSEEDINTRIN_H */
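
Not part of the patch: a sketch of the retry loop the RDSEED helpers above expect, since _rdseed32_step reports failure when no entropy is available yet; assumes -mrdseed.

/* Illustrative only -- not part of the prebuilt headers.
   Build with something like: clang -O2 -mrdseed example.c */
#include <stdio.h>
#include <x86intrin.h>

int main(void) {
    unsigned int seed;
    /* _rdseed32_step returns 1 on success and 0 if the entropy source was
       not ready; the usual pattern is simply to retry. */
    while (!_rdseed32_step(&seed))
        ;
    printf("seed = 0x%08X\n", seed);
    return 0;
}
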
diff --git a/21.1.2/clang-include/rtmintrin.h b/21.1.2/clang-include/rtmintrin.h
new file mode 100644
index 0000000..26149ca
--- /dev/null
+++ b/21.1.2/clang-include/rtmintrin.h
@@ -0,0 +1,54 @@
+/*===---- rtmintrin.h - RTM intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <rtmintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __RTMINTRIN_H
+#define __RTMINTRIN_H
+
+#define _XBEGIN_STARTED   (~0u)
+#define _XABORT_EXPLICIT  (1 << 0)
+#define _XABORT_RETRY     (1 << 1)
+#define _XABORT_CONFLICT  (1 << 2)
+#define _XABORT_CAPACITY  (1 << 3)
+#define _XABORT_DEBUG     (1 << 4)
+#define _XABORT_NESTED    (1 << 5)
+#define _XABORT_CODE(x)   (((x) >> 24) & 0xFF)
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+_xbegin(void)
+{
+  return __builtin_ia32_xbegin();
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_xend(void)
+{
+  __builtin_ia32_xend();
+}
+
+#define _xabort(imm) __builtin_ia32_xabort((imm))
+
+#endif /* __RTMINTRIN_H */
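
Not part of the patch: a sketch of the usual _xbegin/_xend transaction pattern for the RTM helpers above, assuming -mrtm and a non-transactional fallback path for aborts.

/* Illustrative only -- not part of the prebuilt headers.
   Build with something like: clang -O2 -mrtm example.c */
#include <stdio.h>
#include <immintrin.h>

static int counter;

int main(void) {
    unsigned int status = _xbegin();
    if (status == _XBEGIN_STARTED) {
        counter++;          /* speculative update */
        _xend();            /* commit the transaction */
    } else {
        /* Transaction aborted (status carries the _XABORT_* bits);
           fall back to a non-transactional path, e.g. taking a lock. */
        counter++;
    }
    printf("counter = %d\n", counter);
    return 0;
}
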
diff --git a/21.1.2/clang-include/shaintrin.h b/21.1.2/clang-include/shaintrin.h
new file mode 100644
index 0000000..66ed055
--- /dev/null
+++ b/21.1.2/clang-include/shaintrin.h
@@ -0,0 +1,74 @@
+/*===---- shaintrin.h - SHA intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <shaintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __SHAINTRIN_H
+#define __SHAINTRIN_H
+
+#if !defined (__SHA__)
+#  error "SHA instructions not enabled"
+#endif
+
+#define _mm_sha1rnds4_epu32(V1, V2, M) __extension__ ({ \
+  __builtin_ia32_sha1rnds4((V1), (V2), (M)); })
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sha1nexte_epu32(__m128i __X, __m128i __Y)
+{
+  return __builtin_ia32_sha1nexte(__X, __Y);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sha1msg1_epu32(__m128i __X, __m128i __Y)
+{
+  return __builtin_ia32_sha1msg1(__X, __Y);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sha1msg2_epu32(__m128i __X, __m128i __Y)
+{
+  return __builtin_ia32_sha1msg2(__X, __Y);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sha256rnds2_epu32(__m128i __X, __m128i __Y, __m128i __Z)
+{
+  return __builtin_ia32_sha256rnds2(__X, __Y, __Z);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sha256msg1_epu32(__m128i __X, __m128i __Y)
+{
+  return __builtin_ia32_sha256msg1(__X, __Y);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sha256msg2_epu32(__m128i __X, __m128i __Y)
+{
+  return __builtin_ia32_sha256msg2(__X, __Y);
+}
+
+#endif /* __SHAINTRIN_H */
diff --git a/21.1.2/clang-include/smmintrin.h b/21.1.2/clang-include/smmintrin.h
new file mode 100644
index 0000000..6e35734
--- /dev/null
+++ b/21.1.2/clang-include/smmintrin.h
@@ -0,0 +1,482 @@
+/*===---- smmintrin.h - SSE4 intrinsics ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef _SMMINTRIN_H
+#define _SMMINTRIN_H
+
+#ifndef __SSE4_1__
+#error "SSE4.1 instruction set not enabled"
+#else
+
+#include <tmmintrin.h>
+
+/* SSE4 Rounding macros. */
+#define _MM_FROUND_TO_NEAREST_INT    0x00
+#define _MM_FROUND_TO_NEG_INF        0x01
+#define _MM_FROUND_TO_POS_INF        0x02
+#define _MM_FROUND_TO_ZERO           0x03
+#define _MM_FROUND_CUR_DIRECTION     0x04
+
+#define _MM_FROUND_RAISE_EXC         0x00
+#define _MM_FROUND_NO_EXC            0x08
+
+#define _MM_FROUND_NINT      (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT)
+#define _MM_FROUND_FLOOR     (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF)
+#define _MM_FROUND_CEIL      (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF)
+#define _MM_FROUND_TRUNC     (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO)
+#define _MM_FROUND_RINT      (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION)
+#define _MM_FROUND_NEARBYINT (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_ceil_ps(X)       _mm_round_ps((X), _MM_FROUND_CEIL)
+#define _mm_ceil_pd(X)       _mm_round_pd((X), _MM_FROUND_CEIL)
+#define _mm_ceil_ss(X, Y)    _mm_round_ss((X), (Y), _MM_FROUND_CEIL)
+#define _mm_ceil_sd(X, Y)    _mm_round_sd((X), (Y), _MM_FROUND_CEIL)
+
+#define _mm_floor_ps(X)      _mm_round_ps((X), _MM_FROUND_FLOOR)
+#define _mm_floor_pd(X)      _mm_round_pd((X), _MM_FROUND_FLOOR)
+#define _mm_floor_ss(X, Y)   _mm_round_ss((X), (Y), _MM_FROUND_FLOOR)
+#define _mm_floor_sd(X, Y)   _mm_round_sd((X), (Y), _MM_FROUND_FLOOR)
+
+#define _mm_round_ps(X, M) __extension__ ({ \
+  __m128 __X = (X); \
+  (__m128) __builtin_ia32_roundps((__v4sf)__X, (M)); })
+
+#define _mm_round_ss(X, Y, M) __extension__ ({ \
+  __m128 __X = (X); \
+  __m128 __Y = (Y); \
+  (__m128) __builtin_ia32_roundss((__v4sf)__X, (__v4sf)__Y, (M)); })
+
+#define _mm_round_pd(X, M) __extension__ ({ \
+  __m128d __X = (X); \
+  (__m128d) __builtin_ia32_roundpd((__v2df)__X, (M)); })
+
+#define _mm_round_sd(X, Y, M) __extension__ ({ \
+  __m128d __X = (X); \
+  __m128d __Y = (Y); \
+  (__m128d) __builtin_ia32_roundsd((__v2df)__X, (__v2df)__Y, (M)); })
+
+/* SSE4 Packed Blending Intrinsics.  */
+#define _mm_blend_pd(V1, V2, M) __extension__ ({ \
+  __m128d __V1 = (V1); \
+  __m128d __V2 = (V2); \
+  (__m128d)__builtin_shufflevector((__v2df)__V1, (__v2df)__V2, \
+                                   (((M) & 0x01) ? 2 : 0), \
+                                   (((M) & 0x02) ? 3 : 1)); })
+
+#define _mm_blend_ps(V1, V2, M) __extension__ ({ \
+  __m128 __V1 = (V1); \
+  __m128 __V2 = (V2); \
+  (__m128)__builtin_shufflevector((__v4sf)__V1, (__v4sf)__V2, \
+                                  (((M) & 0x01) ? 4 : 0), \
+                                  (((M) & 0x02) ? 5 : 1), \
+                                  (((M) & 0x04) ? 6 : 2), \
+                                  (((M) & 0x08) ? 7 : 3)); })
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_blendv_pd (__m128d __V1, __m128d __V2, __m128d __M)
+{
+  return (__m128d) __builtin_ia32_blendvpd ((__v2df)__V1, (__v2df)__V2,
+                                            (__v2df)__M);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_blendv_ps (__m128 __V1, __m128 __V2, __m128 __M)
+{
+  return (__m128) __builtin_ia32_blendvps ((__v4sf)__V1, (__v4sf)__V2,
+                                           (__v4sf)__M);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M)
+{
+  return (__m128i) __builtin_ia32_pblendvb128 ((__v16qi)__V1, (__v16qi)__V2,
+                                               (__v16qi)__M);
+}
+
+#define _mm_blend_epi16(V1, V2, M) __extension__ ({ \
+  __m128i __V1 = (V1); \
+  __m128i __V2 = (V2); \
+  (__m128i)__builtin_shufflevector((__v8hi)__V1, (__v8hi)__V2, \
+                                   (((M) & 0x01) ?  8 : 0), \
+                                   (((M) & 0x02) ?  9 : 1), \
+                                   (((M) & 0x04) ? 10 : 2), \
+                                   (((M) & 0x08) ? 11 : 3), \
+                                   (((M) & 0x10) ? 12 : 4), \
+                                   (((M) & 0x20) ? 13 : 5), \
+                                   (((M) & 0x40) ? 14 : 6), \
+                                   (((M) & 0x80) ? 15 : 7)); })
+
+/* SSE4 Dword Multiply Instructions.  */
+static __inline__  __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_mullo_epi32 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) ((__v4si)__V1 * (__v4si)__V2);
+}
+
+static __inline__  __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_mul_epi32 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_pmuldq128 ((__v4si)__V1, (__v4si)__V2);
+}
+
+/* SSE4 Floating Point Dot Product Instructions.  */
+#define _mm_dp_ps(X, Y, M) __extension__ ({ \
+  __m128 __X = (X); \
+  __m128 __Y = (Y); \
+  (__m128) __builtin_ia32_dpps((__v4sf)__X, (__v4sf)__Y, (M)); })
+
+#define _mm_dp_pd(X, Y, M) __extension__ ({\
+  __m128d __X = (X); \
+  __m128d __Y = (Y); \
+  (__m128d) __builtin_ia32_dppd((__v2df)__X, (__v2df)__Y, (M)); })
+
+/* SSE4 Streaming Load Hint Instruction.  */
+static __inline__  __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_stream_load_si128 (__m128i *__V)
+{
+  return (__m128i) __builtin_ia32_movntdqa ((__v2di *) __V);
+}
+
+/* SSE4 Packed Integer Min/Max Instructions.  */
+static __inline__  __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_min_epi8 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_pminsb128 ((__v16qi) __V1, (__v16qi) __V2);
+}
+
+static __inline__  __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_max_epi8 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_pmaxsb128 ((__v16qi) __V1, (__v16qi) __V2);
+}
+
+static __inline__  __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_min_epu16 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_pminuw128 ((__v8hi) __V1, (__v8hi) __V2);
+}
+
+static __inline__  __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_max_epu16 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_pmaxuw128 ((__v8hi) __V1, (__v8hi) __V2);
+}
+
+static __inline__  __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_min_epi32 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_pminsd128 ((__v4si) __V1, (__v4si) __V2);
+}
+
+static __inline__  __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_max_epi32 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_pmaxsd128 ((__v4si) __V1, (__v4si) __V2);
+}
+
+static __inline__  __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_min_epu32 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_pminud128((__v4si) __V1, (__v4si) __V2);
+}
+
+static __inline__  __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_max_epu32 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_pmaxud128((__v4si) __V1, (__v4si) __V2);
+}
+
+/* SSE4 Insertion and Extraction from XMM Register Instructions.  */
+#define _mm_insert_ps(X, Y, N) __builtin_ia32_insertps128((X), (Y), (N))
+#define _mm_extract_ps(X, N) (__extension__                      \
+                              ({ union { int __i; float __f; } __t;  \
+                                 __v4sf __a = (__v4sf)(X);       \
+                                 __t.__f = __a[(N) & 3];                 \
+                                 __t.__i;}))
+
+/* Miscellaneous insert and extract macros.  */
+/* Extract a single-precision float from X at index N into D.  */
+#define _MM_EXTRACT_FLOAT(D, X, N) (__extension__ ({ __v4sf __a = (__v4sf)(X); \
+                                                    (D) = __a[N]; }))
+                                                    
+/* Or together 2 sets of indexes (X and Y) with the zeroing bits (Z) to create
+   an index suitable for _mm_insert_ps.  */
+#define _MM_MK_INSERTPS_NDX(X, Y, Z) (((X) << 6) | ((Y) << 4) | (Z))
+                                           
+/* Extract a float from X at index N into the first index of the return.  */
+#define _MM_PICK_OUT_PS(X, N) _mm_insert_ps (_mm_setzero_ps(), (X),   \
+                                             _MM_MK_INSERTPS_NDX((N), 0, 0x0e))
+                                             
+/* Insert int into packed integer array at index.  */
+#define _mm_insert_epi8(X, I, N) (__extension__ ({ __v16qi __a = (__v16qi)(X); \
+                                                   __a[(N) & 15] = (I);             \
+                                                   __a;}))
+#define _mm_insert_epi32(X, I, N) (__extension__ ({ __v4si __a = (__v4si)(X); \
+                                                    __a[(N) & 3] = (I);           \
+                                                    __a;}))
+#ifdef __x86_64__
+#define _mm_insert_epi64(X, I, N) (__extension__ ({ __v2di __a = (__v2di)(X); \
+                                                    __a[(N) & 1] = (I);           \
+                                                    __a;}))
+#endif /* __x86_64__ */
+
+/* Extract int from packed integer array at index.  This returns the element
+ * as a zero extended value, so it is unsigned.
+ */
+#define _mm_extract_epi8(X, N) (__extension__ ({ __v16qi __a = (__v16qi)(X); \
+                                                 (int)(unsigned char) \
+                                                     __a[(N) & 15];}))
+#define _mm_extract_epi32(X, N) (__extension__ ({ __v4si __a = (__v4si)(X); \
+                                                  __a[(N) & 3];}))
+#ifdef __x86_64__
+#define _mm_extract_epi64(X, N) (__extension__ ({ __v2di __a = (__v2di)(X); \
+                                                  __a[(N) & 1];}))
+#endif /* __x86_64 */
+
+/* SSE4 128-bit Packed Integer Comparisons.  */
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_testz_si128(__m128i __M, __m128i __V)
+{
+  return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_testc_si128(__m128i __M, __m128i __V)
+{
+  return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_testnzc_si128(__m128i __M, __m128i __V)
+{
+  return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V);
+}
+
+#define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_cmpeq_epi32((V), (V)))
+#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V))
+#define _mm_test_all_zeros(M, V) _mm_testz_si128 ((M), (V))
+
+/* SSE4 64-bit Packed Integer Comparisons.  */
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cmpeq_epi64(__m128i __V1, __m128i __V2)
+{
+  return (__m128i)((__v2di)__V1 == (__v2di)__V2);
+}
+
+/* SSE4 Packed Integer Sign-Extension.  */
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cvtepi8_epi16(__m128i __V)
+{
+  return (__m128i) __builtin_ia32_pmovsxbw128((__v16qi) __V);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cvtepi8_epi32(__m128i __V)
+{
+  return (__m128i) __builtin_ia32_pmovsxbd128((__v16qi) __V);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cvtepi8_epi64(__m128i __V)
+{
+  return (__m128i) __builtin_ia32_pmovsxbq128((__v16qi) __V);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cvtepi16_epi32(__m128i __V)
+{
+  return (__m128i) __builtin_ia32_pmovsxwd128((__v8hi) __V); 
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cvtepi16_epi64(__m128i __V)
+{
+  return (__m128i) __builtin_ia32_pmovsxwq128((__v8hi)__V);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cvtepi32_epi64(__m128i __V)
+{
+  return (__m128i) __builtin_ia32_pmovsxdq128((__v4si)__V);
+}
+
+/* SSE4 Packed Integer Zero-Extension.  */
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cvtepu8_epi16(__m128i __V)
+{
+  return (__m128i) __builtin_ia32_pmovzxbw128((__v16qi) __V);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cvtepu8_epi32(__m128i __V)
+{
+  return (__m128i) __builtin_ia32_pmovzxbd128((__v16qi)__V);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cvtepu8_epi64(__m128i __V)
+{
+  return (__m128i) __builtin_ia32_pmovzxbq128((__v16qi)__V);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cvtepu16_epi32(__m128i __V)
+{
+  return (__m128i) __builtin_ia32_pmovzxwd128((__v8hi)__V);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cvtepu16_epi64(__m128i __V)
+{
+  return (__m128i) __builtin_ia32_pmovzxwq128((__v8hi)__V);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cvtepu32_epi64(__m128i __V)
+{
+  return (__m128i) __builtin_ia32_pmovzxdq128((__v4si)__V);
+}
+
+/* SSE4 Pack with Unsigned Saturation.  */
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_packus_epi32(__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_packusdw128((__v4si)__V1, (__v4si)__V2);
+}
+
+/* SSE4 Multiple Packed Sums of Absolute Difference.  */
+#define _mm_mpsadbw_epu8(X, Y, M) __extension__ ({ \
+  __m128i __X = (X); \
+  __m128i __Y = (Y); \
+  (__m128i) __builtin_ia32_mpsadbw128((__v16qi)__X, (__v16qi)__Y, (M)); })
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_minpos_epu16(__m128i __V)
+{
+  return (__m128i) __builtin_ia32_phminposuw128((__v8hi)__V);
+}
+
+/* These definitions are normally in nmmintrin.h, but gcc puts them in here
+   so we'll do the same.  */
+#ifdef __SSE4_2__
+
+/* These specify the type of data that we're comparing.  */
+#define _SIDD_UBYTE_OPS                 0x00
+#define _SIDD_UWORD_OPS                 0x01
+#define _SIDD_SBYTE_OPS                 0x02
+#define _SIDD_SWORD_OPS                 0x03
+
+/* These specify the type of comparison operation.  */
+#define _SIDD_CMP_EQUAL_ANY             0x00
+#define _SIDD_CMP_RANGES                0x04
+#define _SIDD_CMP_EQUAL_EACH            0x08
+#define _SIDD_CMP_EQUAL_ORDERED         0x0c
+
+/* These macros specify the polarity of the operation.  */
+#define _SIDD_POSITIVE_POLARITY         0x00
+#define _SIDD_NEGATIVE_POLARITY         0x10
+#define _SIDD_MASKED_POSITIVE_POLARITY  0x20
+#define _SIDD_MASKED_NEGATIVE_POLARITY  0x30
+
+/* These macros are used in _mm_cmpXstri() to specify the return.  */
+#define _SIDD_LEAST_SIGNIFICANT         0x00
+#define _SIDD_MOST_SIGNIFICANT          0x40
+
+/* These macros are used in _mm_cmpXstrm() to specify the return.  */
+#define _SIDD_BIT_MASK                  0x00
+#define _SIDD_UNIT_MASK                 0x40
+
+/* SSE4.2 Packed Comparison Intrinsics.  */
+#define _mm_cmpistrm(A, B, M) __builtin_ia32_pcmpistrm128((A), (B), (M))
+#define _mm_cmpistri(A, B, M) __builtin_ia32_pcmpistri128((A), (B), (M))
+
+#define _mm_cmpestrm(A, LA, B, LB, M) \
+     __builtin_ia32_pcmpestrm128((A), (LA), (B), (LB), (M))
+#define _mm_cmpestri(A, LA, B, LB, M) \
+     __builtin_ia32_pcmpestri128((A), (LA), (B), (LB), (M))
+     
+/* SSE4.2 Packed Comparison Intrinsics and EFlag Reading.  */
+#define _mm_cmpistra(A, B, M) \
+     __builtin_ia32_pcmpistria128((A), (B), (M))
+#define _mm_cmpistrc(A, B, M) \
+     __builtin_ia32_pcmpistric128((A), (B), (M))
+#define _mm_cmpistro(A, B, M) \
+     __builtin_ia32_pcmpistrio128((A), (B), (M))
+#define _mm_cmpistrs(A, B, M) \
+     __builtin_ia32_pcmpistris128((A), (B), (M))
+#define _mm_cmpistrz(A, B, M) \
+     __builtin_ia32_pcmpistriz128((A), (B), (M))
+
+#define _mm_cmpestra(A, LA, B, LB, M) \
+     __builtin_ia32_pcmpestria128((A), (LA), (B), (LB), (M))
+#define _mm_cmpestrc(A, LA, B, LB, M) \
+     __builtin_ia32_pcmpestric128((A), (LA), (B), (LB), (M))
+#define _mm_cmpestro(A, LA, B, LB, M) \
+     __builtin_ia32_pcmpestrio128((A), (LA), (B), (LB), (M))
+#define _mm_cmpestrs(A, LA, B, LB, M) \
+     __builtin_ia32_pcmpestris128((A), (LA), (B), (LB), (M))
+#define _mm_cmpestrz(A, LA, B, LB, M) \
+     __builtin_ia32_pcmpestriz128((A), (LA), (B), (LB), (M))
+
+/* SSE4.2 Compare Packed Data -- Greater Than.  */
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cmpgt_epi64(__m128i __V1, __m128i __V2)
+{
+  return (__m128i)((__v2di)__V1 > (__v2di)__V2);
+}
+
+/* SSE4.2 Accumulate CRC32.  */
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+_mm_crc32_u8(unsigned int __C, unsigned char __D)
+{
+  return __builtin_ia32_crc32qi(__C, __D);
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+_mm_crc32_u16(unsigned int __C, unsigned short __D)
+{
+  return __builtin_ia32_crc32hi(__C, __D);
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+_mm_crc32_u32(unsigned int __C, unsigned int __D)
+{
+  return __builtin_ia32_crc32si(__C, __D);
+}
+
+#ifdef __x86_64__
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+_mm_crc32_u64(unsigned long long __C, unsigned long long __D)
+{
+  return __builtin_ia32_crc32di(__C, __D);
+}
+#endif /* __x86_64__ */
+
+#ifdef __POPCNT__
+#include <popcntintrin.h>
+#endif
+
+#endif /* __SSE4_2__ */
+#endif /* __SSE4_1__ */
+
+#endif /* _SMMINTRIN_H */
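
A minimal sketch of how the _SIDD_* control bits above combine in practice, assuming a caller compiled with SSE4.2 enabled; the function name first_match_index is hypothetical and not part of the header:

#include <smmintrin.h>

/* Return the index of the first byte of `text` that also occurs in `set`,
 * or 16 when nothing matches.  The control word selects unsigned byte
 * elements, "equal any" comparison, positive polarity, and the index of
 * the least-significant match. */
static int first_match_index(__m128i set, __m128i text)
{
  return _mm_cmpistri(set, text,
                      _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY |
                      _SIDD_POSITIVE_POLARITY | _SIDD_LEAST_SIGNIFICANT);
}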
diff --git a/21.1.2/clang-include/stdalign.h b/21.1.2/clang-include/stdalign.h
new file mode 100644
index 0000000..3738d12
--- /dev/null
+++ b/21.1.2/clang-include/stdalign.h
@@ -0,0 +1,35 @@
+/*===---- stdalign.h - Standard header for alignment ------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __STDALIGN_H
+#define __STDALIGN_H
+
+#ifndef __cplusplus
+#define alignas _Alignas
+#define alignof _Alignof
+#endif
+
+#define __alignas_is_defined 1
+#define __alignof_is_defined 1
+
+#endif /* __STDALIGN_H */
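
A small usage sketch of the two macros above (illustrative only; the struct name vec4 is invented):

#include <stdalign.h>

/* alignas/alignof expand to the C11 keywords _Alignas/_Alignof. */
struct vec4 { alignas(16) float v[4]; };
_Static_assert(alignof(struct vec4) >= 16,
               "member alignment propagates to the containing struct");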
diff --git a/21.1.2/clang-include/stdarg.h b/21.1.2/clang-include/stdarg.h
new file mode 100644
index 0000000..a57e183
--- /dev/null
+++ b/21.1.2/clang-include/stdarg.h
@@ -0,0 +1,52 @@
+/*===---- stdarg.h - Variable argument handling ----------------------------===
+ *
+ * Copyright (c) 2008 Eli Friedman
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __STDARG_H
+#define __STDARG_H
+
+#ifndef _VA_LIST
+typedef __builtin_va_list va_list;
+#define _VA_LIST
+#endif
+#define va_start(ap, param) __builtin_va_start(ap, param)
+#define va_end(ap)          __builtin_va_end(ap)
+#define va_arg(ap, type)    __builtin_va_arg(ap, type)
+
+/* GCC always defines __va_copy, but does not define va_copy unless in c99 mode
+ * or -ansi is not specified, since it was not part of C90.
+ */
+#define __va_copy(d,s) __builtin_va_copy(d,s)
+
+#if __STDC_VERSION__ >= 199901L || __cplusplus >= 201103L || !defined(__STRICT_ANSI__)
+#define va_copy(dest, src)  __builtin_va_copy(dest, src)
+#endif
+
+/* Hack required to make standard headers work, at least on Ubuntu */
+#ifndef __GNUC_VA_LIST
+#define __GNUC_VA_LIST 1
+#endif
+typedef __builtin_va_list __gnuc_va_list;
+
+#endif /* __STDARG_H */
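
For reference, a small hypothetical variadic function showing why va_copy exists: a va_list can only be traversed once unless an independent copy is made first.

#include <stdarg.h>

/* Sum the `count` int arguments twice; the second pass needs its own
 * va_list, obtained with va_copy, because `ap` is consumed by the first. */
static int sum_twice(int count, ...)
{
  va_list ap, ap2;
  int total = 0;
  va_start(ap, count);
  va_copy(ap2, ap);
  for (int i = 0; i < count; ++i) total += va_arg(ap, int);
  for (int i = 0; i < count; ++i) total += va_arg(ap2, int);
  va_end(ap2);
  va_end(ap);
  return total;
}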
diff --git a/21.1.2/clang-include/stdbool.h b/21.1.2/clang-include/stdbool.h
new file mode 100644
index 0000000..0467893
--- /dev/null
+++ b/21.1.2/clang-include/stdbool.h
@@ -0,0 +1,44 @@
+/*===---- stdbool.h - Standard header for booleans -------------------------===
+ *
+ * Copyright (c) 2008 Eli Friedman
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __STDBOOL_H
+#define __STDBOOL_H
+
+/* Don't define bool, true, and false in C++, except as a GNU extension. */
+#ifndef __cplusplus
+#define bool _Bool
+#define true 1
+#define false 0
+#elif defined(__GNUC__) && !defined(__STRICT_ANSI__)
+/* Define _Bool, bool, false, true as a GNU extension. */
+#define _Bool bool
+#define bool  bool
+#define false false
+#define true  true
+#endif
+
+#define __bool_true_false_are_defined 1
+
+#endif /* __STDBOOL_H */
diff --git a/21.1.2/clang-include/stddef.h b/21.1.2/clang-include/stddef.h
new file mode 100644
index 0000000..2dfe0a2
--- /dev/null
+++ b/21.1.2/clang-include/stddef.h
@@ -0,0 +1,146 @@
+/*===---- stddef.h - Basic type definitions --------------------------------===
+ *
+ * Copyright (c) 2008 Eli Friedman
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined(__STDDEF_H) || defined(__need_ptrdiff_t) ||                       \
+    defined(__need_size_t) || defined(__need_wchar_t) ||                       \
+    defined(__need_NULL) || defined(__need_wint_t)
+
+#if !defined(__need_ptrdiff_t) && !defined(__need_size_t) &&                   \
+    !defined(__need_wchar_t) && !defined(__need_NULL) &&                       \
+    !defined(__need_wint_t)
+#define __STDDEF_H
+#define __need_ptrdiff_t
+#define __need_size_t
+#define __need_wchar_t
+#define __need_NULL
+/* __need_wint_t is intentionally not defined here. */
+#endif
+
+#if defined(__need_ptrdiff_t)
+#if !defined(_PTRDIFF_T) || __has_feature(modules)
+/* Always define ptrdiff_t when modules are available. */
+#if !__has_feature(modules)
+#define _PTRDIFF_T
+#endif
+typedef __PTRDIFF_TYPE__ ptrdiff_t;
+#endif
+#undef __need_ptrdiff_t
+#endif /* defined(__need_ptrdiff_t) */
+
+#if defined(__need_size_t)
+#if !defined(_SIZE_T) || __has_feature(modules)
+/* Always define size_t when modules are available. */
+#if !__has_feature(modules)
+#define _SIZE_T
+#endif
+typedef __SIZE_TYPE__ size_t;
+#endif
+#undef __need_size_t
+#endif /*defined(__need_size_t) */
+
+#if defined(__STDDEF_H)
+/* ISO9899:2011 7.20 (C11 Annex K): Define rsize_t if __STDC_WANT_LIB_EXT1__ is
+ * enabled. */
+#if (defined(__STDC_WANT_LIB_EXT1__) && __STDC_WANT_LIB_EXT1__ >= 1 && \
+     !defined(_RSIZE_T)) || __has_feature(modules)
+/* Always define rsize_t when modules are available. */
+#if !__has_feature(modules)
+#define _RSIZE_T
+#endif
+typedef __SIZE_TYPE__ rsize_t;
+#endif
+#endif /* defined(__STDDEF_H) */
+
+#if defined(__need_wchar_t)
+#ifndef __cplusplus
+/* Always define wchar_t when modules are available. */
+#if !defined(_WCHAR_T) || __has_feature(modules)
+#if !__has_feature(modules)
+#define _WCHAR_T
+#if defined(_MSC_EXTENSIONS)
+#define _WCHAR_T_DEFINED
+#endif
+#endif
+typedef __WCHAR_TYPE__ wchar_t;
+#endif
+#endif
+#undef __need_wchar_t
+#endif /* defined(__need_wchar_t) */
+
+#if defined(__need_NULL)
+#undef NULL
+#ifdef __cplusplus
+#  if !defined(__MINGW32__) && !defined(_MSC_VER)
+#    define NULL __null
+#  else
+#    define NULL 0
+#  endif
+#else
+#  define NULL ((void*)0)
+#endif
+#ifdef __cplusplus
+#if defined(_MSC_EXTENSIONS) && defined(_NATIVE_NULLPTR_SUPPORTED)
+namespace std { typedef decltype(nullptr) nullptr_t; }
+using ::std::nullptr_t;
+#endif
+#endif
+#undef __need_NULL
+#endif /* defined(__need_NULL) */
+
+#if defined(__STDDEF_H)
+
+#if __STDC_VERSION__ >= 201112L || __cplusplus >= 201103L
+#if !defined(__CLANG_MAX_ALIGN_T_DEFINED) || __has_feature(modules)
+#ifndef _MSC_VER
+typedef struct {
+  long long __clang_max_align_nonce1
+      __attribute__((__aligned__(__alignof__(long long))));
+  long double __clang_max_align_nonce2
+      __attribute__((__aligned__(__alignof__(long double))));
+} max_align_t;
+#else
+typedef double max_align_t;
+#endif
+#define __CLANG_MAX_ALIGN_T_DEFINED
+#endif
+#endif
+
+#define offsetof(t, d) __builtin_offsetof(t, d)
+#endif  /* __STDDEF_H */
+
+/* Some C libraries expect to see a wint_t here. Others (notably MinGW) will use
+__WINT_TYPE__ directly; accommodate both by requiring __need_wint_t */
+#if defined(__need_wint_t)
+/* Always define wint_t when modules are available. */
+#if !defined(_WINT_T) || __has_feature(modules)
+#if !__has_feature(modules)
+#define _WINT_T
+#endif
+typedef __WINT_TYPE__ wint_t;
+#endif
+#undef __need_wint_t
+#endif /* __need_wint_t */
+
+#endif
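
The __need_* machinery above lets a system header pull in a single definition without the rest of stddef.h; a sketch of the intended usage (my_strlen is hypothetical):

/* Ask stddef.h for size_t only; ptrdiff_t, wchar_t and NULL are left alone. */
#define __need_size_t
#include <stddef.h>

static size_t my_strlen(const char *s)
{
  size_t n = 0;
  while (s[n] != '\0')
    ++n;
  return n;
}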
diff --git a/21.1.2/clang-include/stdint.h b/21.1.2/clang-include/stdint.h
new file mode 100644
index 0000000..2b1bc09
--- /dev/null
+++ b/21.1.2/clang-include/stdint.h
@@ -0,0 +1,707 @@
+/*===---- stdint.h - Standard header for sized integer types --------------===*\
+ *
+ * Copyright (c) 2009 Chris Lattner
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+\*===----------------------------------------------------------------------===*/
+
+#ifndef __CLANG_STDINT_H
+#define __CLANG_STDINT_H
+
+/* If we're hosted, fall back to the system's stdint.h, which might have
+ * additional definitions.
+ */
+#if __STDC_HOSTED__ && __has_include_next(<stdint.h>)
+
+// C99 7.18.3 Limits of other integer types
+//
+//  Footnote 219, 220: C++ implementations should define these macros only when
+//  __STDC_LIMIT_MACROS is defined before <stdint.h> is included.
+//
+//  Footnote 222: C++ implementations should define these macros only when
+//  __STDC_CONSTANT_MACROS is defined before <stdint.h> is included.
+//
+// C++11 [cstdint.syn]p2:
+//
+//  The macros defined by <cstdint> are provided unconditionally. In particular,
+//  the symbols __STDC_LIMIT_MACROS and __STDC_CONSTANT_MACROS (mentioned in
+//  footnotes 219, 220, and 222 in the C standard) play no role in C++.
+//
+// C11 removed the problematic footnotes.
+//
+// Work around this inconsistency by always defining those macros in C++ mode,
+// so that a C library implementation which follows the C99 standard can be
+// used in C++.
+# ifdef __cplusplus
+#  if !defined(__STDC_LIMIT_MACROS)
+#   define __STDC_LIMIT_MACROS
+#   define __STDC_LIMIT_MACROS_DEFINED_BY_CLANG
+#  endif
+#  if !defined(__STDC_CONSTANT_MACROS)
+#   define __STDC_CONSTANT_MACROS
+#   define __STDC_CONSTANT_MACROS_DEFINED_BY_CLANG
+#  endif
+# endif
+
+# include_next <stdint.h>
+
+# ifdef __STDC_LIMIT_MACROS_DEFINED_BY_CLANG
+#  undef __STDC_LIMIT_MACROS
+#  undef __STDC_LIMIT_MACROS_DEFINED_BY_CLANG
+# endif
+# ifdef __STDC_CONSTANT_MACROS_DEFINED_BY_CLANG
+#  undef __STDC_CONSTANT_MACROS
+#  undef __STDC_CONSTANT_MACROS_DEFINED_BY_CLANG
+# endif
+
+#else
+
+/* C99 7.18.1.1 Exact-width integer types.
+ * C99 7.18.1.2 Minimum-width integer types.
+ * C99 7.18.1.3 Fastest minimum-width integer types.
+ *
+ * The standard requires that exact-width types be defined for 8-, 16-, 32-, and
+ * 64-bit types if they are implemented. Other exact-width types are optional.
+ * This implementation defines an exact-width type for every integer width
+ * that is represented in the standard integer types.
+ *
+ * The standard also requires minimum-width types be defined for 8-, 16-, 32-,
+ * and 64-bit widths regardless of whether there are corresponding exact-width
+ * types. 
+ *
+ * To accommodate targets that are missing types that are exactly 8, 16, 32, or
+ * 64 bits wide, this implementation takes an approach of cascading
+ * redefinitions, redefining __int_leastN_t to successively smaller exact-width
+ * types. It is therefore important that the types are defined in order of
+ * descending widths.
+ *
+ * We currently assume that the minimum-width types and the fastest
+ * minimum-width types are the same. This is allowed by the standard, but is
+ * suboptimal.
+ *
+ * In violation of the standard, some targets do not implement a type that is
+ * wide enough to represent all of the required widths (8-, 16-, 32-, 64-bit).  
+ * To accommodate these targets, a required minimum-width type is only
+ * defined if there exists an exact-width type of equal or greater width.
+ */
+
+#ifdef __INT64_TYPE__
+# ifndef __int8_t_defined /* glibc sys/types.h also defines int64_t*/
+typedef signed __INT64_TYPE__ int64_t;
+# endif /* __int8_t_defined */
+typedef unsigned __INT64_TYPE__ uint64_t;
+# define __int_least64_t int64_t
+# define __uint_least64_t uint64_t
+# define __int_least32_t int64_t
+# define __uint_least32_t uint64_t
+# define __int_least16_t int64_t
+# define __uint_least16_t uint64_t
+# define __int_least8_t int64_t
+# define __uint_least8_t uint64_t
+#endif /* __INT64_TYPE__ */
+
+#ifdef __int_least64_t
+typedef __int_least64_t int_least64_t;
+typedef __uint_least64_t uint_least64_t;
+typedef __int_least64_t int_fast64_t;
+typedef __uint_least64_t uint_fast64_t;
+#endif /* __int_least64_t */
+
+#ifdef __INT56_TYPE__
+typedef signed __INT56_TYPE__ int56_t;
+typedef unsigned __INT56_TYPE__ uint56_t;
+typedef int56_t int_least56_t;
+typedef uint56_t uint_least56_t;
+typedef int56_t int_fast56_t;
+typedef uint56_t uint_fast56_t;
+# define __int_least32_t int56_t
+# define __uint_least32_t uint56_t
+# define __int_least16_t int56_t
+# define __uint_least16_t uint56_t
+# define __int_least8_t int56_t
+# define __uint_least8_t uint56_t
+#endif /* __INT56_TYPE__ */
+
+
+#ifdef __INT48_TYPE__
+typedef signed __INT48_TYPE__ int48_t;
+typedef unsigned __INT48_TYPE__ uint48_t;
+typedef int48_t int_least48_t;
+typedef uint48_t uint_least48_t;
+typedef int48_t int_fast48_t;
+typedef uint48_t uint_fast48_t;
+# define __int_least32_t int48_t
+# define __uint_least32_t uint48_t
+# define __int_least16_t int48_t
+# define __uint_least16_t uint48_t
+# define __int_least8_t int48_t
+# define __uint_least8_t uint48_t
+#endif /* __INT48_TYPE__ */
+
+
+#ifdef __INT40_TYPE__
+typedef signed __INT40_TYPE__ int40_t;
+typedef unsigned __INT40_TYPE__ uint40_t;
+typedef int40_t int_least40_t;
+typedef uint40_t uint_least40_t;
+typedef int40_t int_fast40_t;
+typedef uint40_t uint_fast40_t;
+# define __int_least32_t int40_t
+# define __uint_least32_t uint40_t
+# define __int_least16_t int40_t
+# define __uint_least16_t uint40_t
+# define __int_least8_t int40_t
+# define __uint_least8_t uint40_t
+#endif /* __INT40_TYPE__ */
+
+
+#ifdef __INT32_TYPE__
+
+# ifndef __int8_t_defined /* glibc sys/types.h also defines int32_t*/
+typedef signed __INT32_TYPE__ int32_t;
+# endif /* __int8_t_defined */
+
+# ifndef __uint32_t_defined  /* more glibc compatibility */
+# define __uint32_t_defined
+typedef unsigned __INT32_TYPE__ uint32_t;
+# endif /* __uint32_t_defined */
+
+# define __int_least32_t int32_t
+# define __uint_least32_t uint32_t
+# define __int_least16_t int32_t
+# define __uint_least16_t uint32_t
+# define __int_least8_t int32_t
+# define __uint_least8_t uint32_t
+#endif /* __INT32_TYPE__ */
+
+#ifdef __int_least32_t
+typedef __int_least32_t int_least32_t;
+typedef __uint_least32_t uint_least32_t;
+typedef __int_least32_t int_fast32_t;
+typedef __uint_least32_t uint_fast32_t;
+#endif /* __int_least32_t */
+
+#ifdef __INT24_TYPE__
+typedef signed __INT24_TYPE__ int24_t;
+typedef unsigned __INT24_TYPE__ uint24_t;
+typedef int24_t int_least24_t;
+typedef uint24_t uint_least24_t;
+typedef int24_t int_fast24_t;
+typedef uint24_t uint_fast24_t;
+# define __int_least16_t int24_t
+# define __uint_least16_t uint24_t
+# define __int_least8_t int24_t
+# define __uint_least8_t uint24_t
+#endif /* __INT24_TYPE__ */
+
+#ifdef __INT16_TYPE__
+#ifndef __int8_t_defined /* glibc sys/types.h also defines int16_t*/
+typedef signed __INT16_TYPE__ int16_t;
+#endif /* __int8_t_defined */
+typedef unsigned __INT16_TYPE__ uint16_t;
+# define __int_least16_t int16_t
+# define __uint_least16_t uint16_t
+# define __int_least8_t int16_t
+# define __uint_least8_t uint16_t
+#endif /* __INT16_TYPE__ */
+
+#ifdef __int_least16_t
+typedef __int_least16_t int_least16_t;
+typedef __uint_least16_t uint_least16_t;
+typedef __int_least16_t int_fast16_t;
+typedef __uint_least16_t uint_fast16_t;
+#endif /* __int_least16_t */
+
+
+#ifdef __INT8_TYPE__
+#ifndef __int8_t_defined  /* glibc sys/types.h also defines int8_t*/
+typedef signed __INT8_TYPE__ int8_t;
+#endif /* __int8_t_defined */
+typedef unsigned __INT8_TYPE__ uint8_t;
+# define __int_least8_t int8_t
+# define __uint_least8_t uint8_t
+#endif /* __INT8_TYPE__ */
+
+#ifdef __int_least8_t
+typedef __int_least8_t int_least8_t;
+typedef __uint_least8_t uint_least8_t;
+typedef __int_least8_t int_fast8_t;
+typedef __uint_least8_t uint_fast8_t;
+#endif /* __int_least8_t */
+
+/* prevent glibc sys/types.h from defining conflicting types */
+#ifndef __int8_t_defined  
+# define __int8_t_defined
+#endif /* __int8_t_defined */
+
+/* C99 7.18.1.4 Integer types capable of holding object pointers.
+ */
+#define __stdint_join3(a,b,c) a ## b ## c
+
+#define  __intn_t(n) __stdint_join3( int, n, _t)
+#define __uintn_t(n) __stdint_join3(uint, n, _t)
+
+#ifndef _INTPTR_T
+#ifndef __intptr_t_defined
+typedef  __intn_t(__INTPTR_WIDTH__)  intptr_t;
+#define __intptr_t_defined
+#define _INTPTR_T
+#endif
+#endif
+
+#ifndef _UINTPTR_T
+typedef __uintn_t(__INTPTR_WIDTH__) uintptr_t;
+#define _UINTPTR_T
+#endif
+
+/* C99 7.18.1.5 Greatest-width integer types.
+ */
+typedef __INTMAX_TYPE__  intmax_t;
+typedef __UINTMAX_TYPE__ uintmax_t;
+
+/* C99 7.18.4 Macros for minimum-width integer constants.
+ *
+ * The standard requires that integer constant macros be defined for all the
+ * minimum-width types defined above. As 8-, 16-, 32-, and 64-bit minimum-width
+ * types are required, the corresponding integer constant macros are defined 
+ * here. This implementation also defines minimum-width types for every other
+ * integer width that the target implements, so corresponding macros are 
+ * defined below, too.
+ *
+ * These macros are defined using the same successive-shrinking approach as
+ * the type definitions above. It is likewise important that macros are defined
+ * in order of descending width.
+ *
+ * Note that C++ should not check __STDC_CONSTANT_MACROS here, contrary to the
+ * claims of the C standard (see C++ 18.3.1p2, [cstdint.syn]).
+ */
+
+#define __int_c_join(a, b) a ## b
+#define __int_c(v, suffix) __int_c_join(v, suffix)
+#define __uint_c(v, suffix) __int_c_join(v##U, suffix)
+
+
+#ifdef __INT64_TYPE__
+# ifdef __INT64_C_SUFFIX__
+#  define __int64_c_suffix __INT64_C_SUFFIX__
+#  define __int32_c_suffix __INT64_C_SUFFIX__
+#  define __int16_c_suffix __INT64_C_SUFFIX__
+#  define  __int8_c_suffix __INT64_C_SUFFIX__
+# else
+#  undef __int64_c_suffix
+#  undef __int32_c_suffix
+#  undef __int16_c_suffix
+#  undef  __int8_c_suffix
+# endif /* __INT64_C_SUFFIX__ */
+#endif /* __INT64_TYPE__ */
+
+#ifdef __int_least64_t
+# ifdef __int64_c_suffix
+#  define INT64_C(v) __int_c(v, __int64_c_suffix)
+#  define UINT64_C(v) __uint_c(v, __int64_c_suffix)
+# else
+#  define INT64_C(v) v
+#  define UINT64_C(v) v ## U
+# endif /* __int64_c_suffix */
+#endif /* __int_least64_t */
+
+
+#ifdef __INT56_TYPE__
+# ifdef __INT56_C_SUFFIX__
+#  define INT56_C(v) __int_c(v, __INT56_C_SUFFIX__)
+#  define UINT56_C(v) __uint_c(v, __INT56_C_SUFFIX__)
+#  define __int32_c_suffix __INT56_C_SUFFIX__
+#  define __int16_c_suffix __INT56_C_SUFFIX__
+#  define __int8_c_suffix  __INT56_C_SUFFIX__
+# else
+#  define INT56_C(v) v
+#  define UINT56_C(v) v ## U
+#  undef __int32_c_suffix
+#  undef __int16_c_suffix
+#  undef  __int8_c_suffix
+# endif /* __INT56_C_SUFFIX__ */
+#endif /* __INT56_TYPE__ */
+
+
+#ifdef __INT48_TYPE__
+# ifdef __INT48_C_SUFFIX__
+#  define INT48_C(v) __int_c(v, __INT48_C_SUFFIX__)
+#  define UINT48_C(v) __uint_c(v, __INT48_C_SUFFIX__)
+#  define __int32_c_suffix __INT48_C_SUFFIX__
+#  define __int16_c_suffix __INT48_C_SUFFIX__
+#  define __int8_c_suffix  __INT48_C_SUFFIX__
+# else
+#  define INT48_C(v) v
+#  define UINT48_C(v) v ## U
+#  undef __int32_c_suffix
+#  undef __int16_c_suffix
+#  undef  __int8_c_suffix
+# endif /* __INT48_C_SUFFIX__ */
+#endif /* __INT48_TYPE__ */
+
+
+#ifdef __INT40_TYPE__
+# ifdef __INT40_C_SUFFIX__
+#  define INT40_C(v) __int_c(v, __INT40_C_SUFFIX__)
+#  define UINT40_C(v) __uint_c(v, __INT40_C_SUFFIX__)
+#  define __int32_c_suffix __INT40_C_SUFFIX__
+#  define __int16_c_suffix __INT40_C_SUFFIX__
+#  define __int8_c_suffix  __INT40_C_SUFFIX__
+# else
+#  define INT40_C(v) v
+#  define UINT40_C(v) v ## U
+#  undef __int32_c_suffix
+#  undef __int16_c_suffix
+#  undef  __int8_c_suffix
+# endif /* __INT40_C_SUFFIX__ */
+#endif /* __INT40_TYPE__ */
+
+
+#ifdef __INT32_TYPE__
+# ifdef __INT32_C_SUFFIX__
+#  define __int32_c_suffix __INT32_C_SUFFIX__
+#  define __int16_c_suffix __INT32_C_SUFFIX__
+#  define __int8_c_suffix  __INT32_C_SUFFIX__
+#else
+#  undef __int32_c_suffix
+#  undef __int16_c_suffix
+#  undef  __int8_c_suffix
+# endif /* __INT32_C_SUFFIX__ */
+#endif /* __INT32_TYPE__ */
+
+#ifdef __int_least32_t
+# ifdef __int32_c_suffix
+#  define INT32_C(v) __int_c(v, __int32_c_suffix)
+#  define UINT32_C(v) __uint_c(v, __int32_c_suffix)
+# else
+#  define INT32_C(v) v
+#  define UINT32_C(v) v ## U
+# endif /* __int32_c_suffix */
+#endif /* __int_least32_t */
+
+
+#ifdef __INT24_TYPE__
+# ifdef __INT24_C_SUFFIX__
+#  define INT24_C(v) __int_c(v, __INT24_C_SUFFIX__)
+#  define UINT24_C(v) __uint_c(v, __INT24_C_SUFFIX__)
+#  define __int16_c_suffix __INT24_C_SUFFIX__
+#  define __int8_c_suffix  __INT24_C_SUFFIX__
+# else
+#  define INT24_C(v) v
+#  define UINT24_C(v) v ## U
+#  undef __int16_c_suffix
+#  undef  __int8_c_suffix
+# endif /* __INT24_C_SUFFIX__ */
+#endif /* __INT24_TYPE__ */
+
+
+#ifdef __INT16_TYPE__
+# ifdef __INT16_C_SUFFIX__
+#  define __int16_c_suffix __INT16_C_SUFFIX__
+#  define __int8_c_suffix  __INT16_C_SUFFIX__
+#else
+#  undef __int16_c_suffix
+#  undef  __int8_c_suffix
+# endif /* __INT16_C_SUFFIX__ */
+#endif /* __INT16_TYPE__ */
+
+#ifdef __int_least16_t
+# ifdef __int16_c_suffix
+#  define INT16_C(v) __int_c(v, __int16_c_suffix)
+#  define UINT16_C(v) __uint_c(v, __int16_c_suffix)
+# else
+#  define INT16_C(v) v
+#  define UINT16_C(v) v ## U
+# endif /* __int16_c_suffix */
+#endif /* __int_least16_t */
+
+
+#ifdef __INT8_TYPE__
+# ifdef __INT8_C_SUFFIX__
+#  define __int8_c_suffix __INT8_C_SUFFIX__
+#else
+#  undef  __int8_c_suffix
+# endif /* __INT8_C_SUFFIX__ */
+#endif /* __INT8_TYPE__ */
+
+#ifdef __int_least8_t
+# ifdef __int8_c_suffix
+#  define INT8_C(v) __int_c(v, __int8_c_suffix)
+#  define UINT8_C(v) __uint_c(v, __int8_c_suffix)
+# else
+#  define INT8_C(v) v
+#  define UINT8_C(v) v ## U
+# endif /* __int8_c_suffix */
+#endif /* __int_least8_t */
+
+
+/* C99 7.18.2.1 Limits of exact-width integer types. 
+ * C99 7.18.2.2 Limits of minimum-width integer types.
+ * C99 7.18.2.3 Limits of fastest minimum-width integer types.
+ *
+ * The presence of limit macros is completely optional in C99.  This
+ * implementation defines limits for all of the types (exact- and
+ * minimum-width) that it defines above, using the limits of the minimum-width
+ * type for any types that do not have exact-width representations.
+ *
+ * As in the type definitions, this section takes an approach of
+ * successive-shrinking to determine which limits to use for the standard (8,
+ * 16, 32, 64) bit widths when they don't have exact representations. It is
+ * therefore important that the definitions be kept in order of descending
+ * widths.
+ *
+ * Note that C++ should not check __STDC_LIMIT_MACROS here, contrary to the
+ * claims of the C standard (see C++ 18.3.1p2, [cstdint.syn]).
+ */
+
+#ifdef __INT64_TYPE__
+# define INT64_MAX           INT64_C( 9223372036854775807)
+# define INT64_MIN         (-INT64_C( 9223372036854775807)-1)
+# define UINT64_MAX         UINT64_C(18446744073709551615)
+# define __INT_LEAST64_MIN   INT64_MIN
+# define __INT_LEAST64_MAX   INT64_MAX
+# define __UINT_LEAST64_MAX UINT64_MAX
+# define __INT_LEAST32_MIN   INT64_MIN
+# define __INT_LEAST32_MAX   INT64_MAX
+# define __UINT_LEAST32_MAX UINT64_MAX
+# define __INT_LEAST16_MIN   INT64_MIN
+# define __INT_LEAST16_MAX   INT64_MAX
+# define __UINT_LEAST16_MAX UINT64_MAX
+# define __INT_LEAST8_MIN    INT64_MIN
+# define __INT_LEAST8_MAX    INT64_MAX
+# define __UINT_LEAST8_MAX  UINT64_MAX
+#endif /* __INT64_TYPE__ */
+
+#ifdef __INT_LEAST64_MIN
+# define INT_LEAST64_MIN   __INT_LEAST64_MIN
+# define INT_LEAST64_MAX   __INT_LEAST64_MAX
+# define UINT_LEAST64_MAX __UINT_LEAST64_MAX
+# define INT_FAST64_MIN    __INT_LEAST64_MIN
+# define INT_FAST64_MAX    __INT_LEAST64_MAX
+# define UINT_FAST64_MAX  __UINT_LEAST64_MAX
+#endif /* __INT_LEAST64_MIN */
+
+
+#ifdef __INT56_TYPE__
+# define INT56_MAX           INT56_C(36028797018963967)
+# define INT56_MIN         (-INT56_C(36028797018963967)-1)
+# define UINT56_MAX         UINT56_C(72057594037927935)
+# define INT_LEAST56_MIN     INT56_MIN
+# define INT_LEAST56_MAX     INT56_MAX
+# define UINT_LEAST56_MAX   UINT56_MAX
+# define INT_FAST56_MIN      INT56_MIN
+# define INT_FAST56_MAX      INT56_MAX
+# define UINT_FAST56_MAX    UINT56_MAX
+# define __INT_LEAST32_MIN   INT56_MIN
+# define __INT_LEAST32_MAX   INT56_MAX
+# define __UINT_LEAST32_MAX UINT56_MAX
+# define __INT_LEAST16_MIN   INT56_MIN
+# define __INT_LEAST16_MAX   INT56_MAX
+# define __UINT_LEAST16_MAX UINT56_MAX
+# define __INT_LEAST8_MIN    INT56_MIN
+# define __INT_LEAST8_MAX    INT56_MAX
+# define __UINT_LEAST8_MAX  UINT56_MAX
+#endif /* __INT56_TYPE__ */
+
+
+#ifdef __INT48_TYPE__
+# define INT48_MAX           INT48_C(140737488355327)
+# define INT48_MIN         (-INT48_C(140737488355327)-1)
+# define UINT48_MAX         UINT48_C(281474976710655)
+# define INT_LEAST48_MIN     INT48_MIN
+# define INT_LEAST48_MAX     INT48_MAX
+# define UINT_LEAST48_MAX   UINT48_MAX
+# define INT_FAST48_MIN      INT48_MIN
+# define INT_FAST48_MAX      INT48_MAX
+# define UINT_FAST48_MAX    UINT48_MAX
+# define __INT_LEAST32_MIN   INT48_MIN
+# define __INT_LEAST32_MAX   INT48_MAX
+# define __UINT_LEAST32_MAX UINT48_MAX
+# define __INT_LEAST16_MIN   INT48_MIN
+# define __INT_LEAST16_MAX   INT48_MAX
+# define __UINT_LEAST16_MAX UINT48_MAX
+# define __INT_LEAST8_MIN    INT48_MIN
+# define __INT_LEAST8_MAX    INT48_MAX
+# define __UINT_LEAST8_MAX  UINT48_MAX
+#endif /* __INT48_TYPE__ */
+
+
+#ifdef __INT40_TYPE__
+# define INT40_MAX           INT40_C(549755813887)
+# define INT40_MIN         (-INT40_C(549755813887)-1)
+# define UINT40_MAX         UINT40_C(1099511627775)
+# define INT_LEAST40_MIN     INT40_MIN
+# define INT_LEAST40_MAX     INT40_MAX
+# define UINT_LEAST40_MAX   UINT40_MAX
+# define INT_FAST40_MIN      INT40_MIN
+# define INT_FAST40_MAX      INT40_MAX
+# define UINT_FAST40_MAX    UINT40_MAX
+# define __INT_LEAST32_MIN   INT40_MIN
+# define __INT_LEAST32_MAX   INT40_MAX
+# define __UINT_LEAST32_MAX UINT40_MAX
+# define __INT_LEAST16_MIN   INT40_MIN
+# define __INT_LEAST16_MAX   INT40_MAX
+# define __UINT_LEAST16_MAX UINT40_MAX
+# define __INT_LEAST8_MIN    INT40_MIN
+# define __INT_LEAST8_MAX    INT40_MAX
+# define __UINT_LEAST8_MAX  UINT40_MAX
+#endif /* __INT40_TYPE__ */
+
+
+#ifdef __INT32_TYPE__
+# define INT32_MAX           INT32_C(2147483647)
+# define INT32_MIN         (-INT32_C(2147483647)-1)
+# define UINT32_MAX         UINT32_C(4294967295)
+# define __INT_LEAST32_MIN   INT32_MIN
+# define __INT_LEAST32_MAX   INT32_MAX
+# define __UINT_LEAST32_MAX UINT32_MAX
+# define __INT_LEAST16_MIN   INT32_MIN
+# define __INT_LEAST16_MAX   INT32_MAX
+# define __UINT_LEAST16_MAX UINT32_MAX
+# define __INT_LEAST8_MIN    INT32_MIN
+# define __INT_LEAST8_MAX    INT32_MAX
+# define __UINT_LEAST8_MAX  UINT32_MAX
+#endif /* __INT32_TYPE__ */
+
+#ifdef __INT_LEAST32_MIN
+# define INT_LEAST32_MIN   __INT_LEAST32_MIN
+# define INT_LEAST32_MAX   __INT_LEAST32_MAX
+# define UINT_LEAST32_MAX __UINT_LEAST32_MAX
+# define INT_FAST32_MIN    __INT_LEAST32_MIN
+# define INT_FAST32_MAX    __INT_LEAST32_MAX
+# define UINT_FAST32_MAX  __UINT_LEAST32_MAX
+#endif /* __INT_LEAST32_MIN */
+
+
+#ifdef __INT24_TYPE__
+# define INT24_MAX           INT24_C(8388607)
+# define INT24_MIN         (-INT24_C(8388607)-1)
+# define UINT24_MAX         UINT24_C(16777215)
+# define INT_LEAST24_MIN     INT24_MIN
+# define INT_LEAST24_MAX     INT24_MAX
+# define UINT_LEAST24_MAX   UINT24_MAX
+# define INT_FAST24_MIN      INT24_MIN
+# define INT_FAST24_MAX      INT24_MAX
+# define UINT_FAST24_MAX    UINT24_MAX
+# define __INT_LEAST16_MIN   INT24_MIN
+# define __INT_LEAST16_MAX   INT24_MAX
+# define __UINT_LEAST16_MAX UINT24_MAX
+# define __INT_LEAST8_MIN    INT24_MIN
+# define __INT_LEAST8_MAX    INT24_MAX
+# define __UINT_LEAST8_MAX  UINT24_MAX
+#endif /* __INT24_TYPE__ */
+
+
+#ifdef __INT16_TYPE__
+#define INT16_MAX            INT16_C(32767)
+#define INT16_MIN          (-INT16_C(32767)-1)
+#define UINT16_MAX          UINT16_C(65535)
+# define __INT_LEAST16_MIN   INT16_MIN
+# define __INT_LEAST16_MAX   INT16_MAX
+# define __UINT_LEAST16_MAX UINT16_MAX
+# define __INT_LEAST8_MIN    INT16_MIN
+# define __INT_LEAST8_MAX    INT16_MAX
+# define __UINT_LEAST8_MAX  UINT16_MAX
+#endif /* __INT16_TYPE__ */
+
+#ifdef __INT_LEAST16_MIN
+# define INT_LEAST16_MIN   __INT_LEAST16_MIN
+# define INT_LEAST16_MAX   __INT_LEAST16_MAX
+# define UINT_LEAST16_MAX __UINT_LEAST16_MAX
+# define INT_FAST16_MIN    __INT_LEAST16_MIN
+# define INT_FAST16_MAX    __INT_LEAST16_MAX
+# define UINT_FAST16_MAX  __UINT_LEAST16_MAX
+#endif /* __INT_LEAST16_MIN */
+
+
+#ifdef __INT8_TYPE__
+# define INT8_MAX            INT8_C(127)
+# define INT8_MIN          (-INT8_C(127)-1)
+# define UINT8_MAX          UINT8_C(255)
+# define __INT_LEAST8_MIN    INT8_MIN
+# define __INT_LEAST8_MAX    INT8_MAX
+# define __UINT_LEAST8_MAX  UINT8_MAX
+#endif /* __INT8_TYPE__ */
+
+#ifdef __INT_LEAST8_MIN
+# define INT_LEAST8_MIN   __INT_LEAST8_MIN
+# define INT_LEAST8_MAX   __INT_LEAST8_MAX
+# define UINT_LEAST8_MAX __UINT_LEAST8_MAX
+# define INT_FAST8_MIN    __INT_LEAST8_MIN
+# define INT_FAST8_MAX    __INT_LEAST8_MAX
+# define UINT_FAST8_MAX  __UINT_LEAST8_MAX
+#endif /* __INT_LEAST8_MIN */
+
+/* Some utility macros */
+#define  __INTN_MIN(n)  __stdint_join3( INT, n, _MIN)
+#define  __INTN_MAX(n)  __stdint_join3( INT, n, _MAX)
+#define __UINTN_MAX(n)  __stdint_join3(UINT, n, _MAX)
+#define  __INTN_C(n, v) __stdint_join3( INT, n, _C(v))
+#define __UINTN_C(n, v) __stdint_join3(UINT, n, _C(v))
+
+/* C99 7.18.2.4 Limits of integer types capable of holding object pointers. */
+/* C99 7.18.3 Limits of other integer types. */
+
+#define  INTPTR_MIN  __INTN_MIN(__INTPTR_WIDTH__)
+#define  INTPTR_MAX  __INTN_MAX(__INTPTR_WIDTH__)
+#define UINTPTR_MAX __UINTN_MAX(__INTPTR_WIDTH__)
+#define PTRDIFF_MIN  __INTN_MIN(__PTRDIFF_WIDTH__)
+#define PTRDIFF_MAX  __INTN_MAX(__PTRDIFF_WIDTH__)
+#define    SIZE_MAX __UINTN_MAX(__SIZE_WIDTH__)
+
+/* ISO9899:2011 7.20 (C11 Annex K): Define RSIZE_MAX if __STDC_WANT_LIB_EXT1__
+ * is enabled. */
+#if defined(__STDC_WANT_LIB_EXT1__) && __STDC_WANT_LIB_EXT1__ >= 1
+#define   RSIZE_MAX            (SIZE_MAX >> 1)
+#endif
+
+/* C99 7.18.2.5 Limits of greatest-width integer types. */
+#define INTMAX_MIN   __INTN_MIN(__INTMAX_WIDTH__)
+#define INTMAX_MAX   __INTN_MAX(__INTMAX_WIDTH__)
+#define UINTMAX_MAX __UINTN_MAX(__INTMAX_WIDTH__)
+
+/* C99 7.18.3 Limits of other integer types. */
+#define SIG_ATOMIC_MIN __INTN_MIN(__SIG_ATOMIC_WIDTH__)
+#define SIG_ATOMIC_MAX __INTN_MAX(__SIG_ATOMIC_WIDTH__)
+#ifdef __WINT_UNSIGNED__
+# define WINT_MIN       __UINTN_C(__WINT_WIDTH__, 0)
+# define WINT_MAX       __UINTN_MAX(__WINT_WIDTH__)
+#else
+# define WINT_MIN       __INTN_MIN(__WINT_WIDTH__)
+# define WINT_MAX       __INTN_MAX(__WINT_WIDTH__)
+#endif
+
+#ifndef WCHAR_MAX
+# define WCHAR_MAX __WCHAR_MAX__
+#endif
+#ifndef WCHAR_MIN
+# if __WCHAR_MAX__ == __INTN_MAX(__WCHAR_WIDTH__)
+#  define WCHAR_MIN __INTN_MIN(__WCHAR_WIDTH__)
+# else
+#  define WCHAR_MIN __UINTN_C(__WCHAR_WIDTH__, 0)
+# endif
+#endif
+
+/* 7.18.4.2 Macros for greatest-width integer constants. */
+#define INTMAX_C(v)   __INTN_C(__INTMAX_WIDTH__, v)
+#define UINTMAX_C(v) __UINTN_C(__INTMAX_WIDTH__, v)
+
+#endif /* __STDC_HOSTED__ */
+#endif /* __CLANG_STDINT_H */
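
A short sketch of how the INTN_C constant macros defined above are meant to be used; the variable names are invented for illustration:

#include <stdint.h>

/* UINT64_C(1) expands with the target's 64-bit suffix (e.g. 1ULL), so the
 * shift is performed in a 64-bit type rather than in plain int. */
static const uint64_t kTopBit = UINT64_C(1) << 63;

/* int_least16_t always exists: it is int16_t when the target has an exact
 * 16-bit type, otherwise the next wider exact-width type per the cascading
 * __int_leastN_t scheme above. */
static int_least16_t counter = INT16_C(0);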
diff --git a/21.1.2/clang-include/stdnoreturn.h b/21.1.2/clang-include/stdnoreturn.h
new file mode 100644
index 0000000..a7a301d
--- /dev/null
+++ b/21.1.2/clang-include/stdnoreturn.h
@@ -0,0 +1,30 @@
+/*===---- stdnoreturn.h - Standard header for noreturn macro ---------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __STDNORETURN_H
+#define __STDNORETURN_H
+
+#define noreturn _Noreturn
+#define __noreturn_is_defined 1
+
+#endif /* __STDNORETURN_H */
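
Usage is simply (illustrative; the function name die is hypothetical):

#include <stdnoreturn.h>
#include <stdlib.h>

/* `noreturn` expands to the C11 keyword _Noreturn. */
noreturn static void die(void)
{
  abort();
}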
diff --git a/21.1.2/clang-include/tbmintrin.h b/21.1.2/clang-include/tbmintrin.h
new file mode 100644
index 0000000..f95e34f
--- /dev/null
+++ b/21.1.2/clang-include/tbmintrin.h
@@ -0,0 +1,158 @@
+/*===---- tbmintrin.h - TBM intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __TBM__
+#error "TBM instruction set is not enabled"
+#endif
+
+#ifndef __X86INTRIN_H
+#error "Never use <tbmintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __TBMINTRIN_H
+#define __TBMINTRIN_H
+
+#define __bextri_u32(a, b) (__builtin_ia32_bextri_u32((a), (b)))
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+__blcfill_u32(unsigned int a)
+{
+  return a & (a + 1);
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+__blci_u32(unsigned int a)
+{
+  return a | ~(a + 1);
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+__blcic_u32(unsigned int a)
+{
+  return ~a & (a + 1);
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+__blcmsk_u32(unsigned int a)
+{
+  return a ^ (a + 1);
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+__blcs_u32(unsigned int a)
+{
+  return a | (a + 1);
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+__blsfill_u32(unsigned int a)
+{
+  return a | (a - 1);
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+__blsic_u32(unsigned int a)
+{
+  return ~a | (a - 1);
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+__t1mskc_u32(unsigned int a)
+{
+  return ~a | (a + 1);
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+__tzmsk_u32(unsigned int a)
+{
+  return ~a & (a - 1);
+}
+
+#ifdef __x86_64__
+#define __bextri_u64(a, b) (__builtin_ia32_bextri_u64((a), (int)(b)))
+
+static __inline__ unsigned long long __attribute__((__always_inline__,
+                                                    __nodebug__))
+__blcfill_u64(unsigned long long a)
+{
+  return a & (a + 1);
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__,
+                                                    __nodebug__))
+__blci_u64(unsigned long long a)
+{
+  return a | ~(a + 1);
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__,
+                                                    __nodebug__))
+__blcic_u64(unsigned long long a)
+{
+  return ~a & (a + 1);
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__,
+                                                    __nodebug__))
+__blcmsk_u64(unsigned long long a)
+{
+  return a ^ (a + 1);
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__,
+                                                    __nodebug__))
+__blcs_u64(unsigned long long a)
+{
+  return a | (a + 1);
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__,
+                                                    __nodebug__))
+__blsfill_u64(unsigned long long a)
+{
+  return a | (a - 1);
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__,
+                                                    __nodebug__))
+__blsic_u64(unsigned long long a)
+{
+  return ~a | (a - 1);
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__,
+                                                    __nodebug__))
+__t1mskc_u64(unsigned long long a)
+{
+  return ~a | (a + 1);
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__,
+                                                    __nodebug__))
+__tzmsk_u64(unsigned long long a)
+{
+  return ~a & (a - 1);
+}
+#endif
+
+#endif /* __TBMINTRIN_H */
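
Each TBM helper above is a one-line bit trick; for instance __blsfill_u32 fills in every bit below the lowest set bit. A sketch of the arithmetic, written out independently of the intrinsic:

/* a           = 0b00101000
 * a - 1       = 0b00100111
 * a | (a - 1) = 0b00101111   <- what __blsfill_u32(a) computes */
static unsigned int blsfill_by_hand(unsigned int a)
{
  return a | (a - 1u);
}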
diff --git a/21.1.2/clang-include/tgmath.h b/21.1.2/clang-include/tgmath.h
new file mode 100644
index 0000000..a48e267
--- /dev/null
+++ b/21.1.2/clang-include/tgmath.h
@@ -0,0 +1,1374 @@
+/*===---- tgmath.h - Standard header for type generic math ----------------===*\
+ *
+ * Copyright (c) 2009 Howard Hinnant
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+\*===----------------------------------------------------------------------===*/
+
+#ifndef __TGMATH_H
+#define __TGMATH_H
+
+/* C99 7.22 Type-generic math <tgmath.h>. */
+#include <math.h>
+
+/* C++ handles type genericity with overloading in math.h. */
+#ifndef __cplusplus
+#include <complex.h>
+
+#define _TG_ATTRSp __attribute__((__overloadable__))
+#define _TG_ATTRS __attribute__((__overloadable__, __always_inline__))
+
+// promotion
+
+typedef void _Argument_type_is_not_arithmetic;
+static _Argument_type_is_not_arithmetic __tg_promote(...)
+  __attribute__((__unavailable__,__overloadable__));
+static double               _TG_ATTRSp __tg_promote(int);
+static double               _TG_ATTRSp __tg_promote(unsigned int);
+static double               _TG_ATTRSp __tg_promote(long);
+static double               _TG_ATTRSp __tg_promote(unsigned long);
+static double               _TG_ATTRSp __tg_promote(long long);
+static double               _TG_ATTRSp __tg_promote(unsigned long long);
+static float                _TG_ATTRSp __tg_promote(float);
+static double               _TG_ATTRSp __tg_promote(double);
+static long double          _TG_ATTRSp __tg_promote(long double);
+static float _Complex       _TG_ATTRSp __tg_promote(float _Complex);
+static double _Complex      _TG_ATTRSp __tg_promote(double _Complex);
+static long double _Complex _TG_ATTRSp __tg_promote(long double _Complex);
+
+#define __tg_promote1(__x)           (__typeof__(__tg_promote(__x)))
+#define __tg_promote2(__x, __y)      (__typeof__(__tg_promote(__x) + \
+                                                 __tg_promote(__y)))
+#define __tg_promote3(__x, __y, __z) (__typeof__(__tg_promote(__x) + \
+                                                 __tg_promote(__y) + \
+                                                 __tg_promote(__z)))
+
+// acos
+
+static float
+    _TG_ATTRS
+    __tg_acos(float __x) {return acosf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_acos(double __x) {return acos(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_acos(long double __x) {return acosl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_acos(float _Complex __x) {return cacosf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_acos(double _Complex __x) {return cacos(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_acos(long double _Complex __x) {return cacosl(__x);}
+
+#undef acos
+#define acos(__x) __tg_acos(__tg_promote1((__x))(__x))
+
+// asin
+
+static float
+    _TG_ATTRS
+    __tg_asin(float __x) {return asinf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_asin(double __x) {return asin(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_asin(long double __x) {return asinl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_asin(float _Complex __x) {return casinf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_asin(double _Complex __x) {return casin(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_asin(long double _Complex __x) {return casinl(__x);}
+
+#undef asin
+#define asin(__x) __tg_asin(__tg_promote1((__x))(__x))
+
+// atan
+
+static float
+    _TG_ATTRS
+    __tg_atan(float __x) {return atanf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_atan(double __x) {return atan(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_atan(long double __x) {return atanl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_atan(float _Complex __x) {return catanf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_atan(double _Complex __x) {return catan(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_atan(long double _Complex __x) {return catanl(__x);}
+
+#undef atan
+#define atan(__x) __tg_atan(__tg_promote1((__x))(__x))
+
+// acosh
+
+static float
+    _TG_ATTRS
+    __tg_acosh(float __x) {return acoshf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_acosh(double __x) {return acosh(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_acosh(long double __x) {return acoshl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_acosh(float _Complex __x) {return cacoshf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_acosh(double _Complex __x) {return cacosh(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_acosh(long double _Complex __x) {return cacoshl(__x);}
+
+#undef acosh
+#define acosh(__x) __tg_acosh(__tg_promote1((__x))(__x))
+
+// asinh
+
+static float
+    _TG_ATTRS
+    __tg_asinh(float __x) {return asinhf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_asinh(double __x) {return asinh(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_asinh(long double __x) {return asinhl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_asinh(float _Complex __x) {return casinhf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_asinh(double _Complex __x) {return casinh(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_asinh(long double _Complex __x) {return casinhl(__x);}
+
+#undef asinh
+#define asinh(__x) __tg_asinh(__tg_promote1((__x))(__x))
+
+// atanh
+
+static float
+    _TG_ATTRS
+    __tg_atanh(float __x) {return atanhf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_atanh(double __x) {return atanh(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_atanh(long double __x) {return atanhl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_atanh(float _Complex __x) {return catanhf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_atanh(double _Complex __x) {return catanh(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_atanh(long double _Complex __x) {return catanhl(__x);}
+
+#undef atanh
+#define atanh(__x) __tg_atanh(__tg_promote1((__x))(__x))
+
+// cos
+
+static float
+    _TG_ATTRS
+    __tg_cos(float __x) {return cosf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_cos(double __x) {return cos(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_cos(long double __x) {return cosl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_cos(float _Complex __x) {return ccosf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_cos(double _Complex __x) {return ccos(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_cos(long double _Complex __x) {return ccosl(__x);}
+
+#undef cos
+#define cos(__x) __tg_cos(__tg_promote1((__x))(__x))
+
+// sin
+
+static float
+    _TG_ATTRS
+    __tg_sin(float __x) {return sinf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_sin(double __x) {return sin(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_sin(long double __x) {return sinl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_sin(float _Complex __x) {return csinf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_sin(double _Complex __x) {return csin(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_sin(long double _Complex __x) {return csinl(__x);}
+
+#undef sin
+#define sin(__x) __tg_sin(__tg_promote1((__x))(__x))
+
+// tan
+
+static float
+    _TG_ATTRS
+    __tg_tan(float __x) {return tanf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_tan(double __x) {return tan(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_tan(long double __x) {return tanl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_tan(float _Complex __x) {return ctanf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_tan(double _Complex __x) {return ctan(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_tan(long double _Complex __x) {return ctanl(__x);}
+
+#undef tan
+#define tan(__x) __tg_tan(__tg_promote1((__x))(__x))
+
+// cosh
+
+static float
+    _TG_ATTRS
+    __tg_cosh(float __x) {return coshf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_cosh(double __x) {return cosh(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_cosh(long double __x) {return coshl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_cosh(float _Complex __x) {return ccoshf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_cosh(double _Complex __x) {return ccosh(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_cosh(long double _Complex __x) {return ccoshl(__x);}
+
+#undef cosh
+#define cosh(__x) __tg_cosh(__tg_promote1((__x))(__x))
+
+// sinh
+
+static float
+    _TG_ATTRS
+    __tg_sinh(float __x) {return sinhf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_sinh(double __x) {return sinh(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_sinh(long double __x) {return sinhl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_sinh(float _Complex __x) {return csinhf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_sinh(double _Complex __x) {return csinh(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_sinh(long double _Complex __x) {return csinhl(__x);}
+
+#undef sinh
+#define sinh(__x) __tg_sinh(__tg_promote1((__x))(__x))
+
+// tanh
+
+static float
+    _TG_ATTRS
+    __tg_tanh(float __x) {return tanhf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_tanh(double __x) {return tanh(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_tanh(long double __x) {return tanhl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_tanh(float _Complex __x) {return ctanhf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_tanh(double _Complex __x) {return ctanh(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_tanh(long double _Complex __x) {return ctanhl(__x);}
+
+#undef tanh
+#define tanh(__x) __tg_tanh(__tg_promote1((__x))(__x))
+
+// exp
+
+static float
+    _TG_ATTRS
+    __tg_exp(float __x) {return expf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_exp(double __x) {return exp(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_exp(long double __x) {return expl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_exp(float _Complex __x) {return cexpf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_exp(double _Complex __x) {return cexp(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_exp(long double _Complex __x) {return cexpl(__x);}
+
+#undef exp
+#define exp(__x) __tg_exp(__tg_promote1((__x))(__x))
+
+// log
+
+static float
+    _TG_ATTRS
+    __tg_log(float __x) {return logf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_log(double __x) {return log(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_log(long double __x) {return logl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_log(float _Complex __x) {return clogf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_log(double _Complex __x) {return clog(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_log(long double _Complex __x) {return clogl(__x);}
+
+#undef log
+#define log(__x) __tg_log(__tg_promote1((__x))(__x))
+
+// pow
+
+static float
+    _TG_ATTRS
+    __tg_pow(float __x, float __y) {return powf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_pow(double __x, double __y) {return pow(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_pow(long double __x, long double __y) {return powl(__x, __y);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_pow(float _Complex __x, float _Complex __y) {return cpowf(__x, __y);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_pow(double _Complex __x, double _Complex __y) {return cpow(__x, __y);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_pow(long double _Complex __x, long double _Complex __y) 
+    {return cpowl(__x, __y);}
+
+#undef pow
+#define pow(__x, __y) __tg_pow(__tg_promote2((__x), (__y))(__x), \
+                               __tg_promote2((__x), (__y))(__y))
+
+// sqrt
+
+static float
+    _TG_ATTRS
+    __tg_sqrt(float __x) {return sqrtf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_sqrt(double __x) {return sqrt(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_sqrt(long double __x) {return sqrtl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_sqrt(float _Complex __x) {return csqrtf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_sqrt(double _Complex __x) {return csqrt(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_sqrt(long double _Complex __x) {return csqrtl(__x);}
+
+#undef sqrt
+#define sqrt(__x) __tg_sqrt(__tg_promote1((__x))(__x))
+
+// fabs
+
+static float
+    _TG_ATTRS
+    __tg_fabs(float __x) {return fabsf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_fabs(double __x) {return fabs(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_fabs(long double __x) {return fabsl(__x);}
+
+static float
+    _TG_ATTRS
+    __tg_fabs(float _Complex __x) {return cabsf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_fabs(double _Complex __x) {return cabs(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_fabs(long double _Complex __x) {return cabsl(__x);}
+
+#undef fabs
+#define fabs(__x) __tg_fabs(__tg_promote1((__x))(__x))
+
+// atan2
+
+static float
+    _TG_ATTRS
+    __tg_atan2(float __x, float __y) {return atan2f(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_atan2(double __x, double __y) {return atan2(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_atan2(long double __x, long double __y) {return atan2l(__x, __y);}
+
+#undef atan2
+#define atan2(__x, __y) __tg_atan2(__tg_promote2((__x), (__y))(__x), \
+                                   __tg_promote2((__x), (__y))(__y))
+
+// cbrt
+
+static float
+    _TG_ATTRS
+    __tg_cbrt(float __x) {return cbrtf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_cbrt(double __x) {return cbrt(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_cbrt(long double __x) {return cbrtl(__x);}
+
+#undef cbrt
+#define cbrt(__x) __tg_cbrt(__tg_promote1((__x))(__x))
+
+// ceil
+
+static float
+    _TG_ATTRS
+    __tg_ceil(float __x) {return ceilf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_ceil(double __x) {return ceil(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_ceil(long double __x) {return ceill(__x);}
+
+#undef ceil
+#define ceil(__x) __tg_ceil(__tg_promote1((__x))(__x))
+
+// copysign
+
+static float
+    _TG_ATTRS
+    __tg_copysign(float __x, float __y) {return copysignf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_copysign(double __x, double __y) {return copysign(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_copysign(long double __x, long double __y) {return copysignl(__x, __y);}
+
+#undef copysign
+#define copysign(__x, __y) __tg_copysign(__tg_promote2((__x), (__y))(__x), \
+                                         __tg_promote2((__x), (__y))(__y))
+
+// erf
+
+static float
+    _TG_ATTRS
+    __tg_erf(float __x) {return erff(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_erf(double __x) {return erf(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_erf(long double __x) {return erfl(__x);}
+
+#undef erf
+#define erf(__x) __tg_erf(__tg_promote1((__x))(__x))
+
+// erfc
+
+static float
+    _TG_ATTRS
+    __tg_erfc(float __x) {return erfcf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_erfc(double __x) {return erfc(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_erfc(long double __x) {return erfcl(__x);}
+
+#undef erfc
+#define erfc(__x) __tg_erfc(__tg_promote1((__x))(__x))
+
+// exp2
+
+static float
+    _TG_ATTRS
+    __tg_exp2(float __x) {return exp2f(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_exp2(double __x) {return exp2(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_exp2(long double __x) {return exp2l(__x);}
+
+#undef exp2
+#define exp2(__x) __tg_exp2(__tg_promote1((__x))(__x))
+
+// expm1
+
+static float
+    _TG_ATTRS
+    __tg_expm1(float __x) {return expm1f(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_expm1(double __x) {return expm1(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_expm1(long double __x) {return expm1l(__x);}
+
+#undef expm1
+#define expm1(__x) __tg_expm1(__tg_promote1((__x))(__x))
+
+// fdim
+
+static float
+    _TG_ATTRS
+    __tg_fdim(float __x, float __y) {return fdimf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_fdim(double __x, double __y) {return fdim(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_fdim(long double __x, long double __y) {return fdiml(__x, __y);}
+
+#undef fdim
+#define fdim(__x, __y) __tg_fdim(__tg_promote2((__x), (__y))(__x), \
+                                 __tg_promote2((__x), (__y))(__y))
+
+// floor
+
+static float
+    _TG_ATTRS
+    __tg_floor(float __x) {return floorf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_floor(double __x) {return floor(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_floor(long double __x) {return floorl(__x);}
+
+#undef floor
+#define floor(__x) __tg_floor(__tg_promote1((__x))(__x))
+
+// fma
+
+static float
+    _TG_ATTRS
+    __tg_fma(float __x, float __y, float __z)
+    {return fmaf(__x, __y, __z);}
+
+static double
+    _TG_ATTRS
+    __tg_fma(double __x, double __y, double __z)
+    {return fma(__x, __y, __z);}
+
+static long double
+    _TG_ATTRS
+    __tg_fma(long double __x,long double __y, long double __z)
+    {return fmal(__x, __y, __z);}
+
+#undef fma
+#define fma(__x, __y, __z)                                \
+        __tg_fma(__tg_promote3((__x), (__y), (__z))(__x), \
+                 __tg_promote3((__x), (__y), (__z))(__y), \
+                 __tg_promote3((__x), (__y), (__z))(__z))
+
+// fmax
+
+static float
+    _TG_ATTRS
+    __tg_fmax(float __x, float __y) {return fmaxf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_fmax(double __x, double __y) {return fmax(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_fmax(long double __x, long double __y) {return fmaxl(__x, __y);}
+
+#undef fmax
+#define fmax(__x, __y) __tg_fmax(__tg_promote2((__x), (__y))(__x), \
+                                 __tg_promote2((__x), (__y))(__y))
+
+// fmin
+
+static float
+    _TG_ATTRS
+    __tg_fmin(float __x, float __y) {return fminf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_fmin(double __x, double __y) {return fmin(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_fmin(long double __x, long double __y) {return fminl(__x, __y);}
+
+#undef fmin
+#define fmin(__x, __y) __tg_fmin(__tg_promote2((__x), (__y))(__x), \
+                                 __tg_promote2((__x), (__y))(__y))
+
+// fmod
+
+static float
+    _TG_ATTRS
+    __tg_fmod(float __x, float __y) {return fmodf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_fmod(double __x, double __y) {return fmod(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_fmod(long double __x, long double __y) {return fmodl(__x, __y);}
+
+#undef fmod
+#define fmod(__x, __y) __tg_fmod(__tg_promote2((__x), (__y))(__x), \
+                                 __tg_promote2((__x), (__y))(__y))
+
+// frexp
+
+static float
+    _TG_ATTRS
+    __tg_frexp(float __x, int* __y) {return frexpf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_frexp(double __x, int* __y) {return frexp(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_frexp(long double __x, int* __y) {return frexpl(__x, __y);}
+
+#undef frexp
+#define frexp(__x, __y) __tg_frexp(__tg_promote1((__x))(__x), __y)
+
+// hypot
+
+static float
+    _TG_ATTRS
+    __tg_hypot(float __x, float __y) {return hypotf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_hypot(double __x, double __y) {return hypot(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_hypot(long double __x, long double __y) {return hypotl(__x, __y);}
+
+#undef hypot
+#define hypot(__x, __y) __tg_hypot(__tg_promote2((__x), (__y))(__x), \
+                                   __tg_promote2((__x), (__y))(__y))
+
+// ilogb
+
+static int
+    _TG_ATTRS
+    __tg_ilogb(float __x) {return ilogbf(__x);}
+
+static int
+    _TG_ATTRS
+    __tg_ilogb(double __x) {return ilogb(__x);}
+
+static int
+    _TG_ATTRS
+    __tg_ilogb(long double __x) {return ilogbl(__x);}
+
+#undef ilogb
+#define ilogb(__x) __tg_ilogb(__tg_promote1((__x))(__x))
+
+// ldexp
+
+static float
+    _TG_ATTRS
+    __tg_ldexp(float __x, int __y) {return ldexpf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_ldexp(double __x, int __y) {return ldexp(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_ldexp(long double __x, int __y) {return ldexpl(__x, __y);}
+
+#undef ldexp
+#define ldexp(__x, __y) __tg_ldexp(__tg_promote1((__x))(__x), __y)
+
+// lgamma
+
+static float
+    _TG_ATTRS
+    __tg_lgamma(float __x) {return lgammaf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_lgamma(double __x) {return lgamma(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_lgamma(long double __x) {return lgammal(__x);}
+
+#undef lgamma
+#define lgamma(__x) __tg_lgamma(__tg_promote1((__x))(__x))
+
+// llrint
+
+static long long
+    _TG_ATTRS
+    __tg_llrint(float __x) {return llrintf(__x);}
+
+static long long
+    _TG_ATTRS
+    __tg_llrint(double __x) {return llrint(__x);}
+
+static long long
+    _TG_ATTRS
+    __tg_llrint(long double __x) {return llrintl(__x);}
+
+#undef llrint
+#define llrint(__x) __tg_llrint(__tg_promote1((__x))(__x))
+
+// llround
+
+static long long
+    _TG_ATTRS
+    __tg_llround(float __x) {return llroundf(__x);}
+
+static long long
+    _TG_ATTRS
+    __tg_llround(double __x) {return llround(__x);}
+
+static long long
+    _TG_ATTRS
+    __tg_llround(long double __x) {return llroundl(__x);}
+
+#undef llround
+#define llround(__x) __tg_llround(__tg_promote1((__x))(__x))
+
+// log10
+
+static float
+    _TG_ATTRS
+    __tg_log10(float __x) {return log10f(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_log10(double __x) {return log10(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_log10(long double __x) {return log10l(__x);}
+
+#undef log10
+#define log10(__x) __tg_log10(__tg_promote1((__x))(__x))
+
+// log1p
+
+static float
+    _TG_ATTRS
+    __tg_log1p(float __x) {return log1pf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_log1p(double __x) {return log1p(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_log1p(long double __x) {return log1pl(__x);}
+
+#undef log1p
+#define log1p(__x) __tg_log1p(__tg_promote1((__x))(__x))
+
+// log2
+
+static float
+    _TG_ATTRS
+    __tg_log2(float __x) {return log2f(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_log2(double __x) {return log2(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_log2(long double __x) {return log2l(__x);}
+
+#undef log2
+#define log2(__x) __tg_log2(__tg_promote1((__x))(__x))
+
+// logb
+
+static float
+    _TG_ATTRS
+    __tg_logb(float __x) {return logbf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_logb(double __x) {return logb(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_logb(long double __x) {return logbl(__x);}
+
+#undef logb
+#define logb(__x) __tg_logb(__tg_promote1((__x))(__x))
+
+// lrint
+
+static long
+    _TG_ATTRS
+    __tg_lrint(float __x) {return lrintf(__x);}
+
+static long
+    _TG_ATTRS
+    __tg_lrint(double __x) {return lrint(__x);}
+
+static long
+    _TG_ATTRS
+    __tg_lrint(long double __x) {return lrintl(__x);}
+
+#undef lrint
+#define lrint(__x) __tg_lrint(__tg_promote1((__x))(__x))
+
+// lround
+
+static long
+    _TG_ATTRS
+    __tg_lround(float __x) {return lroundf(__x);}
+
+static long
+    _TG_ATTRS
+    __tg_lround(double __x) {return lround(__x);}
+
+static long
+    _TG_ATTRS
+    __tg_lround(long double __x) {return lroundl(__x);}
+
+#undef lround
+#define lround(__x) __tg_lround(__tg_promote1((__x))(__x))
+
+// nearbyint
+
+static float
+    _TG_ATTRS
+    __tg_nearbyint(float __x) {return nearbyintf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_nearbyint(double __x) {return nearbyint(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_nearbyint(long double __x) {return nearbyintl(__x);}
+
+#undef nearbyint
+#define nearbyint(__x) __tg_nearbyint(__tg_promote1((__x))(__x))
+
+// nextafter
+
+static float
+    _TG_ATTRS
+    __tg_nextafter(float __x, float __y) {return nextafterf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_nextafter(double __x, double __y) {return nextafter(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_nextafter(long double __x, long double __y) {return nextafterl(__x, __y);}
+
+#undef nextafter
+#define nextafter(__x, __y) __tg_nextafter(__tg_promote2((__x), (__y))(__x), \
+                                           __tg_promote2((__x), (__y))(__y))
+
+// nexttoward
+
+static float
+    _TG_ATTRS
+    __tg_nexttoward(float __x, long double __y) {return nexttowardf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_nexttoward(double __x, long double __y) {return nexttoward(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_nexttoward(long double __x, long double __y) {return nexttowardl(__x, __y);}
+
+#undef nexttoward
+#define nexttoward(__x, __y) __tg_nexttoward(__tg_promote1((__x))(__x), (__y))
+
+// remainder
+
+static float
+    _TG_ATTRS
+    __tg_remainder(float __x, float __y) {return remainderf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_remainder(double __x, double __y) {return remainder(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_remainder(long double __x, long double __y) {return remainderl(__x, __y);}
+
+#undef remainder
+#define remainder(__x, __y) __tg_remainder(__tg_promote2((__x), (__y))(__x), \
+                                           __tg_promote2((__x), (__y))(__y))
+
+// remquo
+
+static float
+    _TG_ATTRS
+    __tg_remquo(float __x, float __y, int* __z)
+    {return remquof(__x, __y, __z);}
+
+static double
+    _TG_ATTRS
+    __tg_remquo(double __x, double __y, int* __z)
+    {return remquo(__x, __y, __z);}
+
+static long double
+    _TG_ATTRS
+    __tg_remquo(long double __x,long double __y, int* __z)
+    {return remquol(__x, __y, __z);}
+
+#undef remquo
+#define remquo(__x, __y, __z)                         \
+        __tg_remquo(__tg_promote2((__x), (__y))(__x), \
+                    __tg_promote2((__x), (__y))(__y), \
+                    (__z))
+
+// rint
+
+static float
+    _TG_ATTRS
+    __tg_rint(float __x) {return rintf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_rint(double __x) {return rint(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_rint(long double __x) {return rintl(__x);}
+
+#undef rint
+#define rint(__x) __tg_rint(__tg_promote1((__x))(__x))
+
+// round
+
+static float
+    _TG_ATTRS
+    __tg_round(float __x) {return roundf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_round(double __x) {return round(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_round(long double __x) {return roundl(__x);}
+
+#undef round
+#define round(__x) __tg_round(__tg_promote1((__x))(__x))
+
+// scalbn
+
+static float
+    _TG_ATTRS
+    __tg_scalbn(float __x, int __y) {return scalbnf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_scalbn(double __x, int __y) {return scalbn(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_scalbn(long double __x, int __y) {return scalbnl(__x, __y);}
+
+#undef scalbn
+#define scalbn(__x, __y) __tg_scalbn(__tg_promote1((__x))(__x), __y)
+
+// scalbln
+
+static float
+    _TG_ATTRS
+    __tg_scalbln(float __x, long __y) {return scalblnf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_scalbln(double __x, long __y) {return scalbln(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_scalbln(long double __x, long __y) {return scalblnl(__x, __y);}
+
+#undef scalbln
+#define scalbln(__x, __y) __tg_scalbln(__tg_promote1((__x))(__x), __y)
+
+// tgamma
+
+static float
+    _TG_ATTRS
+    __tg_tgamma(float __x) {return tgammaf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_tgamma(double __x) {return tgamma(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_tgamma(long double __x) {return tgammal(__x);}
+
+#undef tgamma
+#define tgamma(__x) __tg_tgamma(__tg_promote1((__x))(__x))
+
+// trunc
+
+static float
+    _TG_ATTRS
+    __tg_trunc(float __x) {return truncf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_trunc(double __x) {return trunc(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_trunc(long double __x) {return truncl(__x);}
+
+#undef trunc
+#define trunc(__x) __tg_trunc(__tg_promote1((__x))(__x))
+
+// carg
+
+static float
+    _TG_ATTRS
+    __tg_carg(float __x) {return atan2f(0.F, __x);}
+
+static double
+    _TG_ATTRS
+    __tg_carg(double __x) {return atan2(0., __x);}
+
+static long double
+    _TG_ATTRS
+    __tg_carg(long double __x) {return atan2l(0.L, __x);}
+
+static float
+    _TG_ATTRS
+    __tg_carg(float _Complex __x) {return cargf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_carg(double _Complex __x) {return carg(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_carg(long double _Complex __x) {return cargl(__x);}
+
+#undef carg
+#define carg(__x) __tg_carg(__tg_promote1((__x))(__x))
+
+// cimag
+
+static float
+    _TG_ATTRS
+    __tg_cimag(float __x) {return 0;}
+
+static double
+    _TG_ATTRS
+    __tg_cimag(double __x) {return 0;}
+
+static long double
+    _TG_ATTRS
+    __tg_cimag(long double __x) {return 0;}
+
+static float
+    _TG_ATTRS
+    __tg_cimag(float _Complex __x) {return cimagf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_cimag(double _Complex __x) {return cimag(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_cimag(long double _Complex __x) {return cimagl(__x);}
+
+#undef cimag
+#define cimag(__x) __tg_cimag(__tg_promote1((__x))(__x))
+
+// conj
+
+static float _Complex
+    _TG_ATTRS
+    __tg_conj(float __x) {return __x;}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_conj(double __x) {return __x;}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_conj(long double __x) {return __x;}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_conj(float _Complex __x) {return conjf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_conj(double _Complex __x) {return conj(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_conj(long double _Complex __x) {return conjl(__x);}
+
+#undef conj
+#define conj(__x) __tg_conj(__tg_promote1((__x))(__x))
+
+// cproj
+
+static float _Complex
+    _TG_ATTRS
+    __tg_cproj(float __x) {return cprojf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_cproj(double __x) {return cproj(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_cproj(long double __x) {return cprojl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_cproj(float _Complex __x) {return cprojf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_cproj(double _Complex __x) {return cproj(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_cproj(long double _Complex __x) {return cprojl(__x);}
+
+#undef cproj
+#define cproj(__x) __tg_cproj(__tg_promote1((__x))(__x))
+
+// creal
+
+static float
+    _TG_ATTRS
+    __tg_creal(float __x) {return __x;}
+
+static double
+    _TG_ATTRS
+    __tg_creal(double __x) {return __x;}
+
+static long double
+    _TG_ATTRS
+    __tg_creal(long double __x) {return __x;}
+
+static float
+    _TG_ATTRS
+    __tg_creal(float _Complex __x) {return crealf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_creal(double _Complex __x) {return creal(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_creal(long double _Complex __x) {return creall(__x);}
+
+#undef creal
+#define creal(__x) __tg_creal(__tg_promote1((__x))(__x))
+
+#undef _TG_ATTRSp
+#undef _TG_ATTRS
+
+#endif /* __cplusplus */
+#endif /* __TGMATH_H */
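
The tgmath.h body above forwards each type-generic macro to a family of __tg_* helpers overloaded on real and _Complex operands at float, double and long double precision, with __tg_promote1/__tg_promote2/__tg_promote3 (defined earlier in the header) computing the promoted argument type. A minimal caller-side sketch of the resulting behaviour, assuming a hosted environment with libm:

/* sketch: the type-generic macros dispatch on the promoted argument type */
#include <tgmath.h>
#include <complex.h>
#include <stdio.h>

int main(void) {
    float f = 0.5f;
    double _Complex z = 1.0 + 2.0 * I;
    /* sin(f) resolves to the float overload (sinf); sin(z) to csin */
    printf("%f\n", (double)sin(f));
    printf("%f %+fi\n", creal(sin(z)), cimag(sin(z)));
    return 0;
}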
diff --git a/21.1.2/clang-include/tmmintrin.h b/21.1.2/clang-include/tmmintrin.h
new file mode 100644
index 0000000..4238f5b
--- /dev/null
+++ b/21.1.2/clang-include/tmmintrin.h
@@ -0,0 +1,225 @@
+/*===---- tmmintrin.h - SSSE3 intrinsics -----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+ 
+#ifndef __TMMINTRIN_H
+#define __TMMINTRIN_H
+
+#ifndef __SSSE3__
+#error "SSSE3 instruction set not enabled"
+#else
+
+#include <pmmintrin.h>
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_abs_pi8(__m64 __a)
+{
+    return (__m64)__builtin_ia32_pabsb((__v8qi)__a);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_abs_epi8(__m128i __a)
+{
+    return (__m128i)__builtin_ia32_pabsb128((__v16qi)__a);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_abs_pi16(__m64 __a)
+{
+    return (__m64)__builtin_ia32_pabsw((__v4hi)__a);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_abs_epi16(__m128i __a)
+{
+    return (__m128i)__builtin_ia32_pabsw128((__v8hi)__a);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_abs_pi32(__m64 __a)
+{
+    return (__m64)__builtin_ia32_pabsd((__v2si)__a);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_abs_epi32(__m128i __a)
+{
+    return (__m128i)__builtin_ia32_pabsd128((__v4si)__a);
+}
+
+#define _mm_alignr_epi8(a, b, n) __extension__ ({ \
+  __m128i __a = (a); \
+  __m128i __b = (b); \
+  (__m128i)__builtin_ia32_palignr128((__v16qi)__a, (__v16qi)__b, (n)); })
+
+#define _mm_alignr_pi8(a, b, n) __extension__ ({ \
+  __m64 __a = (a); \
+  __m64 __b = (b); \
+  (__m64)__builtin_ia32_palignr((__v8qi)__a, (__v8qi)__b, (n)); })
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_hadd_epi16(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_phaddw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_hadd_epi32(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_phaddd128((__v4si)__a, (__v4si)__b);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_hadd_pi16(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_phaddw((__v4hi)__a, (__v4hi)__b);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_hadd_pi32(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_phaddd((__v2si)__a, (__v2si)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_hadds_epi16(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_phaddsw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_hadds_pi16(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_phaddsw((__v4hi)__a, (__v4hi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_hsub_epi16(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_phsubw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_hsub_epi32(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_phsubd128((__v4si)__a, (__v4si)__b);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_hsub_pi16(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_phsubw((__v4hi)__a, (__v4hi)__b);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_hsub_pi32(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_phsubd((__v2si)__a, (__v2si)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_hsubs_epi16(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_phsubsw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_hsubs_pi16(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_phsubsw((__v4hi)__a, (__v4hi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_maddubs_epi16(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_maddubs_pi16(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_pmaddubsw((__v8qi)__a, (__v8qi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_mulhrs_epi16(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_mulhrs_pi16(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_pmulhrsw((__v4hi)__a, (__v4hi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_shuffle_epi8(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_pshufb128((__v16qi)__a, (__v16qi)__b);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_shuffle_pi8(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_pshufb((__v8qi)__a, (__v8qi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sign_epi8(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_psignb128((__v16qi)__a, (__v16qi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sign_epi16(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_psignw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sign_epi32(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_psignd128((__v4si)__a, (__v4si)__b);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_sign_pi8(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_psignb((__v8qi)__a, (__v8qi)__b);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_sign_pi16(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_psignw((__v4hi)__a, (__v4hi)__b);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_sign_pi32(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_psignd((__v2si)__a, (__v2si)__b);
+}
+
+#endif /* __SSSE3__ */
+
+#endif /* __TMMINTRIN_H */
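
tmmintrin.h above wraps the SSSE3 operations (absolute value, horizontal add/sub, multiply-add, shuffle, sign) around the corresponding __builtin_ia32_* builtins. A minimal caller-side sketch, assuming an x86 target compiled with -mssse3 so that __SSSE3__ is defined:

/* byte-reverse a 128-bit vector with the SSSE3 shuffle intrinsic */
#include <tmmintrin.h>
#include <stdio.h>

int main(void) {
    __m128i v    = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8,
                                 7,  6,  5,  4,  3,  2, 1, 0);
    __m128i mask = _mm_set_epi8( 0,  1,  2,  3,  4,  5,  6, 7,
                                 8,  9, 10, 11, 12, 13, 14, 15);
    __m128i r = _mm_shuffle_epi8(v, mask);   /* r[i] = v[mask[i] & 0x0f] */
    unsigned char out[16];
    _mm_storeu_si128((__m128i *)out, r);
    for (int i = 0; i < 16; ++i)
        printf("%u ", out[i]);               /* prints 15 14 ... 1 0 */
    printf("\n");
    return 0;
}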
diff --git a/21.1.2/clang-include/unwind.h b/21.1.2/clang-include/unwind.h
new file mode 100644
index 0000000..685c1df
--- /dev/null
+++ b/21.1.2/clang-include/unwind.h
@@ -0,0 +1,280 @@
+/*===---- unwind.h - Stack unwinding ----------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/* See "Data Definitions for libgcc_s" in the Linux Standard Base.*/
+
+#ifndef __CLANG_UNWIND_H
+#define __CLANG_UNWIND_H
+
+#if __has_include_next(<unwind.h>)
+/* Darwin (from 11.x on) and libunwind provide an unwind.h. If that's available,
+ * use it. libunwind wraps some of its definitions in #ifdef _GNU_SOURCE,
+ * so define that around the include.*/
+# ifndef _GNU_SOURCE
+#  define _SHOULD_UNDEFINE_GNU_SOURCE
+#  define _GNU_SOURCE
+# endif
+// libunwind's unwind.h reflects the current visibility.  However, Mozilla
+// builds with -fvisibility=hidden and relies on gcc's unwind.h to reset the
+// visibility to default and export its contents.  gcc also allows users to
+// override its override by #defining HIDE_EXPORTS (but note, this only obeys
+// the user's -fvisibility setting; it doesn't hide any exports on its own).  We
+// imitate gcc's header here:
+# ifdef HIDE_EXPORTS
+#  include_next <unwind.h>
+# else
+#  pragma GCC visibility push(default)
+#  include_next <unwind.h>
+#  pragma GCC visibility pop
+# endif
+# ifdef _SHOULD_UNDEFINE_GNU_SOURCE
+#  undef _GNU_SOURCE
+#  undef _SHOULD_UNDEFINE_GNU_SOURCE
+# endif
+#else
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* It is a bit strange for a header to play with the visibility of the
+   symbols it declares, but this matches gcc's behavior and some programs
+   depend on it */
+#ifndef HIDE_EXPORTS
+#pragma GCC visibility push(default)
+#endif
+
+typedef uintptr_t _Unwind_Word;
+typedef intptr_t _Unwind_Sword;
+typedef uintptr_t _Unwind_Ptr;
+typedef uintptr_t _Unwind_Internal_Ptr;
+typedef uint64_t _Unwind_Exception_Class;
+
+typedef intptr_t _sleb128_t;
+typedef uintptr_t _uleb128_t;
+
+struct _Unwind_Context;
+struct _Unwind_Exception;
+typedef enum {
+  _URC_NO_REASON = 0,
+  _URC_FOREIGN_EXCEPTION_CAUGHT = 1,
+
+  _URC_FATAL_PHASE2_ERROR = 2,
+  _URC_FATAL_PHASE1_ERROR = 3,
+  _URC_NORMAL_STOP = 4,
+
+  _URC_END_OF_STACK = 5,
+  _URC_HANDLER_FOUND = 6,
+  _URC_INSTALL_CONTEXT = 7,
+  _URC_CONTINUE_UNWIND = 8
+} _Unwind_Reason_Code;
+
+typedef enum {
+  _UA_SEARCH_PHASE = 1,
+  _UA_CLEANUP_PHASE = 2,
+
+  _UA_HANDLER_FRAME = 4,
+  _UA_FORCE_UNWIND = 8,
+  _UA_END_OF_STACK = 16 /* gcc extension to C++ ABI */
+} _Unwind_Action;
+
+typedef void (*_Unwind_Exception_Cleanup_Fn)(_Unwind_Reason_Code,
+                                             struct _Unwind_Exception *);
+
+struct _Unwind_Exception {
+  _Unwind_Exception_Class exception_class;
+  _Unwind_Exception_Cleanup_Fn exception_cleanup;
+  _Unwind_Word private_1;
+  _Unwind_Word private_2;
+  /* The Itanium ABI requires that _Unwind_Exception objects are "double-word
+   * aligned".  GCC has interpreted this to mean "use the maximum useful
+   * alignment for the target"; so do we. */
+} __attribute__((__aligned__));
+
+typedef _Unwind_Reason_Code (*_Unwind_Stop_Fn)(int, _Unwind_Action,
+                                               _Unwind_Exception_Class,
+                                               struct _Unwind_Exception *,
+                                               struct _Unwind_Context *,
+                                               void *);
+
+typedef _Unwind_Reason_Code (*_Unwind_Personality_Fn)(
+    int, _Unwind_Action, _Unwind_Exception_Class, struct _Unwind_Exception *,
+    struct _Unwind_Context *);
+typedef _Unwind_Personality_Fn __personality_routine;
+
+typedef _Unwind_Reason_Code (*_Unwind_Trace_Fn)(struct _Unwind_Context *,
+                                                void *);
+
+#if defined(__arm__) && !defined(__APPLE__)
+
+typedef enum {
+  _UVRSC_CORE = 0,        /* integer register */
+  _UVRSC_VFP = 1,         /* vfp */
+  _UVRSC_WMMXD = 3,       /* Intel WMMX data register */
+  _UVRSC_WMMXC = 4        /* Intel WMMX control register */
+} _Unwind_VRS_RegClass;
+
+typedef enum {
+  _UVRSD_UINT32 = 0,
+  _UVRSD_VFPX = 1,
+  _UVRSD_UINT64 = 3,
+  _UVRSD_FLOAT = 4,
+  _UVRSD_DOUBLE = 5
+} _Unwind_VRS_DataRepresentation;
+
+typedef enum {
+  _UVRSR_OK = 0,
+  _UVRSR_NOT_IMPLEMENTED = 1,
+  _UVRSR_FAILED = 2
+} _Unwind_VRS_Result;
+
+_Unwind_VRS_Result _Unwind_VRS_Get(struct _Unwind_Context *__context,
+  _Unwind_VRS_RegClass __regclass,
+  uint32_t __regno,
+  _Unwind_VRS_DataRepresentation __representation,
+  void *__valuep);
+
+_Unwind_VRS_Result _Unwind_VRS_Set(struct _Unwind_Context *__context,
+  _Unwind_VRS_RegClass __regclass,
+  uint32_t __regno,
+  _Unwind_VRS_DataRepresentation __representation,
+  void *__valuep);
+
+static __inline__
+_Unwind_Word _Unwind_GetGR(struct _Unwind_Context *__context, int __index) {
+  _Unwind_Word __value;
+  _Unwind_VRS_Get(__context, _UVRSC_CORE, __index, _UVRSD_UINT32, &__value);
+  return __value;
+}
+
+static __inline__
+void _Unwind_SetGR(struct _Unwind_Context *__context, int __index,
+                   _Unwind_Word __value) {
+  _Unwind_VRS_Set(__context, _UVRSC_CORE, __index, _UVRSD_UINT32, &__value);
+}
+
+static __inline__
+_Unwind_Word _Unwind_GetIP(struct _Unwind_Context *__context) {
+  _Unwind_Word __ip = _Unwind_GetGR(__context, 15);
+  return __ip & ~(_Unwind_Word)(0x1); /* Remove thumb mode bit. */
+}
+
+static __inline__
+void _Unwind_SetIP(struct _Unwind_Context *__context, _Unwind_Word __value) {
+  _Unwind_Word __thumb_mode_bit = _Unwind_GetGR(__context, 15) & 0x1;
+  _Unwind_SetGR(__context, 15, __value | __thumb_mode_bit);
+}
+#else
+_Unwind_Word _Unwind_GetGR(struct _Unwind_Context *, int);
+void _Unwind_SetGR(struct _Unwind_Context *, int, _Unwind_Word);
+
+_Unwind_Word _Unwind_GetIP(struct _Unwind_Context *);
+void _Unwind_SetIP(struct _Unwind_Context *, _Unwind_Word);
+#endif
+
+
+_Unwind_Word _Unwind_GetIPInfo(struct _Unwind_Context *, int *);
+
+_Unwind_Word _Unwind_GetCFA(struct _Unwind_Context *);
+
+void *_Unwind_GetLanguageSpecificData(struct _Unwind_Context *);
+
+_Unwind_Ptr _Unwind_GetRegionStart(struct _Unwind_Context *);
+
+/* DWARF EH functions; currently not available on Darwin/ARM */
+#if !defined(__APPLE__) || !defined(__arm__)
+
+_Unwind_Reason_Code _Unwind_RaiseException(struct _Unwind_Exception *);
+_Unwind_Reason_Code _Unwind_ForcedUnwind(struct _Unwind_Exception *,
+                                         _Unwind_Stop_Fn, void *);
+void _Unwind_DeleteException(struct _Unwind_Exception *);
+void _Unwind_Resume(struct _Unwind_Exception *);
+_Unwind_Reason_Code _Unwind_Resume_or_Rethrow(struct _Unwind_Exception *);
+
+#endif
+
+_Unwind_Reason_Code _Unwind_Backtrace(_Unwind_Trace_Fn, void *);
+
+/* setjmp(3)/longjmp(3) stuff */
+typedef struct SjLj_Function_Context *_Unwind_FunctionContext_t;
+
+void _Unwind_SjLj_Register(_Unwind_FunctionContext_t);
+void _Unwind_SjLj_Unregister(_Unwind_FunctionContext_t);
+_Unwind_Reason_Code _Unwind_SjLj_RaiseException(struct _Unwind_Exception *);
+_Unwind_Reason_Code _Unwind_SjLj_ForcedUnwind(struct _Unwind_Exception *,
+                                              _Unwind_Stop_Fn, void *);
+void _Unwind_SjLj_Resume(struct _Unwind_Exception *);
+_Unwind_Reason_Code _Unwind_SjLj_Resume_or_Rethrow(struct _Unwind_Exception *);
+
+void *_Unwind_FindEnclosingFunction(void *);
+
+#ifdef __APPLE__
+
+_Unwind_Ptr _Unwind_GetDataRelBase(struct _Unwind_Context *)
+    __attribute__((unavailable));
+_Unwind_Ptr _Unwind_GetTextRelBase(struct _Unwind_Context *)
+    __attribute__((unavailable));
+
+/* Darwin-specific functions */
+void __register_frame(const void *);
+void __deregister_frame(const void *);
+
+struct dwarf_eh_bases {
+  uintptr_t tbase;
+  uintptr_t dbase;
+  uintptr_t func;
+};
+void *_Unwind_Find_FDE(const void *, struct dwarf_eh_bases *);
+
+void __register_frame_info_bases(const void *, void *, void *, void *)
+  __attribute__((unavailable));
+void __register_frame_info(const void *, void *) __attribute__((unavailable));
+void __register_frame_info_table_bases(const void *, void*, void *, void *)
+  __attribute__((unavailable));
+void __register_frame_info_table(const void *, void *)
+  __attribute__((unavailable));
+void __register_frame_table(const void *) __attribute__((unavailable));
+void __deregister_frame_info(const void *) __attribute__((unavailable));
+void __deregister_frame_info_bases(const void *)__attribute__((unavailable));
+
+#else
+
+_Unwind_Ptr _Unwind_GetDataRelBase(struct _Unwind_Context *);
+_Unwind_Ptr _Unwind_GetTextRelBase(struct _Unwind_Context *);
+
+#endif
+
+
+#ifndef HIDE_EXPORTS
+#pragma GCC visibility pop
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
+#endif /* __CLANG_UNWIND_H */
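
unwind.h above declares the Itanium/ARM EHABI unwinder interface; on ARM the register accessors are implemented inline on top of _Unwind_VRS_Get/_Unwind_VRS_Set, elsewhere they are external entry points provided by libgcc or libunwind at link time. A minimal sketch of walking the current stack with this API, assuming such an unwinder library is linked in:

/* print a simple backtrace using _Unwind_Backtrace */
#include <unwind.h>
#include <stdio.h>

static _Unwind_Reason_Code trace_cb(struct _Unwind_Context *ctx, void *arg) {
    int *depth = (int *)arg;
    /* _Unwind_GetIP returns the instruction pointer of this frame */
    printf("#%d  pc=%p\n", (*depth)++, (void *)_Unwind_GetIP(ctx));
    return _URC_NO_REASON;          /* keep unwinding toward the caller */
}

void print_backtrace(void) {
    int depth = 0;
    _Unwind_Backtrace(trace_cb, &depth);
}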
diff --git a/21.1.2/clang-include/varargs.h b/21.1.2/clang-include/varargs.h
new file mode 100644
index 0000000..b5477d0
--- /dev/null
+++ b/21.1.2/clang-include/varargs.h
@@ -0,0 +1,26 @@
+/*===---- varargs.h - Variable argument handling -------------------------------------===
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+* THE SOFTWARE.
+*
+*===-----------------------------------------------------------------------===
+*/
+#ifndef __VARARGS_H
+#define __VARARGS_H
+  #error "Please use <stdarg.h> instead of <varargs.h>"
+#endif
diff --git a/21.1.2/clang-include/wmmintrin.h b/21.1.2/clang-include/wmmintrin.h
new file mode 100644
index 0000000..369e3c2
--- /dev/null
+++ b/21.1.2/clang-include/wmmintrin.h
@@ -0,0 +1,42 @@
+/*===---- wmmintrin.h - AES intrinsics ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef _WMMINTRIN_H
+#define _WMMINTRIN_H
+
+#include <emmintrin.h>
+
+#if !defined (__AES__) && !defined (__PCLMUL__)
+# error "AES/PCLMUL instructions not enabled"
+#else
+
+#ifdef __AES__
+#include <__wmmintrin_aes.h>
+#endif /* __AES__ */
+
+#ifdef __PCLMUL__
+#include <__wmmintrin_pclmul.h>
+#endif /* __PCLMUL__ */
+
+#endif /* __AES__ || __PCLMUL__ */
+#endif /* _WMMINTRIN_H */
diff --git a/21.1.2/clang-include/x86intrin.h b/21.1.2/clang-include/x86intrin.h
new file mode 100644
index 0000000..21a43da
--- /dev/null
+++ b/21.1.2/clang-include/x86intrin.h
@@ -0,0 +1,81 @@
+/*===---- x86intrin.h - X86 intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __X86INTRIN_H
+#define __X86INTRIN_H
+
+#include <ia32intrin.h>
+
+#include <immintrin.h>
+
+#ifdef __3dNOW__
+#include <mm3dnow.h>
+#endif
+
+#ifdef __BMI__
+#include <bmiintrin.h>
+#endif
+
+#ifdef __BMI2__
+#include <bmi2intrin.h>
+#endif
+
+#ifdef __LZCNT__
+#include <lzcntintrin.h>
+#endif
+
+#ifdef __POPCNT__
+#include <popcntintrin.h>
+#endif
+
+#ifdef __RDSEED__
+#include <rdseedintrin.h>
+#endif
+
+#ifdef __PRFCHW__
+#include <prfchwintrin.h>
+#endif
+
+#ifdef __SSE4A__
+#include <ammintrin.h>
+#endif
+
+#ifdef __FMA4__
+#include <fma4intrin.h>
+#endif
+
+#ifdef __XOP__
+#include <xopintrin.h>
+#endif
+
+#ifdef __TBM__
+#include <tbmintrin.h>
+#endif
+
+#ifdef __F16C__
+#include <f16cintrin.h>
+#endif
+
+/* FIXME: LWP */
+
+#endif /* __X86INTRIN_H */
diff --git a/21.1.2/clang-include/xmmintrin.h b/21.1.2/clang-include/xmmintrin.h
new file mode 100644
index 0000000..c9befcb
--- /dev/null
+++ b/21.1.2/clang-include/xmmintrin.h
@@ -0,0 +1,1003 @@
+/*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+ 
+#ifndef __XMMINTRIN_H
+#define __XMMINTRIN_H
+ 
+#ifndef __SSE__
+#error "SSE instruction set not enabled"
+#else
+
+#include <mmintrin.h>
+
+typedef int __v4si __attribute__((__vector_size__(16)));
+typedef float __v4sf __attribute__((__vector_size__(16)));
+typedef float __m128 __attribute__((__vector_size__(16)));
+
+/* This header should only be included in a hosted environment as it depends on
+ * a standard library to provide allocation routines. */
+#if __STDC_HOSTED__
+#include <mm_malloc.h>
+#endif
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_add_ss(__m128 __a, __m128 __b)
+{
+  __a[0] += __b[0];
+  return __a;
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_add_ps(__m128 __a, __m128 __b)
+{
+  return __a + __b;
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_sub_ss(__m128 __a, __m128 __b)
+{
+  __a[0] -= __b[0];
+  return __a;
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_sub_ps(__m128 __a, __m128 __b)
+{
+  return __a - __b;
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_mul_ss(__m128 __a, __m128 __b)
+{
+  __a[0] *= __b[0];
+  return __a;
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_mul_ps(__m128 __a, __m128 __b)
+{
+  return __a * __b;
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_div_ss(__m128 __a, __m128 __b)
+{
+  __a[0] /= __b[0];
+  return __a;
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_div_ps(__m128 __a, __m128 __b)
+{
+  return __a / __b;
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_sqrt_ss(__m128 __a)
+{
+  __m128 __c = __builtin_ia32_sqrtss(__a);
+  return (__m128) { __c[0], __a[1], __a[2], __a[3] };
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_sqrt_ps(__m128 __a)
+{
+  return __builtin_ia32_sqrtps(__a);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_rcp_ss(__m128 __a)
+{
+  __m128 __c = __builtin_ia32_rcpss(__a);
+  return (__m128) { __c[0], __a[1], __a[2], __a[3] };
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_rcp_ps(__m128 __a)
+{
+  return __builtin_ia32_rcpps(__a);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_rsqrt_ss(__m128 __a)
+{
+  __m128 __c = __builtin_ia32_rsqrtss(__a);
+  return (__m128) { __c[0], __a[1], __a[2], __a[3] };
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_rsqrt_ps(__m128 __a)
+{
+  return __builtin_ia32_rsqrtps(__a);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_min_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_minss(__a, __b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_min_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_minps(__a, __b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_max_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_maxss(__a, __b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_max_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_maxps(__a, __b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_and_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)((__v4si)__a & (__v4si)__b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_andnot_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)(~(__v4si)__a & (__v4si)__b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_or_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)((__v4si)__a | (__v4si)__b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_xor_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)((__v4si)__a ^ (__v4si)__b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpeq_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpss(__a, __b, 0);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpeq_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpps(__a, __b, 0);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmplt_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpss(__a, __b, 1);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmplt_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpps(__a, __b, 1);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmple_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpss(__a, __b, 2);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmple_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpps(__a, __b, 2);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpgt_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_shufflevector(__a,
+                                         __builtin_ia32_cmpss(__b, __a, 1),
+                                         4, 1, 2, 3);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpgt_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpps(__b, __a, 1);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpge_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_shufflevector(__a,
+                                         __builtin_ia32_cmpss(__b, __a, 2),
+                                         4, 1, 2, 3);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpge_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpps(__b, __a, 2);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpneq_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpss(__a, __b, 4);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpneq_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpps(__a, __b, 4);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpnlt_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpss(__a, __b, 5);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpnlt_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpps(__a, __b, 5);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpnle_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpss(__a, __b, 6);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpnle_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpps(__a, __b, 6);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpngt_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_shufflevector(__a,
+                                         __builtin_ia32_cmpss(__b, __a, 5),
+                                         4, 1, 2, 3);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpngt_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpps(__b, __a, 5);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpnge_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_shufflevector(__a,
+                                         __builtin_ia32_cmpss(__b, __a, 6),
+                                         4, 1, 2, 3);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpnge_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpps(__b, __a, 6);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpord_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpss(__a, __b, 7);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpord_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpps(__a, __b, 7);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpunord_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpss(__a, __b, 3);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpunord_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpps(__a, __b, 3);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_comieq_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_comieq(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_comilt_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_comilt(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_comile_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_comile(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_comigt_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_comigt(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_comige_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_comige(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_comineq_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_comineq(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_ucomieq_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_ucomieq(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_ucomilt_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_ucomilt(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_ucomile_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_ucomile(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_ucomigt_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_ucomigt(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_ucomige_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_ucomige(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_ucomineq_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_ucomineq(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_cvtss_si32(__m128 __a)
+{
+  return __builtin_ia32_cvtss2si(__a);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_cvt_ss2si(__m128 __a)
+{
+  return _mm_cvtss_si32(__a);
+}
+
+#ifdef __x86_64__
+
+static __inline__ long long __attribute__((__always_inline__, __nodebug__))
+_mm_cvtss_si64(__m128 __a)
+{
+  return __builtin_ia32_cvtss2si64(__a);
+}
+
+#endif
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtps_pi32(__m128 __a)
+{
+  return (__m64)__builtin_ia32_cvtps2pi(__a);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_cvt_ps2pi(__m128 __a)
+{
+  return _mm_cvtps_pi32(__a);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_cvttss_si32(__m128 __a)
+{
+  return __a[0];
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_cvtt_ss2si(__m128 __a)
+{
+  return _mm_cvttss_si32(__a);
+}
+
+static __inline__ long long __attribute__((__always_inline__, __nodebug__))
+_mm_cvttss_si64(__m128 __a)
+{
+  return __a[0];
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_cvttps_pi32(__m128 __a)
+{
+  return (__m64)__builtin_ia32_cvttps2pi(__a);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtt_ps2pi(__m128 __a)
+{
+  return _mm_cvttps_pi32(__a);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtsi32_ss(__m128 __a, int __b)
+{
+  __a[0] = __b;
+  return __a;
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cvt_si2ss(__m128 __a, int __b)
+{
+  return _mm_cvtsi32_ss(__a, __b);
+}
+
+#ifdef __x86_64__
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtsi64_ss(__m128 __a, long long __b)
+{
+  __a[0] = __b;
+  return __a;
+}
+
+#endif
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtpi32_ps(__m128 __a, __m64 __b)
+{
+  return __builtin_ia32_cvtpi2ps(__a, (__v2si)__b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cvt_pi2ps(__m128 __a, __m64 __b)
+{
+  return _mm_cvtpi32_ps(__a, __b);
+}
+
+static __inline__ float __attribute__((__always_inline__, __nodebug__))
+_mm_cvtss_f32(__m128 __a)
+{
+  return __a[0];
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_loadh_pi(__m128 __a, const __m64 *__p)
+{
+  typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
+  struct __mm_loadh_pi_struct {
+    __mm_loadh_pi_v2f32 __u;
+  } __attribute__((__packed__, __may_alias__));
+  __mm_loadh_pi_v2f32 __b = ((struct __mm_loadh_pi_struct*)__p)->__u;
+  __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
+  return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_loadl_pi(__m128 __a, const __m64 *__p)
+{
+  typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
+  struct __mm_loadl_pi_struct {
+    __mm_loadl_pi_v2f32 __u;
+  } __attribute__((__packed__, __may_alias__));
+  __mm_loadl_pi_v2f32 __b = ((struct __mm_loadl_pi_struct*)__p)->__u;
+  __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
+  return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_load_ss(const float *__p)
+{
+  struct __mm_load_ss_struct {
+    float __u;
+  } __attribute__((__packed__, __may_alias__));
+  float __u = ((struct __mm_load_ss_struct*)__p)->__u;
+  return (__m128){ __u, 0, 0, 0 };
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_load1_ps(const float *__p)
+{
+  struct __mm_load1_ps_struct {
+    float __u;
+  } __attribute__((__packed__, __may_alias__));
+  float __u = ((struct __mm_load1_ps_struct*)__p)->__u;
+  return (__m128){ __u, __u, __u, __u };
+}
+
+#define        _mm_load_ps1(p) _mm_load1_ps(p)
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_load_ps(const float *__p)
+{
+  return *(__m128*)__p;
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_loadu_ps(const float *__p)
+{
+  struct __loadu_ps {
+    __m128 __v;
+  } __attribute__((__packed__, __may_alias__));
+  return ((struct __loadu_ps*)__p)->__v;
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_loadr_ps(const float *__p)
+{
+  __m128 __a = _mm_load_ps(__p);
+  return __builtin_shufflevector(__a, __a, 3, 2, 1, 0);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_set_ss(float __w)
+{
+  return (__m128){ __w, 0, 0, 0 };
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_set1_ps(float __w)
+{
+  return (__m128){ __w, __w, __w, __w };
+}
+
+/* Microsoft specific. */
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_set_ps1(float __w)
+{
+    return _mm_set1_ps(__w);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_set_ps(float __z, float __y, float __x, float __w)
+{
+  return (__m128){ __w, __x, __y, __z };
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_setr_ps(float __z, float __y, float __x, float __w)
+{
+  return (__m128){ __z, __y, __x, __w };
+}
+
+static __inline__ __m128 __attribute__((__always_inline__))
+_mm_setzero_ps(void)
+{
+  return (__m128){ 0, 0, 0, 0 };
+}
+
+static __inline__ void __attribute__((__always_inline__))
+_mm_storeh_pi(__m64 *__p, __m128 __a)
+{
+  __builtin_ia32_storehps((__v2si *)__p, __a);
+}
+
+static __inline__ void __attribute__((__always_inline__))
+_mm_storel_pi(__m64 *__p, __m128 __a)
+{
+  __builtin_ia32_storelps((__v2si *)__p, __a);
+}
+
+static __inline__ void __attribute__((__always_inline__))
+_mm_store_ss(float *__p, __m128 __a)
+{
+  struct __mm_store_ss_struct {
+    float __u;
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __mm_store_ss_struct*)__p)->__u = __a[0];
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_storeu_ps(float *__p, __m128 __a)
+{
+  __builtin_ia32_storeups(__p, __a);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_store1_ps(float *__p, __m128 __a)
+{
+  __a = __builtin_shufflevector(__a, __a, 0, 0, 0, 0);
+  _mm_storeu_ps(__p, __a);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_store_ps1(float *__p, __m128 __a)
+{
+    return _mm_store1_ps(__p, __a);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_store_ps(float *__p, __m128 __a)
+{
+  *(__m128 *)__p = __a;
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_storer_ps(float *__p, __m128 __a)
+{
+  __a = __builtin_shufflevector(__a, __a, 3, 2, 1, 0);
+  _mm_store_ps(__p, __a);
+}
+
+#define _MM_HINT_T0 3
+#define _MM_HINT_T1 2
+#define _MM_HINT_T2 1
+#define _MM_HINT_NTA 0
+
+#ifndef _MSC_VER
+/* FIXME: We have to #define this because "sel" must be a constant integer, and
+   Sema doesn't do any form of constant propagation yet. */
+
+#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), 0, (sel)))
+#endif
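+/* Editorial note (not part of the upstream header): because the hint has to be
+ * a compile-time constant, _mm_prefetch is always called with one of the
+ * _MM_HINT_* macros spelled out at the call site, for example:
+ *
+ *   _mm_prefetch((const char *)ptr, _MM_HINT_T0);
+ *
+ * where `ptr` is any readable address the caller expects to touch soon.
+ */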
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_stream_pi(__m64 *__p, __m64 __a)
+{
+  __builtin_ia32_movntq(__p, __a);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_stream_ps(float *__p, __m128 __a)
+{
+  __builtin_ia32_movntps(__p, __a);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_sfence(void)
+{
+  __builtin_ia32_sfence();
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_extract_pi16(__m64 __a, int __n)
+{
+  __v4hi __b = (__v4hi)__a;
+  return (unsigned short)__b[__n & 3];
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_insert_pi16(__m64 __a, int __d, int __n)
+{
+   __v4hi __b = (__v4hi)__a;
+   __b[__n & 3] = __d;
+   return (__m64)__b;
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_max_pi16(__m64 __a, __m64 __b)
+{
+  return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_max_pu8(__m64 __a, __m64 __b)
+{
+  return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_min_pi16(__m64 __a, __m64 __b)
+{
+  return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_min_pu8(__m64 __a, __m64 __b)
+{
+  return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_movemask_pi8(__m64 __a)
+{
+  return __builtin_ia32_pmovmskb((__v8qi)__a);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_mulhi_pu16(__m64 __a, __m64 __b)
+{
+  return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b);
+}
+
+#define _mm_shuffle_pi16(a, n) __extension__ ({ \
+  __m64 __a = (a); \
+  (__m64)__builtin_ia32_pshufw((__v4hi)__a, (n)); })
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
+{
+  __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_avg_pu8(__m64 __a, __m64 __b)
+{
+  return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_avg_pu16(__m64 __a, __m64 __b)
+{
+  return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_sad_pu8(__m64 __a, __m64 __b)
+{
+  return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b);
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+_mm_getcsr(void)
+{
+  return __builtin_ia32_stmxcsr();
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_setcsr(unsigned int __i)
+{
+  __builtin_ia32_ldmxcsr(__i);
+}
+
+#define _mm_shuffle_ps(a, b, mask) __extension__ ({ \
+  __m128 __a = (a); \
+  __m128 __b = (b); \
+  (__m128)__builtin_shufflevector((__v4sf)__a, (__v4sf)__b, \
+                                  (mask) & 0x3, ((mask) & 0xc) >> 2, \
+                                  (((mask) & 0x30) >> 4) + 4, \
+                                  (((mask) & 0xc0) >> 6) + 4); })
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_unpackhi_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_shufflevector(__a, __b, 2, 6, 3, 7);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_unpacklo_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_shufflevector(__a, __b, 0, 4, 1, 5);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_move_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_shufflevector(__a, __b, 4, 1, 2, 3);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_movehl_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_shufflevector(__a, __b, 6, 7, 2, 3);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_movelh_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_shufflevector(__a, __b, 0, 1, 4, 5);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtpi16_ps(__m64 __a)
+{
+  __m64 __b, __c;
+  __m128 __r;
+
+  __b = _mm_setzero_si64();
+  __b = _mm_cmpgt_pi16(__b, __a);
+  __c = _mm_unpackhi_pi16(__a, __b);
+  __r = _mm_setzero_ps();
+  __r = _mm_cvtpi32_ps(__r, __c);
+  __r = _mm_movelh_ps(__r, __r);
+  __c = _mm_unpacklo_pi16(__a, __b);
+  __r = _mm_cvtpi32_ps(__r, __c);
+
+  return __r;
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtpu16_ps(__m64 __a)
+{
+  __m64 __b, __c;
+  __m128 __r;
+
+  __b = _mm_setzero_si64();
+  __c = _mm_unpackhi_pi16(__a, __b);
+  __r = _mm_setzero_ps();
+  __r = _mm_cvtpi32_ps(__r, __c);
+  __r = _mm_movelh_ps(__r, __r);
+  __c = _mm_unpacklo_pi16(__a, __b);
+  __r = _mm_cvtpi32_ps(__r, __c);
+
+  return __r;
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtpi8_ps(__m64 __a)
+{
+  __m64 __b;
+  
+  __b = _mm_setzero_si64();
+  __b = _mm_cmpgt_pi8(__b, __a);
+  __b = _mm_unpacklo_pi8(__a, __b);
+
+  return _mm_cvtpi16_ps(__b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtpu8_ps(__m64 __a)
+{
+  __m64 __b;
+  
+  __b = _mm_setzero_si64();
+  __b = _mm_unpacklo_pi8(__a, __b);
+
+  return _mm_cvtpi16_ps(__b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
+{
+  __m128 __c;
+  
+  __c = _mm_setzero_ps();
+  __c = _mm_cvtpi32_ps(__c, __b);
+  __c = _mm_movelh_ps(__c, __c);
+
+  return _mm_cvtpi32_ps(__c, __a);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtps_pi16(__m128 __a)
+{
+  __m64 __b, __c;
+  
+  __b = _mm_cvtps_pi32(__a);
+  __a = _mm_movehl_ps(__a, __a);
+  __c = _mm_cvtps_pi32(__a);
+  
+  return _mm_packs_pi32(__b, __c);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtps_pi8(__m128 __a)
+{
+  __m64 __b, __c;
+  
+  __b = _mm_cvtps_pi16(__a);
+  __c = _mm_setzero_si64();
+  
+  return _mm_packs_pi16(__b, __c);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_movemask_ps(__m128 __a)
+{
+  return __builtin_ia32_movmskps(__a);
+}
+
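+/* Editorial example (not part of the upstream header): a typical pattern that
+ * combines the unaligned load, packed compare and movemask helpers above is a
+ * per-lane test collapsed into an integer bitmask, e.g.
+ *
+ *   __m128 va   = _mm_loadu_ps(a);    // `a`, `b`: hypothetical float[4] inputs
+ *   __m128 vb   = _mm_loadu_ps(b);
+ *   int    mask = _mm_movemask_ps(_mm_cmplt_ps(va, vb)); // bit i set iff a[i] < b[i]
+ *
+ * so `mask == 0xF` means every lane of `a` is below the matching lane of `b`.
+ */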
+#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
+
+#define _MM_EXCEPT_INVALID    (0x0001)
+#define _MM_EXCEPT_DENORM     (0x0002)
+#define _MM_EXCEPT_DIV_ZERO   (0x0004)
+#define _MM_EXCEPT_OVERFLOW   (0x0008)
+#define _MM_EXCEPT_UNDERFLOW  (0x0010)
+#define _MM_EXCEPT_INEXACT    (0x0020)
+#define _MM_EXCEPT_MASK       (0x003f)
+
+#define _MM_MASK_INVALID      (0x0080)
+#define _MM_MASK_DENORM       (0x0100)
+#define _MM_MASK_DIV_ZERO     (0x0200)
+#define _MM_MASK_OVERFLOW     (0x0400)
+#define _MM_MASK_UNDERFLOW    (0x0800)
+#define _MM_MASK_INEXACT      (0x1000)
+#define _MM_MASK_MASK         (0x1f80)
+
+#define _MM_ROUND_NEAREST     (0x0000)
+#define _MM_ROUND_DOWN        (0x2000)
+#define _MM_ROUND_UP          (0x4000)
+#define _MM_ROUND_TOWARD_ZERO (0x6000)
+#define _MM_ROUND_MASK        (0x6000)
+
+#define _MM_FLUSH_ZERO_MASK   (0x8000)
+#define _MM_FLUSH_ZERO_ON     (0x8000)
+#define _MM_FLUSH_ZERO_OFF    (0x0000)
+
+#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK)
+#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
+#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
+#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK)
+
+#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))
+#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))
+#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x)))
+#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))
+
+#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
+do { \
+  __m128 tmp3, tmp2, tmp1, tmp0; \
+  tmp0 = _mm_unpacklo_ps((row0), (row1)); \
+  tmp2 = _mm_unpacklo_ps((row2), (row3)); \
+  tmp1 = _mm_unpackhi_ps((row0), (row1)); \
+  tmp3 = _mm_unpackhi_ps((row2), (row3)); \
+  (row0) = _mm_movelh_ps(tmp0, tmp2); \
+  (row1) = _mm_movehl_ps(tmp2, tmp0); \
+  (row2) = _mm_movelh_ps(tmp1, tmp3); \
+  (row3) = _mm_movehl_ps(tmp3, tmp1); \
+} while (0)
+
+/* Aliases for compatibility. */
+#define _m_pextrw _mm_extract_pi16
+#define _m_pinsrw _mm_insert_pi16
+#define _m_pmaxsw _mm_max_pi16
+#define _m_pmaxub _mm_max_pu8
+#define _m_pminsw _mm_min_pi16
+#define _m_pminub _mm_min_pu8
+#define _m_pmovmskb _mm_movemask_pi8
+#define _m_pmulhuw _mm_mulhi_pu16
+#define _m_pshufw _mm_shuffle_pi16
+#define _m_maskmovq _mm_maskmove_si64
+#define _m_pavgb _mm_avg_pu8
+#define _m_pavgw _mm_avg_pu16
+#define _m_psadbw _mm_sad_pu8
+#define _m_ _mm_
+
+/* Ugly hack for backwards-compatibility (compatible with gcc) */
+#ifdef __SSE2__
+#include <emmintrin.h>
+#endif
+
+#endif /* __SSE__ */
+
+#endif /* __XMMINTRIN_H */
diff --git a/21.1.2/clang-include/xopintrin.h b/21.1.2/clang-include/xopintrin.h
new file mode 100644
index 0000000..cc94ca0
--- /dev/null
+++ b/21.1.2/clang-include/xopintrin.h
@@ -0,0 +1,804 @@
+/*===---- xopintrin.h - XOP intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __X86INTRIN_H
+#error "Never use <xopintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __XOPINTRIN_H
+#define __XOPINTRIN_H
+
+#ifndef __XOP__
+# error "XOP instruction set is not enabled"
+#else
+
+#include <fma4intrin.h>
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_maccs_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacssww((__v8hi)__A, (__v8hi)__B, (__v8hi)__C);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_macc_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacsww((__v8hi)__A, (__v8hi)__B, (__v8hi)__C);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_maccsd_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacsswd((__v8hi)__A, (__v8hi)__B, (__v4si)__C);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_maccd_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacswd((__v8hi)__A, (__v8hi)__B, (__v4si)__C);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_maccs_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacssdd((__v4si)__A, (__v4si)__B, (__v4si)__C);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_macc_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacsdd((__v4si)__A, (__v4si)__B, (__v4si)__C);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_maccslo_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacssdql((__v4si)__A, (__v4si)__B, (__v2di)__C);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_macclo_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacsdql((__v4si)__A, (__v4si)__B, (__v2di)__C);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_maccshi_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacssdqh((__v4si)__A, (__v4si)__B, (__v2di)__C);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_macchi_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacsdqh((__v4si)__A, (__v4si)__B, (__v2di)__C);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_maddsd_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmadcsswd((__v8hi)__A, (__v8hi)__B, (__v4si)__C);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_maddd_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmadcswd((__v8hi)__A, (__v8hi)__B, (__v4si)__C);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_haddw_epi8(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphaddbw((__v16qi)__A);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_haddd_epi8(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphaddbd((__v16qi)__A);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_haddq_epi8(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphaddbq((__v16qi)__A);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_haddd_epi16(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphaddwd((__v8hi)__A);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_haddq_epi16(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphaddwq((__v8hi)__A);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_haddq_epi32(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphadddq((__v4si)__A);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_haddw_epu8(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphaddubw((__v16qi)__A);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_haddd_epu8(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphaddubd((__v16qi)__A);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_haddq_epu8(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphaddubq((__v16qi)__A);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_haddd_epu16(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphadduwd((__v8hi)__A);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_haddq_epu16(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphadduwq((__v8hi)__A);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_haddq_epu32(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphaddudq((__v4si)__A);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_hsubw_epi8(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphsubbw((__v16qi)__A);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_hsubd_epi16(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphsubwd((__v8hi)__A);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_hsubq_epi32(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphsubdq((__v4si)__A);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cmov_si128(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpcmov(__A, __B, __C);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cmov_si256(__m256i __A, __m256i __B, __m256i __C)
+{
+  return (__m256i)__builtin_ia32_vpcmov_256(__A, __B, __C);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_perm_epi8(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpperm((__v16qi)__A, (__v16qi)__B, (__v16qi)__C);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_rot_epi8(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vprotb((__v16qi)__A, (__v16qi)__B);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_rot_epi16(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vprotw((__v8hi)__A, (__v8hi)__B);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_rot_epi32(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vprotd((__v4si)__A, (__v4si)__B);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_rot_epi64(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vprotq((__v2di)__A, (__v2di)__B);
+}
+
+#define _mm_roti_epi8(A, N) __extension__ ({ \
+  __m128i __A = (A); \
+  (__m128i)__builtin_ia32_vprotbi((__v16qi)__A, (N)); })
+
+#define _mm_roti_epi16(A, N) __extension__ ({ \
+  __m128i __A = (A); \
+  (__m128i)__builtin_ia32_vprotwi((__v8hi)__A, (N)); })
+
+#define _mm_roti_epi32(A, N) __extension__ ({ \
+  __m128i __A = (A); \
+  (__m128i)__builtin_ia32_vprotdi((__v4si)__A, (N)); })
+
+#define _mm_roti_epi64(A, N) __extension__ ({ \
+  __m128i __A = (A); \
+  (__m128i)__builtin_ia32_vprotqi((__v2di)__A, (N)); })
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_shl_epi8(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpshlb((__v16qi)__A, (__v16qi)__B);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_shl_epi16(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpshlw((__v8hi)__A, (__v8hi)__B);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_shl_epi32(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpshld((__v4si)__A, (__v4si)__B);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_shl_epi64(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpshlq((__v2di)__A, (__v2di)__B);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sha_epi8(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpshab((__v16qi)__A, (__v16qi)__B);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sha_epi16(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpshaw((__v8hi)__A, (__v8hi)__B);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sha_epi32(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpshad((__v4si)__A, (__v4si)__B);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sha_epi64(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpshaq((__v2di)__A, (__v2di)__B);
+}
+
+#define _mm_com_epu8(A, B, N) __extension__ ({ \
+  __m128i __A = (A); \
+  __m128i __B = (B); \
+  (__m128i)__builtin_ia32_vpcomub((__v16qi)__A, (__v16qi)__B, (N)); })
+
+#define _mm_com_epu16(A, B, N) __extension__ ({ \
+  __m128i __A = (A); \
+  __m128i __B = (B); \
+  (__m128i)__builtin_ia32_vpcomuw((__v8hi)__A, (__v8hi)__B, (N)); })
+
+#define _mm_com_epu32(A, B, N) __extension__ ({ \
+  __m128i __A = (A); \
+  __m128i __B = (B); \
+  (__m128i)__builtin_ia32_vpcomud((__v4si)__A, (__v4si)__B, (N)); })
+
+#define _mm_com_epu64(A, B, N) __extension__ ({ \
+  __m128i __A = (A); \
+  __m128i __B = (B); \
+  (__m128i)__builtin_ia32_vpcomuq((__v2di)__A, (__v2di)__B, (N)); })
+
+#define _mm_com_epi8(A, B, N) __extension__ ({ \
+  __m128i __A = (A); \
+  __m128i __B = (B); \
+  (__m128i)__builtin_ia32_vpcomb((__v16qi)__A, (__v16qi)__B, (N)); })
+
+#define _mm_com_epi16(A, B, N) __extension__ ({ \
+  __m128i __A = (A); \
+  __m128i __B = (B); \
+  (__m128i)__builtin_ia32_vpcomw((__v8hi)__A, (__v8hi)__B, (N)); })
+
+#define _mm_com_epi32(A, B, N) __extension__ ({ \
+  __m128i __A = (A); \
+  __m128i __B = (B); \
+  (__m128i)__builtin_ia32_vpcomd((__v4si)__A, (__v4si)__B, (N)); })
+
+#define _mm_com_epi64(A, B, N) __extension__ ({ \
+  __m128i __A = (A); \
+  __m128i __B = (B); \
+  (__m128i)__builtin_ia32_vpcomq((__v2di)__A, (__v2di)__B, (N)); })
+
+#define _MM_PCOMCTRL_LT    0
+#define _MM_PCOMCTRL_LE    1
+#define _MM_PCOMCTRL_GT    2
+#define _MM_PCOMCTRL_GE    3
+#define _MM_PCOMCTRL_EQ    4
+#define _MM_PCOMCTRL_NEQ   5
+#define _MM_PCOMCTRL_FALSE 6
+#define _MM_PCOMCTRL_TRUE  7
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comlt_epu8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_LT);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comle_epu8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_LE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comgt_epu8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_GT);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comge_epu8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_GE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comeq_epu8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_EQ);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comneq_epu8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_NEQ);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comfalse_epu8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_FALSE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comtrue_epu8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_TRUE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comlt_epu16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_LT);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comle_epu16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_LE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comgt_epu16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_GT);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comge_epu16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_GE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comeq_epu16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_EQ);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comneq_epu16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_NEQ);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comfalse_epu16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_FALSE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comtrue_epu16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_TRUE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comlt_epu32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_LT);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comle_epu32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_LE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comgt_epu32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_GT);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comge_epu32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_GE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comeq_epu32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_EQ);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comneq_epu32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_NEQ);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comfalse_epu32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_FALSE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comtrue_epu32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_TRUE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comlt_epu64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_LT);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comle_epu64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_LE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comgt_epu64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_GT);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comge_epu64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_GE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comeq_epu64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_EQ);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comneq_epu64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_NEQ);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comfalse_epu64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_FALSE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comtrue_epu64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_TRUE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comlt_epi8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_LT);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comle_epi8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_LE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comgt_epi8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_GT);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comge_epi8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_GE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comeq_epi8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_EQ);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comneq_epi8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_NEQ);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comfalse_epi8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_FALSE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comtrue_epi8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_TRUE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comlt_epi16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_LT);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comle_epi16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_LE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comgt_epi16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_GT);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comge_epi16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_GE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comeq_epi16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_EQ);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comneq_epi16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_NEQ);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comfalse_epi16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_FALSE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comtrue_epi16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_TRUE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comlt_epi32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_LT);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comle_epi32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_LE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comgt_epi32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_GT);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comge_epi32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_GE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comeq_epi32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_EQ);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comneq_epi32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_NEQ);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comfalse_epi32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_FALSE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comtrue_epi32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_TRUE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comlt_epi64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_LT);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comle_epi64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_LE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comgt_epi64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_GT);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comge_epi64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_GE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comeq_epi64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_EQ);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comneq_epi64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_NEQ);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comfalse_epi64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_FALSE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comtrue_epi64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_TRUE);
+}
+
+#define _mm_permute2_pd(X, Y, C, I) __extension__ ({ \
+  __m128d __X = (X); \
+  __m128d __Y = (Y); \
+  __m128i __C = (C); \
+  (__m128d)__builtin_ia32_vpermil2pd((__v2df)__X, (__v2df)__Y, \
+                                     (__v2di)__C, (I)); })
+
+#define _mm256_permute2_pd(X, Y, C, I) __extension__ ({ \
+  __m256d __X = (X); \
+  __m256d __Y = (Y); \
+  __m256i __C = (C); \
+  (__m256d)__builtin_ia32_vpermil2pd256((__v4df)__X, (__v4df)__Y, \
+                                        (__v4di)__C, (I)); })
+
+#define _mm_permute2_ps(X, Y, C, I) __extension__ ({ \
+  __m128 __X = (X); \
+  __m128 __Y = (Y); \
+  __m128i __C = (C); \
+  (__m128)__builtin_ia32_vpermil2ps((__v4sf)__X, (__v4sf)__Y, \
+                                    (__v4si)__C, (I)); })
+
+#define _mm256_permute2_ps(X, Y, C, I) __extension__ ({ \
+  __m256 __X = (X); \
+  __m256 __Y = (Y); \
+  __m256i __C = (C); \
+  (__m256)__builtin_ia32_vpermil2ps256((__v8sf)__X, (__v8sf)__Y, \
+                                       (__v8si)__C, (I)); })
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_frcz_ss(__m128 __A)
+{
+  return (__m128)__builtin_ia32_vfrczss((__v4sf)__A);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_frcz_sd(__m128d __A)
+{
+  return (__m128d)__builtin_ia32_vfrczsd((__v2df)__A);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_frcz_ps(__m128 __A)
+{
+  return (__m128)__builtin_ia32_vfrczps((__v4sf)__A);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_frcz_pd(__m128d __A)
+{
+  return (__m128d)__builtin_ia32_vfrczpd((__v2df)__A);
+}
+
+static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_frcz_ps(__m256 __A)
+{
+  return (__m256)__builtin_ia32_vfrczps256((__v8sf)__A);
+}
+
+static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_frcz_pd(__m256d __A)
+{
+  return (__m256d)__builtin_ia32_vfrczpd256((__v4df)__A);
+}
+
+#endif /* __XOP__ */
+
+#endif /* __XOPINTRIN_H */
diff --git a/21.1.2/include/rs_allocation.rsh b/21.1.2/include/rs_allocation.rsh
new file mode 100644
index 0000000..6f3f8d9
--- /dev/null
+++ b/21.1.2/include/rs_allocation.rsh
@@ -0,0 +1,431 @@
+/*
+ * Copyright (C) 2011 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/** @file rs_allocation.rsh
+ *  \brief Allocation routines
+ *
+ *
+ */
+
+#ifndef __RS_ALLOCATION_RSH__
+#define __RS_ALLOCATION_RSH__
+
+/**
+ * Returns the Allocation for a given pointer.  The pointer should point within
+ * a valid allocation.  The results are undefined if the pointer is not from a
+ * valid allocation.
+ *
+ * This function is deprecated and will be removed from the SDK in a future
+ * release.
+ */
+extern rs_allocation __attribute__((overloadable))
+    rsGetAllocation(const void *);
+
+/**
+ * Query the dimension of an allocation.
+ *
+ * @return uint32_t The X dimension of the allocation.
+ */
+extern uint32_t __attribute__((overloadable))
+    rsAllocationGetDimX(rs_allocation);
+
+/**
+ * Query the dimension of an allocation.
+ *
+ * @return uint32_t The Y dimension of the allocation.
+ */
+extern uint32_t __attribute__((overloadable))
+    rsAllocationGetDimY(rs_allocation);
+
+/**
+ * Query the dimension of an allocation.
+ *
+ * @return uint32_t The Z dimension of the allocation.
+ */
+extern uint32_t __attribute__((overloadable))
+    rsAllocationGetDimZ(rs_allocation);
+
+/**
+ * Query an allocation for the presence of more than one LOD.
+ *
+ * @return uint32_t Returns 1 if more than one LOD is present, 0 otherwise.
+ */
+extern uint32_t __attribute__((overloadable))
+    rsAllocationGetDimLOD(rs_allocation);
+
+/**
+ * Query an allocation for the presence of more than one face.
+ *
+ * @return uint32_t Returns 1 if more than one face is present, 0 otherwise.
+ */
+extern uint32_t __attribute__((overloadable))
+    rsAllocationGetDimFaces(rs_allocation);
+
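+/* Editorial example (not part of the upstream header): the dimension queries
+ * above are typically used to bound a manual walk over an allocation, e.g.
+ *
+ *   uint32_t w = rsAllocationGetDimX(a);
+ *   uint32_t h = rsAllocationGetDimY(a);
+ *   for (uint32_t y = 0; y < h; y++)
+ *       for (uint32_t x = 0; x < w; x++)
+ *           process(a, x, y);   // `a` and `process` are hypothetical
+ */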
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+
+/**
+ * Copy part of an allocation from another allocation.
+ *
+ * @param dstAlloc Allocation to copy data into.
+ * @param dstOff The offset of the first element to be copied in
+ *               the destination allocation.
+ * @param dstMip Mip level in the destination allocation.
+ * @param count The number of elements to be copied.
+ * @param srcAlloc The source data allocation.
+ * @param srcOff The offset of the first element in data to be
+ *               copied in the source allocation.
+ * @param srcMip Mip level in the source allocation.
+ */
+extern void __attribute__((overloadable))
+    rsAllocationCopy1DRange(rs_allocation dstAlloc,
+                            uint32_t dstOff, uint32_t dstMip,
+                            uint32_t count,
+                            rs_allocation srcAlloc,
+                            uint32_t srcOff, uint32_t srcMip);
+
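+/* Editorial example (not part of the upstream header): copying the first
+ * `count` cells from one 1-D allocation into another at mip level 0 would be
+ *
+ *   rsAllocationCopy1DRange(dst, 0, 0, count, src, 0, 0);
+ *
+ * where `dst`, `src` and `count` are hypothetical script values.
+ */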
+/**
+ * Copy a rectangular region into the allocation from another
+ * allocation.
+ *
+ * @param dstAlloc allocation to copy data into.
+ * @param dstXoff X offset of the region to update in the
+ *                destination allocation.
+ * @param dstYoff Y offset of the region to update in the
+ *                destination allocation.
+ * @param dstMip Mip level in the destination allocation.
+ * @param dstFace Cubemap face of the destination allocation,
+ *                ignored for allocations that aren't cubemaps.
+ * @param width Width of the incoming region to update.
+ * @param height Height of the incoming region to update.
+ * @param srcAlloc The source data allocation.
+ * @param srcXoff X offset in data of the source allocation.
+ * @param srcYoff Y offset in data of the source allocation.
+ * @param srcMip Mip level in the source allocation.
+ * @param srcFace Cubemap face of the source allocation,
+ *                ignored for allocations that aren't cubemaps.
+ */
+extern void __attribute__((overloadable))
+    rsAllocationCopy2DRange(rs_allocation dstAlloc,
+                            uint32_t dstXoff, uint32_t dstYoff,
+                            uint32_t dstMip,
+                            rs_allocation_cubemap_face dstFace,
+                            uint32_t width, uint32_t height,
+                            rs_allocation srcAlloc,
+                            uint32_t srcXoff, uint32_t srcYoff,
+                            uint32_t srcMip,
+                            rs_allocation_cubemap_face srcFace);
+
+#endif //defined(RS_VERSION) && (RS_VERSION >= 14)
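+/* Editorial example (not part of the upstream header): a full-frame 2-D copy
+ * between two allocations of the same size, at mip level 0 and ignoring
+ * cubemap faces, would look like
+ *
+ *   rsAllocationCopy2DRange(dst, 0, 0, 0, RS_ALLOCATION_CUBEMAP_FACE_POSITIVE_X,
+ *                           rsAllocationGetDimX(src), rsAllocationGetDimY(src),
+ *                           src, 0, 0, 0, RS_ALLOCATION_CUBEMAP_FACE_POSITIVE_X);
+ *
+ * where `dst` and `src` are hypothetical allocations; as documented above, the
+ * face arguments are ignored for allocations that aren't cubemaps.
+ */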
+
+/**
+ * Extract a single element from an allocation.
+ */
+extern const void * __attribute__((overloadable))
+    rsGetElementAt(rs_allocation a, uint32_t x);
+/**
+ * \overload
+ */
+extern const void * __attribute__((overloadable))
+    rsGetElementAt(rs_allocation a, uint32_t x, uint32_t y);
+/**
+ * \overload
+ */
+extern const void * __attribute__((overloadable))
+    rsGetElementAt(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+    #define GET_ELEMENT_AT(T) \
+    extern T __attribute__((overloadable)) \
+            rsGetElementAt_##T(rs_allocation a, uint32_t x); \
+    extern T __attribute__((overloadable)) \
+            rsGetElementAt_##T(rs_allocation a, uint32_t x, uint32_t y);  \
+    extern T __attribute__((overloadable)) \
+            rsGetElementAt_##T(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#else
+    #define GET_ELEMENT_AT(T) \
+    static inline T __attribute__((overloadable)) \
+            rsGetElementAt_##T(rs_allocation a, uint32_t x) {  \
+        return ((T *)rsGetElementAt(a, x))[0]; \
+    } \
+    static inline T __attribute__((overloadable)) \
+            rsGetElementAt_##T(rs_allocation a, uint32_t x, uint32_t y) {  \
+        return ((T *)rsGetElementAt(a, x, y))[0]; \
+    } \
+    static inline T __attribute__((overloadable)) \
+            rsGetElementAt_##T(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {  \
+        return ((T *)rsGetElementAt(a, x, y, z))[0]; \
+    }
+#endif
+
+GET_ELEMENT_AT(char)
+GET_ELEMENT_AT(char2)
+GET_ELEMENT_AT(char3)
+GET_ELEMENT_AT(char4)
+GET_ELEMENT_AT(uchar)
+GET_ELEMENT_AT(uchar2)
+GET_ELEMENT_AT(uchar3)
+GET_ELEMENT_AT(uchar4)
+GET_ELEMENT_AT(short)
+GET_ELEMENT_AT(short2)
+GET_ELEMENT_AT(short3)
+GET_ELEMENT_AT(short4)
+GET_ELEMENT_AT(ushort)
+GET_ELEMENT_AT(ushort2)
+GET_ELEMENT_AT(ushort3)
+GET_ELEMENT_AT(ushort4)
+GET_ELEMENT_AT(int)
+GET_ELEMENT_AT(int2)
+GET_ELEMENT_AT(int3)
+GET_ELEMENT_AT(int4)
+GET_ELEMENT_AT(uint)
+GET_ELEMENT_AT(uint2)
+GET_ELEMENT_AT(uint3)
+GET_ELEMENT_AT(uint4)
+GET_ELEMENT_AT(long)
+GET_ELEMENT_AT(long2)
+GET_ELEMENT_AT(long3)
+GET_ELEMENT_AT(long4)
+GET_ELEMENT_AT(ulong)
+GET_ELEMENT_AT(ulong2)
+GET_ELEMENT_AT(ulong3)
+GET_ELEMENT_AT(ulong4)
+GET_ELEMENT_AT(float)
+GET_ELEMENT_AT(float2)
+GET_ELEMENT_AT(float3)
+GET_ELEMENT_AT(float4)
+GET_ELEMENT_AT(double)
+GET_ELEMENT_AT(double2)
+GET_ELEMENT_AT(double3)
+GET_ELEMENT_AT(double4)
+
+#undef GET_ELEMENT_AT
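+/* Editorial example (not part of the upstream header): the macro above expands
+ * into one typed getter per element type, so reading a float4 cell is simply
+ *
+ *   float4 v = rsGetElementAt_float4(a, x, y);
+ *
+ * with `a`, `x` and `y` standing in for the script's own allocation and
+ * coordinates.
+ */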
+
+// Jelly Bean
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+
+/**
+ * Send the contents of the Allocation to the queue.
+ * @param a allocation to work on
+ */
+extern const void __attribute__((overloadable))
+    rsAllocationIoSend(rs_allocation a);
+
+/**
+ * Receive a new set of contents from the queue.
+ * @param a allocation to work on
+ */
+extern const void __attribute__((overloadable))
+    rsAllocationIoReceive(rs_allocation a);
+
+
+/**
+ * Get the element object describing the allocation's layout
+ * @param a allocation to get data from
+ * @return element describing allocation layout
+ */
+extern rs_element __attribute__((overloadable))
+    rsAllocationGetElement(rs_allocation a);
+
+/**
+ * Fetch allocation in a way described by the sampler
+ * @param a 1D allocation to sample from
+ * @param s sampler state
+ * @param location to sample from
+ */
+extern const float4 __attribute__((overloadable))
+    rsSample(rs_allocation a, rs_sampler s, float location);
+/**
+ * Fetch allocation in a way described by the sampler
+ * @param a 1D allocation to sample from
+ * @param s sampler state
+ * @param location to sample from
+ * @param lod mip level to sample from, for fractional values
+ *            mip levels will be interpolated if
+ *            RS_SAMPLER_LINEAR_MIP_LINEAR is used
+ */
+extern const float4 __attribute__((overloadable))
+    rsSample(rs_allocation a, rs_sampler s, float location, float lod);
+
+/**
+ * Fetch allocation in a way described by the sampler
+ * @param a 2D allocation to sample from
+ * @param s sampler state
+ * @param location to sample from
+ */
+extern const float4 __attribute__((overloadable))
+    rsSample(rs_allocation a, rs_sampler s, float2 location);
+
+/**
+ * Fetch allocation in a way described by the sampler
+ * @param a 2D allocation to sample from
+ * @param s sampler state
+ * @param location to sample from
+ * @param lod mip level to sample from, for fractional values
+ *            mip levels will be interpolated if
+ *            RS_SAMPLER_LINEAR_MIP_LINEAR is used
+ */
+extern const float4 __attribute__((overloadable))
+    rsSample(rs_allocation a, rs_sampler s, float2 location, float lod);
+
+#endif // (defined(RS_VERSION) && (RS_VERSION >= 16))
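+/* Editorial example (not part of the upstream header): sampling a 2-D
+ * allocation through a sampler at a given location could be written as
+ *
+ *   float2 loc = {u, v};
+ *   float4 c = rsSample(a, s, loc);
+ *
+ * where `a`, `s`, `u` and `v` are hypothetical script values.
+ */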
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+
+/**
+ * Set single element of an allocation.
+ */
+extern void __attribute__((overloadable))
+    rsSetElementAt(rs_allocation a, void* ptr, uint32_t x);
+
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable))
+    rsSetElementAt(rs_allocation a, void* ptr, uint32_t x, uint32_t y);
+
+#define SET_ELEMENT_AT(T)                                               \
+    extern void __attribute__((overloadable))                           \
+    rsSetElementAt_##T(rs_allocation a, T val, uint32_t x);             \
+    extern void __attribute__((overloadable))                           \
+    rsSetElementAt_##T(rs_allocation a, T val, uint32_t x, uint32_t y); \
+    extern void __attribute__((overloadable))                           \
+    rsSetElementAt_##T(rs_allocation a, T val, uint32_t x, uint32_t y, uint32_t z);
+
+
+SET_ELEMENT_AT(char)
+SET_ELEMENT_AT(char2)
+SET_ELEMENT_AT(char3)
+SET_ELEMENT_AT(char4)
+SET_ELEMENT_AT(uchar)
+SET_ELEMENT_AT(uchar2)
+SET_ELEMENT_AT(uchar3)
+SET_ELEMENT_AT(uchar4)
+SET_ELEMENT_AT(short)
+SET_ELEMENT_AT(short2)
+SET_ELEMENT_AT(short3)
+SET_ELEMENT_AT(short4)
+SET_ELEMENT_AT(ushort)
+SET_ELEMENT_AT(ushort2)
+SET_ELEMENT_AT(ushort3)
+SET_ELEMENT_AT(ushort4)
+SET_ELEMENT_AT(int)
+SET_ELEMENT_AT(int2)
+SET_ELEMENT_AT(int3)
+SET_ELEMENT_AT(int4)
+SET_ELEMENT_AT(uint)
+SET_ELEMENT_AT(uint2)
+SET_ELEMENT_AT(uint3)
+SET_ELEMENT_AT(uint4)
+SET_ELEMENT_AT(long)
+SET_ELEMENT_AT(long2)
+SET_ELEMENT_AT(long3)
+SET_ELEMENT_AT(long4)
+SET_ELEMENT_AT(ulong)
+SET_ELEMENT_AT(ulong2)
+SET_ELEMENT_AT(ulong3)
+SET_ELEMENT_AT(ulong4)
+SET_ELEMENT_AT(float)
+SET_ELEMENT_AT(float2)
+SET_ELEMENT_AT(float3)
+SET_ELEMENT_AT(float4)
+SET_ELEMENT_AT(double)
+SET_ELEMENT_AT(double2)
+SET_ELEMENT_AT(double3)
+SET_ELEMENT_AT(double4)
+
+#undef SET_ELEMENT_AT
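+
+/*
+ * Illustrative sketch (not part of the header): the typed setters above are
+ * the write-side counterparts of the rsGetElementAt_* getters.  gOut is an
+ * assumed script global bound from the framework.
+ *
+ *   rs_allocation gOut;   // 2D float allocation, set from the framework
+ *
+ *   void setPixel(uint32_t x, uint32_t y, float v) {
+ *       rsSetElementAt_float(gOut, v, x, y);
+ *   }
+ */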
+
+
+/**
+ * Extract a single element from an allocation.
+ */
+extern const uchar __attribute__((overloadable))
+    rsGetElementAtYuv_uchar_Y(rs_allocation a, uint32_t x, uint32_t y);
+
+/**
+ * Extract a single element from an allocation.
+ *
+ * Coordinates are in the dimensions of the Y plane
+ */
+extern const uchar __attribute__((overloadable))
+    rsGetElementAtYuv_uchar_U(rs_allocation a, uint32_t x, uint32_t y);
+
+/**
+ * Extract a single element from an allocation.
+ *
+ * Coordinates are in the dimensions of the Y plane
+ */
+extern const uchar __attribute__((overloadable))
+    rsGetElementAtYuv_uchar_V(rs_allocation a, uint32_t x, uint32_t y);
+
+#endif // (defined(RS_VERSION) && (RS_VERSION >= 18))
+
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 999))
+
+#define VOP(T)                                                                   \
+    extern T __attribute__((overloadable))                                       \
+    rsAllocationVLoadX_##T(rs_allocation a, uint32_t x);                         \
+    extern T __attribute__((overloadable))                                       \
+    rsAllocationVLoadX_##T(rs_allocation a, uint32_t x, uint32_t y);             \
+    extern T __attribute__((overloadable))                                       \
+    rsAllocationVLoadX_##T(rs_allocation a, uint32_t x, uint32_t y, uint32_t z); \
+    extern void __attribute__((overloadable))                                    \
+    rsAllocationVStoreX_##T(rs_allocation a, T val, uint32_t x);                 \
+    extern void __attribute__((overloadable))                                    \
+    rsAllocationVStoreX_##T(rs_allocation a, T val, uint32_t x, uint32_t y);     \
+    extern void __attribute__((overloadable))                                    \
+    rsAllocationVStoreX_##T(rs_allocation a, T val, uint32_t x, uint32_t y, uint32_t z);
+
+VOP(char2)
+VOP(char3)
+VOP(char4)
+VOP(uchar2)
+VOP(uchar3)
+VOP(uchar4)
+VOP(short2)
+VOP(short3)
+VOP(short4)
+VOP(ushort2)
+VOP(ushort3)
+VOP(ushort4)
+VOP(int2)
+VOP(int3)
+VOP(int4)
+VOP(uint2)
+VOP(uint3)
+VOP(uint4)
+VOP(long2)
+VOP(long3)
+VOP(long4)
+VOP(ulong2)
+VOP(ulong3)
+VOP(ulong4)
+VOP(float2)
+VOP(float3)
+VOP(float4)
+VOP(double2)
+VOP(double3)
+VOP(double4)
+
+#undef VOP
+
+#endif //(defined(RS_VERSION) && (RS_VERSION >= 999))
+
+
+#endif
+
diff --git a/21.1.2/include/rs_atomic.rsh b/21.1.2/include/rs_atomic.rsh
new file mode 100644
index 0000000..ba847cf
--- /dev/null
+++ b/21.1.2/include/rs_atomic.rsh
@@ -0,0 +1,261 @@
+/*
+ * Copyright (C) 2011 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/** @file rs_atomic.rsh
+ *  \brief Atomic routines
+ *
+ *
+ */
+
+#ifndef __RS_ATOMIC_RSH__
+#define __RS_ATOMIC_RSH__
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+
+/**
+ * Atomic add one to the value at addr.
+ * Equal to rsAtomicAdd(addr, 1)
+ *
+ * @param addr Address of value to increment
+ *
+ * @return old value
+ */
+extern int32_t __attribute__((overloadable))
+    rsAtomicInc(volatile int32_t* addr);
+
+/**
+ * Atomic subtract one from the value at addr. Equal to rsAtomicSub(addr, 1)
+ *
+ * @param addr Address of value to decrement
+ *
+ * @return old value
+ */
+extern int32_t __attribute__((overloadable))
+    rsAtomicDec(volatile int32_t* addr);
+
+/**
+ * Atomic add a value to the value at addr.  addr[0] += value
+ *
+ * @param addr Address of value to modify
+ * @param value Amount to add to the value at addr
+ *
+ * @return old value
+ */
+extern int32_t __attribute__((overloadable))
+    rsAtomicAdd(volatile int32_t* addr, int32_t value);
+
+/**
+ * Atomic Subtract a value from the value at addr.  addr[0] -= value
+ *
+ * @param addr Address of value to modify
+ * @param value Amount to subtract from the value at addr
+ *
+ * @return old value
+ */
+extern int32_t __attribute__((overloadable))
+    rsAtomicSub(volatile int32_t* addr, int32_t value);
+
+/**
+ * Atomic Bitwise and a value from the value at addr.  addr[0] &= value
+ *
+ * @param addr Address of value to modify
+ * @param value Amount to and with the value at addr
+ *
+ * @return old value
+ */
+extern int32_t __attribute__((overloadable))
+    rsAtomicAnd(volatile int32_t* addr, int32_t value);
+
+/**
+ * Atomic Bitwise or a value from the value at addr.  addr[0] |= value
+ *
+ * @param addr Address of value to modify
+ * @param value Amount to or with the value at addr
+ *
+ * @return old value
+ */
+extern int32_t __attribute__((overloadable))
+    rsAtomicOr(volatile int32_t* addr, int32_t value);
+
+/**
+ * Atomic Bitwise xor a value from the value at addr.  addr[0] ^= value
+ *
+ * @param addr Address of value to modify
+ * @param value Amount to xor with the value at addr
+ *
+ * @return old value
+ */
+extern int32_t __attribute__((overloadable))
+    rsAtomicXor(volatile int32_t* addr, int32_t value);
+
+/**
+ * Atomic Set the value at addr to the min of addr and value
+ * addr[0] = rsMin(addr[0], value)
+ *
+ * @param addr Address of value to modify
+ * @param value comparison value
+ *
+ * @return old value
+ */
+extern uint32_t __attribute__((overloadable))
+    rsAtomicMin(volatile uint32_t* addr, uint32_t value);
+/**
+ * Atomic Set the value at addr to the min of addr and value
+ * addr[0] = rsMin(addr[0], value)
+ *
+ * @param addr Address of value to modify
+ * @param value comparison value
+ *
+ * @return old value
+ */
+extern int32_t __attribute__((overloadable))
+    rsAtomicMin(volatile int32_t* addr, int32_t value);
+
+/**
+ * Atomic Set the value at addr to the max of addr and value
+ * addr[0] = rsMax(addr[0], value)
+ *
+ * @param addr Address of value to modify
+ * @param value comparison value
+ *
+ * @return old value
+ */
+extern uint32_t __attribute__((overloadable))
+    rsAtomicMax(volatile uint32_t* addr, uint32_t value);
+/**
+ * Atomic Set the value at addr to the max of addr and value
+ * addr[0] = rsMax(addr[0], value)
+ *
+ * @param addr Address of value to modify
+ * @param value comparison value
+ *
+ * @return old value
+ */
+extern int32_t __attribute__((overloadable))
+    rsAtomicMax(volatile int32_t* addr, int32_t value);
+
+/**
+ * Compare-and-set operation with a full memory barrier.
+ *
+ * If the value at addr matches compareValue then newValue is written.
+ *
+ * @param addr The address to compare and replace if the compare passes.
+ * @param compareValue The value to test addr[0] against.
+ * @param newValue The value to write if the test passes.
+ *
+ * @return old value
+ */
+extern int32_t __attribute__((overloadable))
+    rsAtomicCas(volatile int32_t* addr, int32_t compareValue, int32_t newValue);
+
+/**
+ * Compare-and-set operation with a full memory barrier.
+ *
+ * If the value at addr matches compareValue then newValue is written.
+ *
+ * @param addr The address to compare and replace if the compare passes.
+ * @param compareValue The value to test addr[0] against.
+ * @param newValue The value to write if the test passes.
+ *
+ * @return old value
+ */
+extern uint32_t __attribute__((overloadable))
+    rsAtomicCas(volatile uint32_t* addr, uint32_t compareValue, uint32_t newValue);
+
+#endif //defined(RS_VERSION) && (RS_VERSION >= 14)
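+
+/*
+ * Illustrative sketch (not part of the header): rsAtomicCas is typically used
+ * in a retry loop for updates that the other atomics cannot express.  A
+ * hypothetical helper that atomically doubles a counter:
+ *
+ *   static void atomicDouble(volatile int32_t* addr) {
+ *       int32_t old, desired;
+ *       do {
+ *           old = *addr;
+ *           desired = old * 2;
+ *       } while (rsAtomicCas(addr, old, desired) != old);
+ *   }
+ */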
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 20))   // TODO: api 21
+
+/**
+ * Atomic add one to the value at addr.
+ * Equal to rsAtomicAdd(addr, 1)
+ *
+ * @param addr Address of value to increment
+ *
+ * @return old value
+ */
+extern int32_t __attribute__((overloadable))
+    rsAtomicInc(volatile uint32_t* addr);
+
+/**
+ * Atomic subtract one from the value at addr. Equal to rsAtomicSub(addr, 1)
+ *
+ * @param addr Address of value to decrement
+ *
+ * @return old value
+ */
+extern int32_t __attribute__((overloadable))
+    rsAtomicDec(volatile uint32_t* addr);
+
+/**
+ * Atomic add a value to the value at addr.  addr[0] += value
+ *
+ * @param addr Address of value to modify
+ * @param value Amount to add to the value at addr
+ *
+ * @return old value
+ */
+extern int32_t __attribute__((overloadable))
+    rsAtomicAdd(volatile uint32_t* addr, uint32_t value);
+
+/**
+ * Atomic Subtract a value from the value at addr.  addr[0] -= value
+ *
+ * @param addr Address of value to modify
+ * @param value Amount to subtract from the value at addr
+ *
+ * @return old value
+ */
+extern int32_t __attribute__((overloadable))
+    rsAtomicSub(volatile uint32_t* addr, uint32_t value);
+
+/**
+ * Atomic Bitwise and a value from the value at addr.  addr[0] &= value
+ *
+ * @param addr Address of value to modify
+ * @param value Amount to and with the value at addr
+ *
+ * @return old value
+ */
+extern int32_t __attribute__((overloadable))
+    rsAtomicAnd(volatile uint32_t* addr, uint32_t value);
+
+/**
+ * Atomic Bitwise or a value from the value at addr.  addr[0] |= value
+ *
+ * @param addr Address of value to modify
+ * @param value Amount to or with the value at addr
+ *
+ * @return old value
+ */
+extern int32_t __attribute__((overloadable))
+    rsAtomicOr(volatile uint32_t* addr, uint32_t value);
+
+/**
+ * Atomic Bitwise xor a value from the value at addr.  addr[0] ^= value
+ *
+ * @param addr Address of value to modify
+ * @param value Amount to xor with the value at addr
+ *
+ * @return old value
+ */
+extern int32_t __attribute__((overloadable))
+    rsAtomicXor(volatile uint32_t* addr, uint32_t value);
+
+#endif //defined(RS_VERSION) && (RS_VERSION >= 20)
+
+#endif
+
diff --git a/21.1.2/include/rs_core.rsh b/21.1.2/include/rs_core.rsh
new file mode 100644
index 0000000..3489e44
--- /dev/null
+++ b/21.1.2/include/rs_core.rsh
@@ -0,0 +1,191 @@
+/*
+ * Copyright (C) 2011-2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /*! \mainpage notitle
+  *
+  * RenderScript is a high-performance runtime that provides
+  * compute operations at the native level. RenderScript code is compiled on devices
+  * at runtime to allow platform-independence as well.
+  * This reference documentation describes the RenderScript runtime APIs, which you
+  * can utilize to write RenderScript code in C99. The RenderScript compute header
+  * files are automatically included for you.
+  *
+  * To use RenderScript, you need to utilize the RenderScript runtime APIs documented here
+  * as well as the Android framework APIs for RenderScript.
+  * For documentation on the Android framework APIs, see the <a target="_parent" href=
+  * "http://developer.android.com/reference/android/renderscript/package-summary.html">
+  * android.renderscript</a> package reference.
+  * For more information on how to develop with RenderScript and how the runtime and
+  * Android framework APIs interact, see the <a target="_parent" href=
+  * "http://developer.android.com/guide/topics/renderscript/index.html">RenderScript
+  * developer guide</a> and the <a target="_parent" href=
+  * "http://developer.android.com/resources/samples/RenderScript/index.html">
+  * RenderScript samples</a>.
+  */
+
+/** @file rs_core.rsh
+ *  \brief todo-jsams
+ *
+ *  todo-jsams
+ *
+ */
+
+#ifndef __RS_CORE_RSH__
+#define __RS_CORE_RSH__
+
+#define _RS_RUNTIME extern
+
+#define RS_KERNEL __attribute__((kernel))
+
+#include "rs_types.rsh"
+#include "rs_allocation.rsh"
+#include "rs_atomic.rsh"
+#include "rs_core_math.rsh"
+#include "rs_debug.rsh"
+#include "rs_element.rsh"
+#include "rs_math.rsh"
+#include "rs_matrix.rsh"
+#include "rs_object.rsh"
+#include "rs_quaternion.rsh"
+#include "rs_sampler.rsh"
+#include "rs_time.rsh"
+
+/**
+ * Send a message back to the client.  Will not block and returns true
+ * if the message was sendable and false if the fifo was full.
+ * A message ID is required.  Data payload is optional.
+ */
+extern bool __attribute__((overloadable))
+    rsSendToClient(int cmdID);
+/**
+ * \overload
+ */
+extern bool __attribute__((overloadable))
+    rsSendToClient(int cmdID, const void *data, uint len);
+/**
+ * Send a message back to the client, blocking until the message is queued.
+ * A message ID is required.  Data payload is optional.
+ */
+extern void __attribute__((overloadable))
+    rsSendToClientBlocking(int cmdID);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable))
+    rsSendToClientBlocking(int cmdID, const void *data, uint len);
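+
+/*
+ * Illustrative sketch (not part of the header): sending a small payload back
+ * to the framework.  kMsgDone is an arbitrary id chosen for this example; the
+ * Java side would match it in its message handler.
+ *
+ *   static const int kMsgDone = 100;
+ *
+ *   void reportDone(int result) {
+ *       // Non-blocking send; fall back to the blocking variant if the fifo is full.
+ *       if (!rsSendToClient(kMsgDone, &result, sizeof(result))) {
+ *           rsSendToClientBlocking(kMsgDone, &result, sizeof(result));
+ *       }
+ *   }
+ */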
+
+
+/**
+ * Launch order hint for rsForEach calls.  This provides a hint to the system to
+ * determine in which order the root function of the target is called with each
+ * cell of the allocation.
+ *
+ * This is a hint and implementations may not obey the order.
+ */
+enum rs_for_each_strategy {
+    RS_FOR_EACH_STRATEGY_SERIAL = 0,
+    RS_FOR_EACH_STRATEGY_DONT_CARE = 1,
+    RS_FOR_EACH_STRATEGY_DST_LINEAR = 2,
+    RS_FOR_EACH_STRATEGY_TILE_SMALL = 3,
+    RS_FOR_EACH_STRATEGY_TILE_MEDIUM = 4,
+    RS_FOR_EACH_STRATEGY_TILE_LARGE = 5
+};
+
+
+/**
+ * Structure to provide extra information to an rsForEach call.  Primarily used to
+ * restrict the call to a subset of cells in the allocation.
+ */
+typedef struct rs_script_call {
+    enum rs_for_each_strategy strategy;
+    uint32_t xStart;
+    uint32_t xEnd;
+    uint32_t yStart;
+    uint32_t yEnd;
+    uint32_t zStart;
+    uint32_t zEnd;
+    uint32_t arrayStart;
+    uint32_t arrayEnd;
+} rs_script_call_t;
+
+/**
+ * Make a script-to-script call to launch work. Either the input or the output
+ * is required to be a valid object. The input and output must be of the same
+ * dimensions.
+ * API 10-13
+ *
+ * @param script The target script to call
+ * @param input The allocation to source data from
+ * @param output The allocation to write data into
+ * @param usrData The user defined params to pass to the root script.  May be
+ *                NULL.
+ * @param sc Extra control information used to select a sub-region of the
+ *           allocation to be processed or suggest a walking strategy.  May be
+ *           NULL.
+ */
+#if !defined(RS_VERSION) || (RS_VERSION < 14)
+extern void __attribute__((overloadable))
+    rsForEach(rs_script script, rs_allocation input,
+              rs_allocation output, const void * usrData,
+              const rs_script_call_t *sc);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable))
+    rsForEach(rs_script script, rs_allocation input,
+              rs_allocation output, const void * usrData);
+#else
+
+/**
+ * Make a script-to-script call to launch work. Either the input or the output
+ * is required to be a valid object. The input and output must be of the same
+ * dimensions.
+ * API 14+
+ *
+ * @param script The target script to call
+ * @param input The allocation to source data from
+ * @param output The allocation to write data into
+ * @param usrData The user defined params to pass to the root script.  May be
+ *                NULL.
+ * @param usrDataLen The size of the usrData structure.  This will be used to
+ *                   perform a shallow copy of the data if necessary.
+ * @param sc Extra control information used to select a sub-region of the
+ *           allocation to be processed or suggest a walking strategy.  May be
+ *           NULL.
+ */
+extern void __attribute__((overloadable))
+    rsForEach(rs_script script, rs_allocation input, rs_allocation output,
+              const void * usrData, size_t usrDataLen, const rs_script_call_t *);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable))
+    rsForEach(rs_script script, rs_allocation input, rs_allocation output,
+              const void * usrData, size_t usrDataLen);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable))
+    rsForEach(rs_script script, rs_allocation input, rs_allocation output);
+#endif
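+
+/*
+ * Illustrative sketch (not part of the header): restricting an rsForEach
+ * launch to a sub-region with rs_script_call_t, using the API 14+ overload.
+ * gScript, gIn and gOut are assumed script globals bound from the framework;
+ * the fields left at zero are assumed to mean "no restriction" on those
+ * dimensions.
+ *
+ *   rs_script gScript;
+ *   rs_allocation gIn, gOut;
+ *
+ *   void launchSubRegion() {
+ *       rs_script_call_t sc = {0};
+ *       sc.strategy = RS_FOR_EACH_STRATEGY_DONT_CARE;
+ *       sc.xStart = 16;   // process only cells with 16 <= x < 48
+ *       sc.xEnd   = 48;
+ *       rsForEach(gScript, gIn, gOut, NULL, 0, &sc);
+ *   }
+ */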
+
+
+
+#undef _RS_RUNTIME
+
+#endif
diff --git a/21.1.2/include/rs_core_math.rsh b/21.1.2/include/rs_core_math.rsh
new file mode 100644
index 0000000..287a1b9
--- /dev/null
+++ b/21.1.2/include/rs_core_math.rsh
@@ -0,0 +1,9888 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Don't edit this file!  It is auto-generated by frameworks/rs/api/gen_runtime.
+
+#ifndef __rs_core_math_rsh__
+#define __rs_core_math_rsh__
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the absolute value of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uchar __attribute__((const, overloadable))abs(char value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the absolute value of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uchar2 __attribute__((const, overloadable))abs(char2 value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the absolute value of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uchar3 __attribute__((const, overloadable))abs(char3 value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the absolute value of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uchar4 __attribute__((const, overloadable))abs(char4 value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the absolute value of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern ushort __attribute__((const, overloadable))abs(short value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the absolute value of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern ushort2 __attribute__((const, overloadable))abs(short2 value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the absolute value of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern ushort3 __attribute__((const, overloadable))abs(short3 value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the absolute value of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern ushort4 __attribute__((const, overloadable))abs(short4 value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the absolute value of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uint __attribute__((const, overloadable))abs(int value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the absolute value of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uint2 __attribute__((const, overloadable))abs(int2 value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the absolute value of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uint3 __attribute__((const, overloadable))abs(int3 value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the absolute value of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uint4 __attribute__((const, overloadable))abs(int4 value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse cosine.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))acos(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse cosine.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))acos(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse cosine.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))acos(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse cosine.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))acos(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse hyperbolic cosine.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))acosh(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse hyperbolic cosine.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))acosh(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse hyperbolic cosine.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))acosh(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse hyperbolic cosine.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))acosh(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse cosine divided by PI.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))acospi(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse cosine divided by PI.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))acospi(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse cosine divided by PI.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))acospi(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse cosine divided by PI.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))acospi(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse sine.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))asin(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse sine.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))asin(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse sine.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))asin(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse sine.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))asin(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse hyperbolic sine.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))asinh(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse hyperbolic sine.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))asinh(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse hyperbolic sine.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))asinh(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse hyperbolic sine.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))asinh(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse sine divided by PI.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))asinpi(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse sine divided by PI.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))asinpi(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse sine divided by PI.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))asinpi(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse sine divided by PI.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))asinpi(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse tangent.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))atan(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse tangent.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))atan(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse tangent.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))atan(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse tangent.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))atan(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse tangent of y / x.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))atan2(float y, float x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse tangent of y / x.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))atan2(float2 y, float2 x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse tangent of y / x.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))atan2(float3 y, float3 x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse tangent of y / x.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))atan2(float4 y, float4 x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse tangent of y / x, divided by PI.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))atan2pi(float y, float x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse tangent of y / x, divided by PI.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))atan2pi(float2 y, float2 x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse tangent of y / x, divided by PI.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))atan2pi(float3 y, float3 x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse tangent of y / x, divided by PI.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))atan2pi(float4 y, float4 x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse hyperbolic tangent.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))atanh(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse hyperbolic tangent.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))atanh(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse hyperbolic tangent.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))atanh(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse hyperbolic tangent.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))atanh(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse tangent divided by PI.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))atanpi(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse tangent divided by PI.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))atanpi(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse tangent divided by PI.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))atanpi(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse tangent divided by PI.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))atanpi(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the cube root.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))cbrt(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the cube root.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))cbrt(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the cube root.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))cbrt(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the cube root.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))cbrt(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the smallest integer not less than a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))ceil(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the smallest integer not less than a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))ceil(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the smallest integer not less than a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))ceil(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the smallest integer not less than a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))ceil(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))clamp(float value, float min_value, float max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))clamp(float2 value, float2 min_value, float2 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))clamp(float3 value, float3 min_value, float3 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))clamp(float4 value, float4 min_value, float4 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))clamp(float2 value, float min_value, float max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))clamp(float3 value, float min_value, float max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))clamp(float4 value, float min_value, float max_value);
+#endif
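+
+/*
+ * Illustrative sketch (not part of the header): the overloads above differ
+ * only in whether the bounds are per-component vectors or shared scalars.
+ *
+ *   float4 v  = {-0.5f, 0.25f, 1.5f, 2.0f};
+ *   float4 lo = {0.0f, 0.0f, 0.0f, 1.0f};
+ *   float4 hi = {1.0f, 1.0f, 1.0f, 3.0f};
+ *   float4 a = clamp(v, 0.0f, 1.0f);   // -> {0.0f, 0.25f, 1.0f, 1.0f}
+ *   float4 b = clamp(v, lo, hi);       // -> {0.0f, 0.25f, 1.0f, 2.0f}
+ */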
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern char __attribute__((const, overloadable))clamp(char value, char min_value, char max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern char2 __attribute__((const, overloadable))clamp(char2 value, char2 min_value, char2 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern char3 __attribute__((const, overloadable))clamp(char3 value, char3 min_value, char3 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern char4 __attribute__((const, overloadable))clamp(char4 value, char4 min_value, char4 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern uchar __attribute__((const, overloadable))clamp(uchar value, uchar min_value, uchar max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern uchar2 __attribute__((const, overloadable))clamp(uchar2 value, uchar2 min_value, uchar2 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern uchar3 __attribute__((const, overloadable))clamp(uchar3 value, uchar3 min_value, uchar3 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern uchar4 __attribute__((const, overloadable))clamp(uchar4 value, uchar4 min_value, uchar4 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern short __attribute__((const, overloadable))clamp(short value, short min_value, short max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern short2 __attribute__((const, overloadable))clamp(short2 value, short2 min_value, short2 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern short3 __attribute__((const, overloadable))clamp(short3 value, short3 min_value, short3 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern short4 __attribute__((const, overloadable))clamp(short4 value, short4 min_value, short4 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern ushort __attribute__((const, overloadable))clamp(ushort value, ushort min_value, ushort max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern ushort2 __attribute__((const, overloadable))clamp(ushort2 value, ushort2 min_value, ushort2 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern ushort3 __attribute__((const, overloadable))clamp(ushort3 value, ushort3 min_value, ushort3 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern ushort4 __attribute__((const, overloadable))clamp(ushort4 value, ushort4 min_value, ushort4 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern int __attribute__((const, overloadable))clamp(int value, int min_value, int max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern int2 __attribute__((const, overloadable))clamp(int2 value, int2 min_value, int2 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern int3 __attribute__((const, overloadable))clamp(int3 value, int3 min_value, int3 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern int4 __attribute__((const, overloadable))clamp(int4 value, int4 min_value, int4 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern uint __attribute__((const, overloadable))clamp(uint value, uint min_value, uint max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern uint2 __attribute__((const, overloadable))clamp(uint2 value, uint2 min_value, uint2 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern uint3 __attribute__((const, overloadable))clamp(uint3 value, uint3 min_value, uint3 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern uint4 __attribute__((const, overloadable))clamp(uint4 value, uint4 min_value, uint4 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern long __attribute__((const, overloadable))clamp(long value, long min_value, long max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern long2 __attribute__((const, overloadable))clamp(long2 value, long2 min_value, long2 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern long3 __attribute__((const, overloadable))clamp(long3 value, long3 min_value, long3 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern long4 __attribute__((const, overloadable))clamp(long4 value, long4 min_value, long4 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern ulong __attribute__((const, overloadable))clamp(ulong value, ulong min_value, ulong max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern ulong2 __attribute__((const, overloadable))clamp(ulong2 value, ulong2 min_value, ulong2 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern ulong3 __attribute__((const, overloadable))clamp(ulong3 value, ulong3 min_value, ulong3 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern ulong4 __attribute__((const, overloadable))clamp(ulong4 value, ulong4 min_value, ulong4 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern char2 __attribute__((const, overloadable))clamp(char2 value, char min_value, char max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern char3 __attribute__((const, overloadable))clamp(char3 value, char min_value, char max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern char4 __attribute__((const, overloadable))clamp(char4 value, char min_value, char max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern uchar2 __attribute__((const, overloadable))clamp(uchar2 value, uchar min_value, uchar max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern uchar3 __attribute__((const, overloadable))clamp(uchar3 value, uchar min_value, uchar max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern uchar4 __attribute__((const, overloadable))clamp(uchar4 value, uchar min_value, uchar max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern short2 __attribute__((const, overloadable))clamp(short2 value, short min_value, short max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern short3 __attribute__((const, overloadable))clamp(short3 value, short min_value, short max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern short4 __attribute__((const, overloadable))clamp(short4 value, short min_value, short max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern ushort2 __attribute__((const, overloadable))clamp(ushort2 value, ushort min_value, ushort max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern ushort3 __attribute__((const, overloadable))clamp(ushort3 value, ushort min_value, ushort max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern ushort4 __attribute__((const, overloadable))clamp(ushort4 value, ushort min_value, ushort max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern int2 __attribute__((const, overloadable))clamp(int2 value, int min_value, int max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern int3 __attribute__((const, overloadable))clamp(int3 value, int min_value, int max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param value Value to be clamped.  Supports 1, 2, 3, and 4 components.
+ * @param min_value Lower bound, must be a scalar or matching vector.
+ * @param max_value Upper bound, must match the type of min_value.
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern int4 __attribute__((const, overloadable))clamp(int4 value, int min_value, int max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param value Value to be clamped.  Supports 1, 2, 3, and 4 components.
+ * @param min_value Lower bound, must be a scalar or matching vector.
+ * @param max_value Upper bound, must match the type of min_value.
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern uint2 __attribute__((const, overloadable))clamp(uint2 value, uint min_value, uint max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param value Value to be clamped.  Supports 1, 2, 3, and 4 components.
+ * @param min_value Lower bound, must be a scalar or matching vector.
+ * @param max_value Upper bound, must match the type of min_value.
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern uint3 __attribute__((const, overloadable))clamp(uint3 value, uint min_value, uint max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param value Value to be clamped.  Supports 1, 2, 3, and 4 components.
+ * @param min_value Lower bound, must be a scalar or matching vector.
+ * @param max_value Upper bound, must match the type of min_value.
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern uint4 __attribute__((const, overloadable))clamp(uint4 value, uint min_value, uint max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param value Value to be clamped.  Supports 1, 2, 3, and 4 components.
+ * @param min_value Lower bound, must be a scalar or matching vector.
+ * @param max_value Upper bound, must match the type of min_value.
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern long2 __attribute__((const, overloadable))clamp(long2 value, long min_value, long max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param value Value to be clamped.  Supports 1, 2, 3, and 4 components.
+ * @param min_value Lower bound, must be a scalar or matching vector.
+ * @param max_value Upper bound, must match the type of min_value.
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern long3 __attribute__((const, overloadable))clamp(long3 value, long min_value, long max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param value Value to be clamped.  Supports 1, 2, 3, and 4 components.
+ * @param min_value Lower bound, must be a scalar or matching vector.
+ * @param max_value Upper bound, must match the type of min_value.
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern long4 __attribute__((const, overloadable))clamp(long4 value, long min_value, long max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param value Value to be clamped.  Supports 1, 2, 3, and 4 components.
+ * @param min_value Lower bound, must be a scalar or matching vector.
+ * @param max_value Upper bound, must match the type of min_value.
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern ulong2 __attribute__((const, overloadable))clamp(ulong2 value, ulong min_value, ulong max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param value Value to be clamped.  Supports 1, 2, 3, and 4 components.
+ * @param min_value Lower bound, must be a scalar or matching vector.
+ * @param max_value Upper bound, must match the type of min_value.
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern ulong3 __attribute__((const, overloadable))clamp(ulong3 value, ulong min_value, ulong max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param value Value to be clamped.  Supports 1, 2, 3, and 4 components.
+ * @param min_value Lower bound, must be a scalar or matching vector.
+ * @param max_value Upper bound, must match the type of min_value.
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern ulong4 __attribute__((const, overloadable))clamp(ulong4 value, ulong min_value, ulong max_value);
+#endif
+
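+/*
+ * A minimal usage sketch, assuming float4 v and int3 idx are locals in the
+ * calling script: clamp() limits each component of the value to the range
+ * [min_value, max_value], with a scalar bound applied to every component.
+ *
+ *   float4 bounded = clamp(v, 0.0f, 1.0f);  // per-component clamp with scalar bounds
+ *   int3 limited = clamp(idx, 0, 255);      // integer overloads require RS_VERSION >= 19
+ */
+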
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the number of leading 0-bits in a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern char __attribute__((const, overloadable))clz(char value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the number of leading 0-bits in a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern char2 __attribute__((const, overloadable))clz(char2 value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the number of leading 0-bits in a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern char3 __attribute__((const, overloadable))clz(char3 value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the number of leading 0-bits in a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern char4 __attribute__((const, overloadable))clz(char4 value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the number of leading 0-bits in a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uchar __attribute__((const, overloadable))clz(uchar value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the number of leading 0-bits in a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uchar2 __attribute__((const, overloadable))clz(uchar2 value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the number of leading 0-bits in a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uchar3 __attribute__((const, overloadable))clz(uchar3 value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the number of leading 0-bits in a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uchar4 __attribute__((const, overloadable))clz(uchar4 value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the number of leading 0-bits in a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern short __attribute__((const, overloadable))clz(short value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the number of leading 0-bits in a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern short2 __attribute__((const, overloadable))clz(short2 value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the number of leading 0-bits in a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern short3 __attribute__((const, overloadable))clz(short3 value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the number of leading 0-bits in a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern short4 __attribute__((const, overloadable))clz(short4 value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the number of leading 0-bits in a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern ushort __attribute__((const, overloadable))clz(ushort value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the number of leading 0-bits in a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern ushort2 __attribute__((const, overloadable))clz(ushort2 value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the number of leading 0-bits in a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern ushort3 __attribute__((const, overloadable))clz(ushort3 value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the number of leading 0-bits in a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern ushort4 __attribute__((const, overloadable))clz(ushort4 value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the number of leading 0-bits in a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern int __attribute__((const, overloadable))clz(int value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the number of leading 0-bits in a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern int2 __attribute__((const, overloadable))clz(int2 value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the number of leading 0-bits in a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern int3 __attribute__((const, overloadable))clz(int3 value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the number of leading 0-bits in a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern int4 __attribute__((const, overloadable))clz(int4 value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the number of leading 0-bits in a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uint __attribute__((const, overloadable))clz(uint value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the number of leading 0-bits in a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uint2 __attribute__((const, overloadable))clz(uint2 value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the number of leading 0-bits in a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uint3 __attribute__((const, overloadable))clz(uint3 value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the number of leading 0-bits in a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uint4 __attribute__((const, overloadable))clz(uint4 value);
+#endif
+
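+/*
+ * A minimal usage sketch: clz() returns the number of leading 0-bits of its
+ * argument in the same type, computed per component for vector arguments.
+ *
+ *   uint lead = clz(0x0000ffffu);   // 16 leading zero bits in a 32-bit value
+ *   int4 v = {1, 2, 4, 8};
+ *   int4 leads = clz(v);            // per component: {31, 30, 29, 28}
+ */
+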
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from float2 to float2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))convert_float2(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from float3 to float3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))convert_float3(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from float4 to float4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))convert_float4(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from char2 to float2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))convert_float2(char2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from char3 to float3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))convert_float3(char3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from char4 to float4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))convert_float4(char4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uchar2 to float2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))convert_float2(uchar2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uchar3 to float3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))convert_float3(uchar3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uchar4 to float4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))convert_float4(uchar4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from short2 to float2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))convert_float2(short2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from short3 to float3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))convert_float3(short3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from short4 to float4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))convert_float4(short4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from ushort2 to float2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))convert_float2(ushort2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from ushort3 to float3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))convert_float3(ushort3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from ushort4 to float4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))convert_float4(ushort4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from int2 to float2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))convert_float2(int2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from int3 to float3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))convert_float3(int3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from int4 to float4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))convert_float4(int4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uint2 to float2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))convert_float2(uint2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uint3 to float3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))convert_float3(uint3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uint4 to float4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))convert_float4(uint4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from float2 to char2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern char2 __attribute__((const, overloadable))convert_char2(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from float3 to char3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern char3 __attribute__((const, overloadable))convert_char3(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from float4 to char4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern char4 __attribute__((const, overloadable))convert_char4(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from char2 to char2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern char2 __attribute__((const, overloadable))convert_char2(char2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from char3 to char3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern char3 __attribute__((const, overloadable))convert_char3(char3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from char4 to char4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern char4 __attribute__((const, overloadable))convert_char4(char4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uchar2 to char2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern char2 __attribute__((const, overloadable))convert_char2(uchar2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uchar3 to char3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern char3 __attribute__((const, overloadable))convert_char3(uchar3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uchar4 to char4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern char4 __attribute__((const, overloadable))convert_char4(uchar4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from short2 to char2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern char2 __attribute__((const, overloadable))convert_char2(short2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from short3 to char3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern char3 __attribute__((const, overloadable))convert_char3(short3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from short4 to char4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern char4 __attribute__((const, overloadable))convert_char4(short4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from ushort2 to char2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern char2 __attribute__((const, overloadable))convert_char2(ushort2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from ushort3 to char3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern char3 __attribute__((const, overloadable))convert_char3(ushort3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from ushort4 to char4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern char4 __attribute__((const, overloadable))convert_char4(ushort4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from int2 to char2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern char2 __attribute__((const, overloadable))convert_char2(int2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from int3 to char3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern char3 __attribute__((const, overloadable))convert_char3(int3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from int4 to char4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern char4 __attribute__((const, overloadable))convert_char4(int4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uint2 to char2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern char2 __attribute__((const, overloadable))convert_char2(uint2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uint3 to char3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern char3 __attribute__((const, overloadable))convert_char3(uint3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uint4 to char4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern char4 __attribute__((const, overloadable))convert_char4(uint4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from float2 to uchar2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uchar2 __attribute__((const, overloadable))convert_uchar2(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from float3 to uchar3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uchar3 __attribute__((const, overloadable))convert_uchar3(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from float4 to uchar4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uchar4 __attribute__((const, overloadable))convert_uchar4(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from char2 to uchar2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uchar2 __attribute__((const, overloadable))convert_uchar2(char2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from char3 to uchar3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uchar3 __attribute__((const, overloadable))convert_uchar3(char3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from char4 to uchar4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uchar4 __attribute__((const, overloadable))convert_uchar4(char4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uchar2 to uchar2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uchar2 __attribute__((const, overloadable))convert_uchar2(uchar2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uchar3 to uchar3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uchar3 __attribute__((const, overloadable))convert_uchar3(uchar3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uchar4 to uchar4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uchar4 __attribute__((const, overloadable))convert_uchar4(uchar4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from short2 to uchar2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uchar2 __attribute__((const, overloadable))convert_uchar2(short2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from short3 to uchar3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uchar3 __attribute__((const, overloadable))convert_uchar3(short3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from short4 to uchar4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uchar4 __attribute__((const, overloadable))convert_uchar4(short4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from ushort2 to uchar2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uchar2 __attribute__((const, overloadable))convert_uchar2(ushort2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from ushort3 to uchar3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uchar3 __attribute__((const, overloadable))convert_uchar3(ushort3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from ushort4 to uchar4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uchar4 __attribute__((const, overloadable))convert_uchar4(ushort4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from int2 to uchar2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uchar2 __attribute__((const, overloadable))convert_uchar2(int2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from int3 to uchar3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uchar3 __attribute__((const, overloadable))convert_uchar3(int3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from int4 to uchar4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uchar4 __attribute__((const, overloadable))convert_uchar4(int4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uint2 to uchar2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uchar2 __attribute__((const, overloadable))convert_uchar2(uint2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uint3 to uchar3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uchar3 __attribute__((const, overloadable))convert_uchar3(uint3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uint4 to uchar4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uchar4 __attribute__((const, overloadable))convert_uchar4(uint4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from float2 to short2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern short2 __attribute__((const, overloadable))convert_short2(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from float3 to short3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern short3 __attribute__((const, overloadable))convert_short3(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from float4 to short4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern short4 __attribute__((const, overloadable))convert_short4(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from char2 to short2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern short2 __attribute__((const, overloadable))convert_short2(char2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from char3 to short3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern short3 __attribute__((const, overloadable))convert_short3(char3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from char4 to short4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern short4 __attribute__((const, overloadable))convert_short4(char4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uchar2 to short2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern short2 __attribute__((const, overloadable))convert_short2(uchar2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uchar3 to short3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern short3 __attribute__((const, overloadable))convert_short3(uchar3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uchar4 to short4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern short4 __attribute__((const, overloadable))convert_short4(uchar4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from short2 to short2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern short2 __attribute__((const, overloadable))convert_short2(short2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from short3 to short3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern short3 __attribute__((const, overloadable))convert_short3(short3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from short4 to short4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern short4 __attribute__((const, overloadable))convert_short4(short4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from ushort2 to short2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern short2 __attribute__((const, overloadable))convert_short2(ushort2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from ushort3 to short3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern short3 __attribute__((const, overloadable))convert_short3(ushort3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from ushort4 to short4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern short4 __attribute__((const, overloadable))convert_short4(ushort4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from int2 to short2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern short2 __attribute__((const, overloadable))convert_short2(int2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from int3 to short3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern short3 __attribute__((const, overloadable))convert_short3(int3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from int4 to short4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern short4 __attribute__((const, overloadable))convert_short4(int4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uint2 to short2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern short2 __attribute__((const, overloadable))convert_short2(uint2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uint3 to short3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern short3 __attribute__((const, overloadable))convert_short3(uint3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uint4 to short4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern short4 __attribute__((const, overloadable))convert_short4(uint4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from float2 to ushort2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern ushort2 __attribute__((const, overloadable))convert_ushort2(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from float3 to ushort3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern ushort3 __attribute__((const, overloadable))convert_ushort3(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from float4 to ushort4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern ushort4 __attribute__((const, overloadable))convert_ushort4(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from char2 to ushort2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern ushort2 __attribute__((const, overloadable))convert_ushort2(char2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from char3 to ushort3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern ushort3 __attribute__((const, overloadable))convert_ushort3(char3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from char4 to ushort4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern ushort4 __attribute__((const, overloadable))convert_ushort4(char4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uchar2 to ushort2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern ushort2 __attribute__((const, overloadable))convert_ushort2(uchar2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uchar3 to ushort3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern ushort3 __attribute__((const, overloadable))convert_ushort3(uchar3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uchar4 to ushort4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern ushort4 __attribute__((const, overloadable))convert_ushort4(uchar4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from short2 to ushort2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern ushort2 __attribute__((const, overloadable))convert_ushort2(short2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from short3 to ushort3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern ushort3 __attribute__((const, overloadable))convert_ushort3(short3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from short4 to ushort4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern ushort4 __attribute__((const, overloadable))convert_ushort4(short4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from ushort2 to ushort2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern ushort2 __attribute__((const, overloadable))convert_ushort2(ushort2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from ushort3 to ushort3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern ushort3 __attribute__((const, overloadable))convert_ushort3(ushort3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from ushort4 to ushort4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern ushort4 __attribute__((const, overloadable))convert_ushort4(ushort4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from int2 to ushort2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern ushort2 __attribute__((const, overloadable))convert_ushort2(int2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from int3 to ushort3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern ushort3 __attribute__((const, overloadable))convert_ushort3(int3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from int4 to ushort4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern ushort4 __attribute__((const, overloadable))convert_ushort4(int4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uint2 to ushort2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern ushort2 __attribute__((const, overloadable))convert_ushort2(uint2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uint3 to ushort3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern ushort3 __attribute__((const, overloadable))convert_ushort3(uint3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uint4 to ushort4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern ushort4 __attribute__((const, overloadable))convert_ushort4(uint4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from float2 to int2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern int2 __attribute__((const, overloadable))convert_int2(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from float3 to int3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern int3 __attribute__((const, overloadable))convert_int3(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from float4 to int4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern int4 __attribute__((const, overloadable))convert_int4(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from char2 to int2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern int2 __attribute__((const, overloadable))convert_int2(char2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from char3 to int3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern int3 __attribute__((const, overloadable))convert_int3(char3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from char4 to int4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern int4 __attribute__((const, overloadable))convert_int4(char4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uchar2 to int2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern int2 __attribute__((const, overloadable))convert_int2(uchar2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uchar3 to int3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern int3 __attribute__((const, overloadable))convert_int3(uchar3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uchar4 to int4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern int4 __attribute__((const, overloadable))convert_int4(uchar4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from short2 to int2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern int2 __attribute__((const, overloadable))convert_int2(short2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from short3 to int3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern int3 __attribute__((const, overloadable))convert_int3(short3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from short4 to int4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern int4 __attribute__((const, overloadable))convert_int4(short4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from ushort2 to int2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern int2 __attribute__((const, overloadable))convert_int2(ushort2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from ushort3 to int3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern int3 __attribute__((const, overloadable))convert_int3(ushort3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from ushort4 to int4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern int4 __attribute__((const, overloadable))convert_int4(ushort4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from int2 to int2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern int2 __attribute__((const, overloadable))convert_int2(int2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from int3 to int3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern int3 __attribute__((const, overloadable))convert_int3(int3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from int4 to int4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern int4 __attribute__((const, overloadable))convert_int4(int4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uint2 to int2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern int2 __attribute__((const, overloadable))convert_int2(uint2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uint3 to int3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern int3 __attribute__((const, overloadable))convert_int3(uint3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uint4 to int4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern int4 __attribute__((const, overloadable))convert_int4(uint4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from float2 to uint2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uint2 __attribute__((const, overloadable))convert_uint2(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from float3 to uint3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uint3 __attribute__((const, overloadable))convert_uint3(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from float4 to uint4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uint4 __attribute__((const, overloadable))convert_uint4(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from char2 to uint2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uint2 __attribute__((const, overloadable))convert_uint2(char2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from char3 to uint3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uint3 __attribute__((const, overloadable))convert_uint3(char3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from char4 to uint4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uint4 __attribute__((const, overloadable))convert_uint4(char4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uchar2 to uint2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uint2 __attribute__((const, overloadable))convert_uint2(uchar2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uchar3 to uint3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uint3 __attribute__((const, overloadable))convert_uint3(uchar3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uchar4 to uint4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uint4 __attribute__((const, overloadable))convert_uint4(uchar4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from short2 to uint2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uint2 __attribute__((const, overloadable))convert_uint2(short2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from short3 to uint3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uint3 __attribute__((const, overloadable))convert_uint3(short3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from short4 to uint4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uint4 __attribute__((const, overloadable))convert_uint4(short4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from ushort2 to uint2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uint2 __attribute__((const, overloadable))convert_uint2(ushort2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from ushort3 to uint3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uint3 __attribute__((const, overloadable))convert_uint3(ushort3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from ushort4 to uint4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uint4 __attribute__((const, overloadable))convert_uint4(ushort4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from int2 to uint2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uint2 __attribute__((const, overloadable))convert_uint2(int2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from int3 to uint3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uint3 __attribute__((const, overloadable))convert_uint3(int3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from int4 to uint4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uint4 __attribute__((const, overloadable))convert_uint4(int4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uint2 to uint2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uint2 __attribute__((const, overloadable))convert_uint2(uint2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uint3 to uint3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uint3 __attribute__((const, overloadable))convert_uint3(uint3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uint4 to uint4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uint4 __attribute__((const, overloadable))convert_uint4(uint4 v);
+#endif
+
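+/*
+ * A minimal usage sketch, assuming a uchar4 pixel named in: the convert_*
+ * functions change the component type of a vector while keeping its component
+ * count, the usual way to widen pixel data for arithmetic and narrow it back.
+ *
+ *   float4 f = convert_float4(in) / 255.0f;   // uchar4 -> float4, then normalize
+ *   uchar4 out = convert_uchar4(f * 255.0f);  // float4 -> uchar4 after processing
+ */
+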
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from double2 to double2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern double2 __attribute__((const, overloadable))convert_double2(double2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from double3 to double3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern double3 __attribute__((const, overloadable))convert_double3(double3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from double4 to double4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern double4 __attribute__((const, overloadable))convert_double4(double4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from long2 to double2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern double2 __attribute__((const, overloadable))convert_double2(long2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from long3 to double3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern double3 __attribute__((const, overloadable))convert_double3(long3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from long4 to double4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern double4 __attribute__((const, overloadable))convert_double4(long4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ulong2 to double2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern double2 __attribute__((const, overloadable))convert_double2(ulong2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ulong3 to double3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern double3 __attribute__((const, overloadable))convert_double3(ulong3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ulong4 to double4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern double4 __attribute__((const, overloadable))convert_double4(ulong4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from double2 to long2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long2 __attribute__((const, overloadable))convert_long2(double2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from double3 to long3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long3 __attribute__((const, overloadable))convert_long3(double3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from double4 to long4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long4 __attribute__((const, overloadable))convert_long4(double4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from long2 to long2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long2 __attribute__((const, overloadable))convert_long2(long2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from long3 to long3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long3 __attribute__((const, overloadable))convert_long3(long3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from long4 to long4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long4 __attribute__((const, overloadable))convert_long4(long4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ulong2 to long2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long2 __attribute__((const, overloadable))convert_long2(ulong2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ulong3 to long3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long3 __attribute__((const, overloadable))convert_long3(ulong3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ulong4 to long4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long4 __attribute__((const, overloadable))convert_long4(ulong4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from double2 to ulong2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong2 __attribute__((const, overloadable))convert_ulong2(double2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from double3 to ulong3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong3 __attribute__((const, overloadable))convert_ulong3(double3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from double4 to ulong4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong4 __attribute__((const, overloadable))convert_ulong4(double4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from long2 to ulong2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong2 __attribute__((const, overloadable))convert_ulong2(long2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from long3 to ulong3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong3 __attribute__((const, overloadable))convert_ulong3(long3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from long4 to ulong4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong4 __attribute__((const, overloadable))convert_ulong4(long4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ulong2 to ulong2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong2 __attribute__((const, overloadable))convert_ulong2(ulong2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ulong3 to ulong3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong3 __attribute__((const, overloadable))convert_ulong3(ulong3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ulong4 to ulong4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong4 __attribute__((const, overloadable))convert_ulong4(ulong4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from double2 to float2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))convert_float2(double2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from double3 to float3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))convert_float3(double3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from double4 to float4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))convert_float4(double4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from long2 to float2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))convert_float2(long2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from long3 to float3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))convert_float3(long3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from long4 to float4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))convert_float4(long4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ulong2 to float2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))convert_float2(ulong2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ulong3 to float3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))convert_float3(ulong3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ulong4 to float4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))convert_float4(ulong4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from double2 to char2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern char2 __attribute__((const, overloadable))convert_char2(double2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from double3 to char3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern char3 __attribute__((const, overloadable))convert_char3(double3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from double4 to char4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern char4 __attribute__((const, overloadable))convert_char4(double4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from long2 to char2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern char2 __attribute__((const, overloadable))convert_char2(long2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from long3 to char3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern char3 __attribute__((const, overloadable))convert_char3(long3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from long4 to char4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern char4 __attribute__((const, overloadable))convert_char4(long4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ulong2 to char2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern char2 __attribute__((const, overloadable))convert_char2(ulong2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ulong3 to char3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern char3 __attribute__((const, overloadable))convert_char3(ulong3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ulong4 to char4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern char4 __attribute__((const, overloadable))convert_char4(ulong4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from double2 to uchar2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern uchar2 __attribute__((const, overloadable))convert_uchar2(double2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from double3 to uchar3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern uchar3 __attribute__((const, overloadable))convert_uchar3(double3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from double4 to uchar4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern uchar4 __attribute__((const, overloadable))convert_uchar4(double4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from long2 to uchar2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern uchar2 __attribute__((const, overloadable))convert_uchar2(long2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from long3 to uchar3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern uchar3 __attribute__((const, overloadable))convert_uchar3(long3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from long4 to uchar4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern uchar4 __attribute__((const, overloadable))convert_uchar4(long4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ulong2 to uchar2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern uchar2 __attribute__((const, overloadable))convert_uchar2(ulong2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ulong3 to uchar3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern uchar3 __attribute__((const, overloadable))convert_uchar3(ulong3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ulong4 to uchar4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern uchar4 __attribute__((const, overloadable))convert_uchar4(ulong4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from double2 to short2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern short2 __attribute__((const, overloadable))convert_short2(double2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from double3 to short3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern short3 __attribute__((const, overloadable))convert_short3(double3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from double4 to short4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern short4 __attribute__((const, overloadable))convert_short4(double4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from long2 to short2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern short2 __attribute__((const, overloadable))convert_short2(long2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from long3 to short3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern short3 __attribute__((const, overloadable))convert_short3(long3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from long4 to short4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern short4 __attribute__((const, overloadable))convert_short4(long4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ulong2 to short2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern short2 __attribute__((const, overloadable))convert_short2(ulong2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ulong3 to short3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern short3 __attribute__((const, overloadable))convert_short3(ulong3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ulong4 to short4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern short4 __attribute__((const, overloadable))convert_short4(ulong4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from double2 to ushort2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ushort2 __attribute__((const, overloadable))convert_ushort2(double2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from double3 to ushort3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ushort3 __attribute__((const, overloadable))convert_ushort3(double3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from double4 to ushort4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ushort4 __attribute__((const, overloadable))convert_ushort4(double4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from long2 to ushort2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ushort2 __attribute__((const, overloadable))convert_ushort2(long2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from long3 to ushort3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ushort3 __attribute__((const, overloadable))convert_ushort3(long3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from long4 to ushort4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ushort4 __attribute__((const, overloadable))convert_ushort4(long4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ulong2 to ushort2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ushort2 __attribute__((const, overloadable))convert_ushort2(ulong2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ulong3 to ushort3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ushort3 __attribute__((const, overloadable))convert_ushort3(ulong3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ulong4 to ushort4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ushort4 __attribute__((const, overloadable))convert_ushort4(ulong4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from double2 to int2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern int2 __attribute__((const, overloadable))convert_int2(double2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from double3 to int3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern int3 __attribute__((const, overloadable))convert_int3(double3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from double4 to int4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern int4 __attribute__((const, overloadable))convert_int4(double4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from long2 to int2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern int2 __attribute__((const, overloadable))convert_int2(long2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from long3 to int3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern int3 __attribute__((const, overloadable))convert_int3(long3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from long4 to int4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern int4 __attribute__((const, overloadable))convert_int4(long4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ulong2 to int2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern int2 __attribute__((const, overloadable))convert_int2(ulong2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ulong3 to int3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern int3 __attribute__((const, overloadable))convert_int3(ulong3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ulong4 to int4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern int4 __attribute__((const, overloadable))convert_int4(ulong4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from double2 to uint2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern uint2 __attribute__((const, overloadable))convert_uint2(double2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from double3 to uint3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern uint3 __attribute__((const, overloadable))convert_uint3(double3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from double4 to uint4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern uint4 __attribute__((const, overloadable))convert_uint4(double4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from long2 to uint2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern uint2 __attribute__((const, overloadable))convert_uint2(long2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from long3 to uint3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern uint3 __attribute__((const, overloadable))convert_uint3(long3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from long4 to uint4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern uint4 __attribute__((const, overloadable))convert_uint4(long4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ulong2 to uint2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern uint2 __attribute__((const, overloadable))convert_uint2(ulong2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ulong3 to uint3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern uint3 __attribute__((const, overloadable))convert_uint3(ulong3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ulong4 to uint4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern uint4 __attribute__((const, overloadable))convert_uint4(ulong4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from float2 to double2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern double2 __attribute__((const, overloadable))convert_double2(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from float3 to double3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern double3 __attribute__((const, overloadable))convert_double3(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from float4 to double4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern double4 __attribute__((const, overloadable))convert_double4(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from char2 to double2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern double2 __attribute__((const, overloadable))convert_double2(char2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from char3 to double3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern double3 __attribute__((const, overloadable))convert_double3(char3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from char4 to double4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern double4 __attribute__((const, overloadable))convert_double4(char4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from uchar2 to double2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern double2 __attribute__((const, overloadable))convert_double2(uchar2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from uchar3 to double3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern double3 __attribute__((const, overloadable))convert_double3(uchar3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from uchar4 to double4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern double4 __attribute__((const, overloadable))convert_double4(uchar4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from short2 to double2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern double2 __attribute__((const, overloadable))convert_double2(short2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from short3 to double3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern double3 __attribute__((const, overloadable))convert_double3(short3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from short4 to double4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern double4 __attribute__((const, overloadable))convert_double4(short4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ushort2 to double2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern double2 __attribute__((const, overloadable))convert_double2(ushort2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ushort3 to double3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern double3 __attribute__((const, overloadable))convert_double3(ushort3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ushort4 to double4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern double4 __attribute__((const, overloadable))convert_double4(ushort4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from int2 to double2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern double2 __attribute__((const, overloadable))convert_double2(int2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from int3 to double3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern double3 __attribute__((const, overloadable))convert_double3(int3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from int4 to double4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern double4 __attribute__((const, overloadable))convert_double4(int4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from uint2 to double2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern double2 __attribute__((const, overloadable))convert_double2(uint2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from uint3 to double3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern double3 __attribute__((const, overloadable))convert_double3(uint3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from uint4 to double4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern double4 __attribute__((const, overloadable))convert_double4(uint4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from float2 to long2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long2 __attribute__((const, overloadable))convert_long2(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from float3 to long3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long3 __attribute__((const, overloadable))convert_long3(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from float4 to long4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long4 __attribute__((const, overloadable))convert_long4(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from char2 to long2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long2 __attribute__((const, overloadable))convert_long2(char2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from char3 to long3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long3 __attribute__((const, overloadable))convert_long3(char3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from char4 to long4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long4 __attribute__((const, overloadable))convert_long4(char4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from uchar2 to long2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long2 __attribute__((const, overloadable))convert_long2(uchar2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from uchar3 to long3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long3 __attribute__((const, overloadable))convert_long3(uchar3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from uchar4 to long4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long4 __attribute__((const, overloadable))convert_long4(uchar4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from short2 to long2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long2 __attribute__((const, overloadable))convert_long2(short2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from short3 to long3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long3 __attribute__((const, overloadable))convert_long3(short3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from short4 to long4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long4 __attribute__((const, overloadable))convert_long4(short4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ushort2 to long2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long2 __attribute__((const, overloadable))convert_long2(ushort2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ushort3 to long3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long3 __attribute__((const, overloadable))convert_long3(ushort3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ushort4 to long4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long4 __attribute__((const, overloadable))convert_long4(ushort4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from int2 to long2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long2 __attribute__((const, overloadable))convert_long2(int2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from int3 to long3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long3 __attribute__((const, overloadable))convert_long3(int3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from int4 to long4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long4 __attribute__((const, overloadable))convert_long4(int4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from uint2 to long2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long2 __attribute__((const, overloadable))convert_long2(uint2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from uint3 to long3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long3 __attribute__((const, overloadable))convert_long3(uint3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from uint4 to long4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long4 __attribute__((const, overloadable))convert_long4(uint4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from float2 to ulong2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong2 __attribute__((const, overloadable))convert_ulong2(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from float3 to ulong3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong3 __attribute__((const, overloadable))convert_ulong3(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from float4 to ulong4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong4 __attribute__((const, overloadable))convert_ulong4(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from char2 to ulong2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong2 __attribute__((const, overloadable))convert_ulong2(char2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from char3 to ulong3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong3 __attribute__((const, overloadable))convert_ulong3(char3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from char4 to ulong4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong4 __attribute__((const, overloadable))convert_ulong4(char4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from uchar2 to ulong2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong2 __attribute__((const, overloadable))convert_ulong2(uchar2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from uchar3 to ulong3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong3 __attribute__((const, overloadable))convert_ulong3(uchar3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from uchar4 to ulong4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong4 __attribute__((const, overloadable))convert_ulong4(uchar4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from short2 to ulong2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong2 __attribute__((const, overloadable))convert_ulong2(short2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from short3 to ulong3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong3 __attribute__((const, overloadable))convert_ulong3(short3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from short4 to ulong4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong4 __attribute__((const, overloadable))convert_ulong4(short4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ushort2 to ulong2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong2 __attribute__((const, overloadable))convert_ulong2(ushort2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ushort3 to ulong3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong3 __attribute__((const, overloadable))convert_ulong3(ushort3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ushort4 to ulong4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong4 __attribute__((const, overloadable))convert_ulong4(ushort4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from int2 to ulong2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong2 __attribute__((const, overloadable))convert_ulong2(int2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from int3 to ulong3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong3 __attribute__((const, overloadable))convert_ulong3(int3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from int4 to ulong4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong4 __attribute__((const, overloadable))convert_ulong4(int4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from uint2 to ulong2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong2 __attribute__((const, overloadable))convert_ulong2(uint2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from uint3 to ulong3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong3 __attribute__((const, overloadable))convert_ulong3(uint3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from uint4 to ulong4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong4 __attribute__((const, overloadable))convert_ulong4(uint4 v);
+#endif
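+
+/*
+ * Usage sketch (illustrative only; the declarations above are the API):
+ * each convert_<type><width>() overload converts a vector component-wise,
+ * keeping the number of components.  Variable names below are hypothetical.
+ *
+ *   double4 d = {0.5, 1.5, -2.5, 3.0};
+ *   int4 i = convert_int4(d);          each lane converted independently
+ */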
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Copy the sign bit from y to x.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))copysign(float x, float y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Copy the sign bit from y to x.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))copysign(float2 x, float2 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Copy the sign bit from y to x.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))copysign(float3 x, float3 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Copy the sign bit from y to x.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))copysign(float4 x, float4 y);
+#endif
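+
+/*
+ * Usage sketch (illustrative): copysign(x, y) keeps the magnitude of x and
+ * takes the sign of y, per component for the vector overloads.  For example,
+ * copysign(3.0f, -0.0f) evaluates to -3.0f.
+ */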
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the cosine.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))cos(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the cosine.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))cos(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the cosine.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))cos(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the cosine.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))cos(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the hyperbolic cosine.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))cosh(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the hyperbolic cosine.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))cosh(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the hyperbolic cosine.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))cosh(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the hyperbolic cosine.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))cosh(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the cosine of the value * PI.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))cospi(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the cosine of the value * PI.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))cospi(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the cosine of the value * PI.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))cospi(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the cosine of the value * PI.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))cospi(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Compute the cross product of two vectors.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))cross(float3 lhs, float3 rhs);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Compute the cross product of two vectors.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))cross(float4 lhs, float4 rhs);
+#endif
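+
+/*
+ * Usage sketch (illustrative): cross() operates on the first three
+ * components and follows the right-hand rule.  Hypothetical example:
+ *
+ *   float3 x = {1.f, 0.f, 0.f};
+ *   float3 y = {0.f, 1.f, 0.f};
+ *   float3 z = cross(x, y);            result is {0.f, 0.f, 1.f}
+ */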
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Convert from radians to degrees.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))degrees(float value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Convert from radians to degrees.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))degrees(float2 value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Convert from radians to degrees.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))degrees(float3 value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Convert from radians to degrees.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))degrees(float4 value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Compute the distance between two points.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))distance(float lhs, float rhs);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Compute the distance between two points.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))distance(float2 lhs, float2 rhs);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Compute the distance between two points.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))distance(float3 lhs, float3 rhs);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Compute the distance between two points.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))distance(float4 lhs, float4 rhs);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Compute the dot product of two vectors.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))dot(float lhs, float rhs);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Compute the dot product of two vectors.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))dot(float2 lhs, float2 rhs);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Compute the dot product of two vectors.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))dot(float3 lhs, float3 rhs);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Compute the dot product of two vectors.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))dot(float4 lhs, float4 rhs);
+#endif
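+
+/*
+ * Usage sketch (illustrative): dot() reduces a pair of vectors to a scalar,
+ * and distance(lhs, rhs) is the length of lhs - rhs.  Hypothetical example:
+ *
+ *   float3 a = {1.f, 2.f, 2.f};
+ *   float len = length(a);             3.0f
+ *   float d   = distance(a, a);        0.0f
+ *   float p   = dot(a, a);             9.0f, i.e. len * len
+ */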
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the error function.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))erf(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the error function.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))erf(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the error function.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))erf(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the error function.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))erf(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the complementary error function.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))erfc(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the complementary error function.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))erfc(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the complementary error function.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))erfc(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the complementary error function.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))erfc(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return e ^ value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))exp(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return e ^ value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))exp(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return e ^ value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))exp(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return e ^ value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))exp(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return 10 ^ value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))exp10(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return 10 ^ value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))exp10(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return 10 ^ value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))exp10(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return 10 ^ value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))exp10(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return 2 ^ value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))exp2(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return 2 ^ value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))exp2(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return 2 ^ value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))exp2(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return 2 ^ value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))exp2(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return (e ^ value) - 1.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))expm1(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return (e ^ value) - 1.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))expm1(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return (e ^ value) - 1.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))expm1(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return (e ^ value) - 1.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))expm1(float4);
+#endif
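+
+/*
+ * Usage sketch (illustrative): exp(), exp2() and exp10() raise e, 2 and 10
+ * to the given power.  expm1(v) computes exp(v) - 1 and is useful when v is
+ * small, where computing exp(v) - 1.f directly would lose precision.
+ */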
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the absolute value of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))fabs(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the absolute value of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))fabs(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the absolute value of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))fabs(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the absolute value of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))fabs(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+/*
+ * Compute the approximate distance between two points.
+ *
+ * Supported by API versions 17 and newer.
+ */
+extern float __attribute__((const, overloadable))fast_distance(float lhs, float rhs);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+/*
+ * Compute the approximate distance between two points.
+ *
+ * Supported by API versions 17 and newer.
+ */
+extern float __attribute__((const, overloadable))fast_distance(float2 lhs, float2 rhs);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+/*
+ * Compute the approximate distance between two points.
+ *
+ * Supported by API versions 17 and newer.
+ */
+extern float __attribute__((const, overloadable))fast_distance(float3 lhs, float3 rhs);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+/*
+ * Compute the approximate distance between two points.
+ *
+ * Supported by API versions 17 and newer.
+ */
+extern float __attribute__((const, overloadable))fast_distance(float4 lhs, float4 rhs);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+/*
+ * Compute the approximate length of a vector.
+ *
+ * Supported by API versions 17 and newer.
+ */
+extern float __attribute__((const, overloadable))fast_length(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+/*
+ * Compute the approximate length of a vector.
+ *
+ * Supported by API versions 17 and newer.
+ */
+extern float __attribute__((const, overloadable))fast_length(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+/*
+ * Compute the approximate length of a vector.
+ *
+ * Supported by API versions 17 and newer.
+ */
+extern float __attribute__((const, overloadable))fast_length(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+/*
+ * Compute the approximate length of a vector.
+ *
+ * Supported by API versions 17 and newer.
+ */
+extern float __attribute__((const, overloadable))fast_length(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+/*
+ * Approximately normalize a vector.
+ *
+ * Supported by API versions 17 and newer.
+ */
+extern float __attribute__((const, overloadable))fast_normalize(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+/*
+ * Approximately normalize a vector.
+ *
+ * Supported by API versions 17 and newer.
+ */
+extern float2 __attribute__((const, overloadable))fast_normalize(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+/*
+ * Approximately normalize a vector.
+ *
+ * Supported by API versions 17 and newer.
+ */
+extern float3 __attribute__((const, overloadable))fast_normalize(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+/*
+ * Approximately normalize a vector.
+ *
+ * Supported by API versions 17 and newer.
+ */
+extern float4 __attribute__((const, overloadable))fast_normalize(float4 v);
+#endif
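+
+/*
+ * Usage sketch (illustrative): the fast_* variants above trade precision
+ * for speed.  fast_length(v) approximates length(v), fast_distance(a, b)
+ * approximates distance(a, b), and fast_normalize(v) approximates
+ * v / length(v); prefer the exact forms when full float precision matters.
+ */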
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the positive difference between two values.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))fdim(float a, float b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the positive difference between two values.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))fdim(float2 a, float2 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the positive difference between two values.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))fdim(float3 a, float3 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the positive difference between two values.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))fdim(float4 a, float4 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the largest integer not greater than a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))floor(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the largest integer not greater than a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))floor(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the largest integer not greater than a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))floor(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the largest integer not greater than a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))floor(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return (a * b) + c.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))fma(float a, float b, float c);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return (a * b) + c.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))fma(float2 a, float2 b, float2 c);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return (a * b) + c.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))fma(float3 a, float3 b, float3 c);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return (a * b) + c.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))fma(float4 a, float4 b, float4 c);
+#endif
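+
+/*
+ * Usage sketch (illustrative): fma(a, b, c) computes (a * b) + c, and is
+ * intended to round once rather than twice, which can reduce error in
+ * accumulation loops.  Hypothetical variables:
+ *
+ *   acc = fma(weight, sample, acc);
+ */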
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return (x < y ? y : x)
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))fmax(float x, float y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return (x < y ? y : x)
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))fmax(float2 x, float2 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return (x < y ? y : x)
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))fmax(float3 x, float3 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return (x < y ? y : x)
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))fmax(float4 x, float4 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return (x < y ? y : x)
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))fmax(float2 x, float y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return (x < y ? y : x)
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))fmax(float3 x, float y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return (x < y ? y : x)
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))fmax(float4 x, float y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return (x > y ? y : x)
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))fmin(float x, float y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return (x > y ? y : x)
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))fmin(float2 x, float2 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return (x > y ? y : x)
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))fmin(float3 x, float3 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return (x > y ? y : x)
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))fmin(float4 x, float4 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return (x > y ? y : x)
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))fmin(float2 x, float y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return (x > y ? y : x)
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))fmin(float3 x, float y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return (x > y ? y : x)
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))fmin(float4 x, float y);
+#endif
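+
+/*
+ * Usage sketch (illustrative): the mixed overloads above broadcast the
+ * scalar y across every component of x, so fmax(v, 0.f) raises each
+ * component of v to at least 0.f and fmin(v, 1.f) caps each component at
+ * 1.f; combining the two gives a per-component clamp to [0, 1].
+ */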
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the remainder from x / y
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))fmod(float x, float y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the remainder from x / y
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))fmod(float2 x, float2 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the remainder from x / y
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))fmod(float3 x, float3 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the remainder from x / y
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))fmod(float4 x, float4 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return fractional part of v
+ *
+ * @param floor  floor[0] will be set to the floor of the input value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((overloadable))fract(float v, float* floor);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return fractional part of v
+ *
+ * @param floor  floor[0] will be set to the floor of the input value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((overloadable))fract(float2 v, float2* floor);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return fractional part of v
+ *
+ * @param floor  floor[0] will be set to the floor of the input value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((overloadable))fract(float3 v, float3* floor);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return fractional part of v
+ *
+ * @param floor  floor[0] will be set to the floor of the input value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((overloadable))fract(float4 v, float4* floor);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return fractional part of v
+ *
+ * Supported by API versions 9 and newer.
+ */
+static float __attribute__((const, overloadable))fract(float v) {
+ float unused;
+ return fract(v, &unused);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return fractional part of v
+ *
+ * Supported by API versions 9 and newer.
+ */
+static float2 __attribute__((const, overloadable))fract(float2 v) {
+ float2 unused;
+ return fract(v, &unused);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return fractional part of v
+ *
+ * Supported by API versions 9 and newer.
+ */
+static float3 __attribute__((const, overloadable))fract(float3 v) {
+ float3 unused;
+ return fract(v, &unused);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return fractional part of v
+ *
+ * Supported by API versions 9 and newer.
+ */
+static float4 __attribute__((const, overloadable))fract(float4 v) {
+ float4 unused;
+ return fract(v, &unused);
+}
+#endif
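+
+/*
+ * Usage sketch (illustrative): the one-argument fract() wrappers above
+ * discard the floor.  fract(2.75f) evaluates to 0.75f; the two-argument
+ * form also stores the floor through the pointer:
+ *
+ *   float fl;
+ *   float fr = fract(2.75f, &fl);      fr == 0.75f, fl == 2.f
+ */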
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the mantissa and place the exponent into iptr[0]
+ *
+ * @param v Supports float, float2, float3, float4.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((overloadable))frexp(float v, int* iptr);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the mantissa and place the exponent into iptr[0]
+ *
+ * @param v Supports float, float2, float3, float4.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((overloadable))frexp(float2 v, int2* iptr);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the mantissa and place the exponent into iptr[0]
+ *
+ * @param v Supports float, float2, float3, float4.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((overloadable))frexp(float3 v, int3* iptr);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the mantissa and place the exponent into iptr[0]
+ *
+ * @param v Supports float, float2, float3, float4.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((overloadable))frexp(float4 v, int4* iptr);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+/*
+ * Return the approximate reciprocal of a value.
+ *
+ * Supported by API versions 17 and newer.
+ */
+extern float __attribute__((const, overloadable))half_recip(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+/*
+ * Return the approximate reciprocal of a value.
+ *
+ * Supported by API versions 17 and newer.
+ */
+extern float2 __attribute__((const, overloadable))half_recip(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+/*
+ * Return the approximate reciprocal of a value.
+ *
+ * Supported by API versions 17 and newer.
+ */
+extern float3 __attribute__((const, overloadable))half_recip(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+/*
+ * Return the approximate reciprocal of a value.
+ *
+ * Supported by API versions 17 and newer.
+ */
+extern float4 __attribute__((const, overloadable))half_recip(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+/*
+ * Return the approximate value of (1.f / sqrt(value)).
+ *
+ * Supported by API versions 17 and newer.
+ */
+extern float __attribute__((const, overloadable))half_rsqrt(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+/*
+ * Return the approximate value of (1.f / sqrt(value)).
+ *
+ * Supported by API versions 17 and newer.
+ */
+extern float2 __attribute__((const, overloadable))half_rsqrt(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+/*
+ * Return the approximate value of (1.f / sqrt(value)).
+ *
+ * Supported by API versions 17 and newer.
+ */
+extern float3 __attribute__((const, overloadable))half_rsqrt(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+/*
+ * Return the approximate value of (1.f / sqrt(value)).
+ *
+ * Supported by API versions 17 and newer.
+ */
+extern float4 __attribute__((const, overloadable))half_rsqrt(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+/*
+ * Return the approximate square root of a value.
+ *
+ * Supported by API versions 17 and newer.
+ */
+extern float __attribute__((const, overloadable))half_sqrt(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+/*
+ * Return the approximate square root of a value.
+ *
+ * Supported by API versions 17 and newer.
+ */
+extern float2 __attribute__((const, overloadable))half_sqrt(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+/*
+ * Return the approximate square root of a value.
+ *
+ * Supported by API versions 17 and newer.
+ */
+extern float3 __attribute__((const, overloadable))half_sqrt(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+/*
+ * Return the approximate square root of a value.
+ *
+ * Supported by API versions 17 and newer.
+ */
+extern float4 __attribute__((const, overloadable))half_sqrt(float4 v);
+#endif
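+
+/*
+ * Usage sketch (illustrative): half_recip(v), half_rsqrt(v) and
+ * half_sqrt(v) are lower-precision approximations of 1.f / v,
+ * 1.f / sqrt(v) and sqrt(v), suited to cases where speed matters more
+ * than the last few bits of precision.
+ */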
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return sqrt(x*x + y*y)
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))hypot(float x, float y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return sqrt(x*x + y*y)
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))hypot(float2 x, float2 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return sqrt(x*x + y*y)
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))hypot(float3 x, float3 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return sqrt(x*x + y*y)
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))hypot(float4 x, float4 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the integer exponent of a value
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern int __attribute__((const, overloadable))ilogb(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the integer exponent of a value
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern int2 __attribute__((const, overloadable))ilogb(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the integer exponent of a value
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern int3 __attribute__((const, overloadable))ilogb(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the integer exponent of a value
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern int4 __attribute__((const, overloadable))ilogb(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return (x * 2^y)
+ *
+ * @param x Supports 1,2,3,4 components
+ * @param y Supports single component or matching vector.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))ldexp(float x, int y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return (x * 2^y)
+ *
+ * @param x Supports 1,2,3,4 components
+ * @param y Supports single component or matching vector.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))ldexp(float2 x, int2 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return (x * 2^y)
+ *
+ * @param x Supports 1,2,3,4 components
+ * @param y Supports single component or matching vector.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))ldexp(float3 x, int3 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return (x * 2^y)
+ *
+ * @param x Supports 1,2,3,4 components
+ * @param y Supports single component or matching vector.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))ldexp(float4 x, int4 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return (x * 2^y)
+ *
+ * @param x Supports 1,2,3,4 components
+ * @param y Supports single component or matching vector.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))ldexp(float2 x, int y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return (x * 2^y)
+ *
+ * @param x Supports 1,2,3,4 components
+ * @param y Supports single component or matching vector.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))ldexp(float3 x, int y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return (x * 2^y)
+ *
+ * @param x Supports 1,2,3,4 components
+ * @param y Supports single component or matching vector.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))ldexp(float4 x, int y);
+#endif
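+
+/*
+ * Usage sketch (illustrative only; hypothetical values): the scalar-exponent
+ * overloads scale every component of the vector by the same power of two.
+ *
+ *   float4 v = {1.f, 2.f, 3.f, 4.f};
+ *   float4 scaled = ldexp(v, 3);  // {8.f, 16.f, 24.f, 32.f}
+ */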
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Compute the length of a vector.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))length(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Compute the length of a vector.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))length(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Compute the length of a vector.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))length(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Compute the length of a vector.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))length(float4 v);
+#endif
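+
+/*
+ * Usage sketch (illustrative only): length() returns the Euclidean magnitude,
+ * i.e. the square root of the sum of the squared components.
+ *
+ *   float3 v = {1.f, 2.f, 2.f};
+ *   float len = length(v);  // 3.f
+ */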
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the log gamma
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))lgamma(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the log gamma
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))lgamma(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the log gamma
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))lgamma(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the log gamma
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))lgamma(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the log gamma and sign
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((overloadable))lgamma(float x, int* y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the log gamma and sign
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((overloadable))lgamma(float2 x, int2* y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the log gamma and sign
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((overloadable))lgamma(float3 x, int3* y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the log gamma and sign
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((overloadable))lgamma(float4 x, int4* y);
+#endif
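+
+/*
+ * Usage sketch (illustrative only; hypothetical variable names): the
+ * two-argument overloads store the sign of gamma(x) through the second
+ * parameter while returning the log of its magnitude.
+ *
+ *   int sign;
+ *   float lg = lgamma(2.5f, &sign);  // lg = log(|gamma(2.5f)|), sign = +1
+ */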
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the natural logarithm.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))log(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the natural logarithm.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))log(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the natural logarithm.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))log(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the natural logarithm.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))log(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the base 10 logarithm.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))log10(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the base 10 logarithm.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))log10(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the base 10 logarithm.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))log10(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the base 10 logarithm.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))log10(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the natural logarithm of (v + 1.0f)
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))log1p(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the natural logarithm of (v + 1.0f)
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))log1p(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the natural logarithm of (v + 1.0f)
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))log1p(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the natural logarithm of (v + 1.0f)
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))log1p(float4);
+#endif
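+
+/*
+ * Usage sketch (illustrative only): log1p(v) is preferable to log(1.f + v)
+ * when v is very small, because the explicit addition loses precision.
+ *
+ *   float tiny = 1e-7f;
+ *   float a = log1p(tiny);      // accurate
+ *   float b = log(1.f + tiny);  // 1.f + tiny rounds away most of tiny
+ */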
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the base 2 logarithm.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))log2(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the base 2 logarithm.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))log2(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the base 2 logarithm.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))log2(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the base 2 logarithm.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))log2(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Compute the exponent of the value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))logb(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Compute the exponent of the value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))logb(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Compute the exponent of the value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))logb(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Compute the exponent of the value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))logb(float4);
+#endif
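+
+/*
+ * Usage sketch (illustrative only): ilogb() (declared above) returns the
+ * binary exponent as an integer, while logb() returns it as a float.
+ *
+ *   int e = ilogb(8.f);    // 3
+ *   float fe = logb(8.f);  // 3.f
+ */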
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Compute (a * b) + c
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))mad(float a, float b, float c);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Compute (a * b) + c
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))mad(float2 a, float2 b, float2 c);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Compute (a * b) + c
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))mad(float3 a, float3 b, float3 c);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Compute (a * b) + c
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))mad(float4 a, float4 b, float4 c);
+#endif
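+
+/*
+ * Usage sketch (illustrative only; hypothetical variable names): mad(a, b, c)
+ * evaluates (a * b) + c and may be lowered to a single multiply-add where the
+ * hardware supports it.
+ *
+ *   float4 acc = {0.f, 0.f, 0.f, 0.f};
+ *   float4 w = {1.f, 2.f, 3.f, 4.f};
+ *   float4 x = {5.f, 6.f, 7.f, 8.f};
+ *   acc = mad(w, x, acc);  // {5.f, 12.f, 21.f, 32.f}
+ */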
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))max(float, float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))max(float2, float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))max(float3, float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))max(float4, float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static char __attribute__((const, overloadable))max(char v1, char v2) {
+ return (v1 > v2 ? v1 : v2);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static uchar __attribute__((const, overloadable))max(uchar v1, uchar v2) {
+ return (v1 > v2 ? v1 : v2);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static short __attribute__((const, overloadable))max(short v1, short v2) {
+ return (v1 > v2 ? v1 : v2);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static ushort __attribute__((const, overloadable))max(ushort v1, ushort v2) {
+ return (v1 > v2 ? v1 : v2);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static int __attribute__((const, overloadable))max(int v1, int v2) {
+ return (v1 > v2 ? v1 : v2);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static uint __attribute__((const, overloadable))max(uint v1, uint v2) {
+ return (v1 > v2 ? v1 : v2);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static char2 __attribute__((const, overloadable))max(char2 v1, char2 v2) {
+ char2 tmp;
+ tmp.x = (v1.x > v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y > v2.y ? v1.y : v2.y);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static uchar2 __attribute__((const, overloadable))max(uchar2 v1, uchar2 v2) {
+ uchar2 tmp;
+ tmp.x = (v1.x > v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y > v2.y ? v1.y : v2.y);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static short2 __attribute__((const, overloadable))max(short2 v1, short2 v2) {
+ short2 tmp;
+ tmp.x = (v1.x > v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y > v2.y ? v1.y : v2.y);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static ushort2 __attribute__((const, overloadable))max(ushort2 v1, ushort2 v2) {
+ ushort2 tmp;
+ tmp.x = (v1.x > v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y > v2.y ? v1.y : v2.y);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static int2 __attribute__((const, overloadable))max(int2 v1, int2 v2) {
+ int2 tmp;
+ tmp.x = (v1.x > v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y > v2.y ? v1.y : v2.y);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static uint2 __attribute__((const, overloadable))max(uint2 v1, uint2 v2) {
+ uint2 tmp;
+ tmp.x = (v1.x > v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y > v2.y ? v1.y : v2.y);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static char3 __attribute__((const, overloadable))max(char3 v1, char3 v2) {
+ char3 tmp;
+ tmp.x = (v1.x > v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y > v2.y ? v1.y : v2.y);
+ tmp.z = (v1.z > v2.z ? v1.z : v2.z);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static uchar3 __attribute__((const, overloadable))max(uchar3 v1, uchar3 v2) {
+ uchar3 tmp;
+ tmp.x = (v1.x > v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y > v2.y ? v1.y : v2.y);
+ tmp.z = (v1.z > v2.z ? v1.z : v2.z);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static short3 __attribute__((const, overloadable))max(short3 v1, short3 v2) {
+ short3 tmp;
+ tmp.x = (v1.x > v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y > v2.y ? v1.y : v2.y);
+ tmp.z = (v1.z > v2.z ? v1.z : v2.z);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static ushort3 __attribute__((const, overloadable))max(ushort3 v1, ushort3 v2) {
+ ushort3 tmp;
+ tmp.x = (v1.x > v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y > v2.y ? v1.y : v2.y);
+ tmp.z = (v1.z > v2.z ? v1.z : v2.z);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static int3 __attribute__((const, overloadable))max(int3 v1, int3 v2) {
+ int3 tmp;
+ tmp.x = (v1.x > v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y > v2.y ? v1.y : v2.y);
+ tmp.z = (v1.z > v2.z ? v1.z : v2.z);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static uint3 __attribute__((const, overloadable))max(uint3 v1, uint3 v2) {
+ uint3 tmp;
+ tmp.x = (v1.x > v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y > v2.y ? v1.y : v2.y);
+ tmp.z = (v1.z > v2.z ? v1.z : v2.z);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static char4 __attribute__((const, overloadable))max(char4 v1, char4 v2) {
+ char4 tmp;
+ tmp.x = (v1.x > v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y > v2.y ? v1.y : v2.y);
+ tmp.z = (v1.z > v2.z ? v1.z : v2.z);
+ tmp.w = (v1.w > v2.w ? v1.w : v2.w);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static uchar4 __attribute__((const, overloadable))max(uchar4 v1, uchar4 v2) {
+ uchar4 tmp;
+ tmp.x = (v1.x > v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y > v2.y ? v1.y : v2.y);
+ tmp.z = (v1.z > v2.z ? v1.z : v2.z);
+ tmp.w = (v1.w > v2.w ? v1.w : v2.w);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static short4 __attribute__((const, overloadable))max(short4 v1, short4 v2) {
+ short4 tmp;
+ tmp.x = (v1.x > v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y > v2.y ? v1.y : v2.y);
+ tmp.z = (v1.z > v2.z ? v1.z : v2.z);
+ tmp.w = (v1.w > v2.w ? v1.w : v2.w);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static ushort4 __attribute__((const, overloadable))max(ushort4 v1, ushort4 v2) {
+ ushort4 tmp;
+ tmp.x = (v1.x > v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y > v2.y ? v1.y : v2.y);
+ tmp.z = (v1.z > v2.z ? v1.z : v2.z);
+ tmp.w = (v1.w > v2.w ? v1.w : v2.w);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static int4 __attribute__((const, overloadable))max(int4 v1, int4 v2) {
+ int4 tmp;
+ tmp.x = (v1.x > v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y > v2.y ? v1.y : v2.y);
+ tmp.z = (v1.z > v2.z ? v1.z : v2.z);
+ tmp.w = (v1.w > v2.w ? v1.w : v2.w);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static uint4 __attribute__((const, overloadable))max(uint4 v1, uint4 v2) {
+ uint4 tmp;
+ tmp.x = (v1.x > v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y > v2.y ? v1.y : v2.y);
+ tmp.z = (v1.z > v2.z ? v1.z : v2.z);
+ tmp.w = (v1.w > v2.w ? v1.w : v2.w);
+ return tmp;
+}
+#endif
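+
+/*
+ * Usage sketch (illustrative only; hypothetical values): call sites look the
+ * same whether the integer overloads resolve to the static inline definitions
+ * above (RS_VERSION 9 - 19) or to the extern declarations that follow
+ * (RS_VERSION 21 and newer).
+ *
+ *   int3 a = {1, 5, 2};
+ *   int3 b = {4, 2, 9};
+ *   int3 m = max(a, b);  // {4, 5, 9}
+ */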
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern char __attribute__((const, overloadable))max(char v1, char v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern char2 __attribute__((const, overloadable))max(char2 v1, char2 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern char3 __attribute__((const, overloadable))max(char3 v1, char3 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern char4 __attribute__((const, overloadable))max(char4 v1, char4 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern uchar __attribute__((const, overloadable))max(uchar v1, uchar v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern uchar2 __attribute__((const, overloadable))max(uchar2 v1, uchar2 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern uchar3 __attribute__((const, overloadable))max(uchar3 v1, uchar3 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern uchar4 __attribute__((const, overloadable))max(uchar4 v1, uchar4 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern short __attribute__((const, overloadable))max(short v1, short v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern short2 __attribute__((const, overloadable))max(short2 v1, short2 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern short3 __attribute__((const, overloadable))max(short3 v1, short3 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern short4 __attribute__((const, overloadable))max(short4 v1, short4 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ushort __attribute__((const, overloadable))max(ushort v1, ushort v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ushort2 __attribute__((const, overloadable))max(ushort2 v1, ushort2 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ushort3 __attribute__((const, overloadable))max(ushort3 v1, ushort3 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ushort4 __attribute__((const, overloadable))max(ushort4 v1, ushort4 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern int __attribute__((const, overloadable))max(int v1, int v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern int2 __attribute__((const, overloadable))max(int2 v1, int2 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern int3 __attribute__((const, overloadable))max(int3 v1, int3 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern int4 __attribute__((const, overloadable))max(int4 v1, int4 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern uint __attribute__((const, overloadable))max(uint v1, uint v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern uint2 __attribute__((const, overloadable))max(uint2 v1, uint2 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern uint3 __attribute__((const, overloadable))max(uint3 v1, uint3 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern uint4 __attribute__((const, overloadable))max(uint4 v1, uint4 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long __attribute__((const, overloadable))max(long v1, long v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long2 __attribute__((const, overloadable))max(long2 v1, long2 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long3 __attribute__((const, overloadable))max(long3 v1, long3 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long4 __attribute__((const, overloadable))max(long4 v1, long4 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong __attribute__((const, overloadable))max(ulong v1, ulong v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong2 __attribute__((const, overloadable))max(ulong2 v1, ulong2 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong3 __attribute__((const, overloadable))max(ulong3 v1, ulong3 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong4 __attribute__((const, overloadable))max(ulong4 v1, ulong4 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))min(float, float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))min(float2, float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))min(float3, float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))min(float4, float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static char __attribute__((const, overloadable))min(char v1, char v2) {
+ return (v1 < v2 ? v1 : v2);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static uchar __attribute__((const, overloadable))min(uchar v1, uchar v2) {
+ return (v1 < v2 ? v1 : v2);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static short __attribute__((const, overloadable))min(short v1, short v2) {
+ return (v1 < v2 ? v1 : v2);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static ushort __attribute__((const, overloadable))min(ushort v1, ushort v2) {
+ return (v1 < v2 ? v1 : v2);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static int __attribute__((const, overloadable))min(int v1, int v2) {
+ return (v1 < v2 ? v1 : v2);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static uint __attribute__((const, overloadable))min(uint v1, uint v2) {
+ return (v1 < v2 ? v1 : v2);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static char2 __attribute__((const, overloadable))min(char2 v1, char2 v2) {
+ char2 tmp;
+ tmp.x = (v1.x < v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y < v2.y ? v1.y : v2.y);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static uchar2 __attribute__((const, overloadable))min(uchar2 v1, uchar2 v2) {
+ uchar2 tmp;
+ tmp.x = (v1.x < v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y < v2.y ? v1.y : v2.y);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static short2 __attribute__((const, overloadable))min(short2 v1, short2 v2) {
+ short2 tmp;
+ tmp.x = (v1.x < v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y < v2.y ? v1.y : v2.y);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static ushort2 __attribute__((const, overloadable))min(ushort2 v1, ushort2 v2) {
+ ushort2 tmp;
+ tmp.x = (v1.x < v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y < v2.y ? v1.y : v2.y);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static int2 __attribute__((const, overloadable))min(int2 v1, int2 v2) {
+ int2 tmp;
+ tmp.x = (v1.x < v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y < v2.y ? v1.y : v2.y);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static uint2 __attribute__((const, overloadable))min(uint2 v1, uint2 v2) {
+ uint2 tmp;
+ tmp.x = (v1.x < v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y < v2.y ? v1.y : v2.y);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static char3 __attribute__((const, overloadable))min(char3 v1, char3 v2) {
+ char3 tmp;
+ tmp.x = (v1.x < v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y < v2.y ? v1.y : v2.y);
+ tmp.z = (v1.z < v2.z ? v1.z : v2.z);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static uchar3 __attribute__((const, overloadable))min(uchar3 v1, uchar3 v2) {
+ uchar3 tmp;
+ tmp.x = (v1.x < v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y < v2.y ? v1.y : v2.y);
+ tmp.z = (v1.z < v2.z ? v1.z : v2.z);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static short3 __attribute__((const, overloadable))min(short3 v1, short3 v2) {
+ short3 tmp;
+ tmp.x = (v1.x < v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y < v2.y ? v1.y : v2.y);
+ tmp.z = (v1.z < v2.z ? v1.z : v2.z);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static ushort3 __attribute__((const, overloadable))min(ushort3 v1, ushort3 v2) {
+ ushort3 tmp;
+ tmp.x = (v1.x < v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y < v2.y ? v1.y : v2.y);
+ tmp.z = (v1.z < v2.z ? v1.z : v2.z);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static int3 __attribute__((const, overloadable))min(int3 v1, int3 v2) {
+ int3 tmp;
+ tmp.x = (v1.x < v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y < v2.y ? v1.y : v2.y);
+ tmp.z = (v1.z < v2.z ? v1.z : v2.z);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static uint3 __attribute__((const, overloadable))min(uint3 v1, uint3 v2) {
+ uint3 tmp;
+ tmp.x = (v1.x < v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y < v2.y ? v1.y : v2.y);
+ tmp.z = (v1.z < v2.z ? v1.z : v2.z);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static char4 __attribute__((const, overloadable))min(char4 v1, char4 v2) {
+ char4 tmp;
+ tmp.x = (v1.x < v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y < v2.y ? v1.y : v2.y);
+ tmp.z = (v1.z < v2.z ? v1.z : v2.z);
+ tmp.w = (v1.w < v2.w ? v1.w : v2.w);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static uchar4 __attribute__((const, overloadable))min(uchar4 v1, uchar4 v2) {
+ uchar4 tmp;
+ tmp.x = (v1.x < v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y < v2.y ? v1.y : v2.y);
+ tmp.z = (v1.z < v2.z ? v1.z : v2.z);
+ tmp.w = (v1.w < v2.w ? v1.w : v2.w);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static short4 __attribute__((const, overloadable))min(short4 v1, short4 v2) {
+ short4 tmp;
+ tmp.x = (v1.x < v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y < v2.y ? v1.y : v2.y);
+ tmp.z = (v1.z < v2.z ? v1.z : v2.z);
+ tmp.w = (v1.w < v2.w ? v1.w : v2.w);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static ushort4 __attribute__((const, overloadable))min(ushort4 v1, ushort4 v2) {
+ ushort4 tmp;
+ tmp.x = (v1.x < v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y < v2.y ? v1.y : v2.y);
+ tmp.z = (v1.z < v2.z ? v1.z : v2.z);
+ tmp.w = (v1.w < v2.w ? v1.w : v2.w);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static int4 __attribute__((const, overloadable))min(int4 v1, int4 v2) {
+ int4 tmp;
+ tmp.x = (v1.x < v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y < v2.y ? v1.y : v2.y);
+ tmp.z = (v1.z < v2.z ? v1.z : v2.z);
+ tmp.w = (v1.w < v2.w ? v1.w : v2.w);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static uint4 __attribute__((const, overloadable))min(uint4 v1, uint4 v2) {
+ uint4 tmp;
+ tmp.x = (v1.x < v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y < v2.y ? v1.y : v2.y);
+ tmp.z = (v1.z < v2.z ? v1.z : v2.z);
+ tmp.w = (v1.w < v2.w ? v1.w : v2.w);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern char __attribute__((const, overloadable))min(char v1, char v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern char2 __attribute__((const, overloadable))min(char2 v1, char2 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern char3 __attribute__((const, overloadable))min(char3 v1, char3 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern char4 __attribute__((const, overloadable))min(char4 v1, char4 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern uchar __attribute__((const, overloadable))min(uchar v1, uchar v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern uchar2 __attribute__((const, overloadable))min(uchar2 v1, uchar2 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern uchar3 __attribute__((const, overloadable))min(uchar3 v1, uchar3 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern uchar4 __attribute__((const, overloadable))min(uchar4 v1, uchar4 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern short __attribute__((const, overloadable))min(short v1, short v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern short2 __attribute__((const, overloadable))min(short2 v1, short2 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern short3 __attribute__((const, overloadable))min(short3 v1, short3 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern short4 __attribute__((const, overloadable))min(short4 v1, short4 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ushort __attribute__((const, overloadable))min(ushort v1, ushort v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ushort2 __attribute__((const, overloadable))min(ushort2 v1, ushort2 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ushort3 __attribute__((const, overloadable))min(ushort3 v1, ushort3 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ushort4 __attribute__((const, overloadable))min(ushort4 v1, ushort4 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern int __attribute__((const, overloadable))min(int v1, int v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern int2 __attribute__((const, overloadable))min(int2 v1, int2 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern int3 __attribute__((const, overloadable))min(int3 v1, int3 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern int4 __attribute__((const, overloadable))min(int4 v1, int4 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern uint __attribute__((const, overloadable))min(uint v1, uint v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern uint2 __attribute__((const, overloadable))min(uint2 v1, uint2 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern uint3 __attribute__((const, overloadable))min(uint3 v1, uint3 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern uint4 __attribute__((const, overloadable))min(uint4 v1, uint4 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long __attribute__((const, overloadable))min(long v1, long v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long2 __attribute__((const, overloadable))min(long2 v1, long2 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long3 __attribute__((const, overloadable))min(long3 v1, long3 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long4 __attribute__((const, overloadable))min(long4 v1, long4 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong __attribute__((const, overloadable))min(ulong v1, ulong v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong2 __attribute__((const, overloadable))min(ulong2 v1, ulong2 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong3 __attribute__((const, overloadable))min(ulong3 v1, ulong3 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong4 __attribute__((const, overloadable))min(ulong4 v1, ulong4 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return start + ((stop - start) * amount).
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))mix(float start, float stop, float amount);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return start + ((stop - start) * amount).
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))mix(float2 start, float2 stop, float2 amount);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return start + ((stop - start) * amount).
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))mix(float3 start, float3 stop, float3 amount);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return start + ((stop - start) * amount).
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))mix(float4 start, float4 stop, float4 amount);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return start + ((stop - start) * amount).
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))mix(float2 start, float2 stop, float amount);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return start + ((stop - start) * amount).
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))mix(float3 start, float3 stop, float amount);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return start + ((stop - start) * amount).
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))mix(float4 start, float4 stop, float amount);
+#endif
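+
+/*
+ * Usage sketch (illustrative only; hypothetical values): the scalar-amount
+ * overloads interpolate every component of start toward stop by the same
+ * fraction.
+ *
+ *   float3 start = {0.f, 0.f, 0.f};
+ *   float3 stop  = {2.f, 4.f, 8.f};
+ *   float3 mid = mix(start, stop, 0.5f);  // {1.f, 2.f, 4.f}
+ */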
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the integral and fractional components of a number.
+ *
+ * @param x Source value
+ * @param iret iret[0] will be set to the integral portion of the number.
+ * @return The fractional portion of the value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((overloadable))modf(float x, float* iret);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the integral and fractional components of a number.
+ *
+ * @param x Source value
+ * @param iret iret[0] will be set to the integral portion of the number.
+ * @return The fractional portion of the value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((overloadable))modf(float2 x, float2* iret);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the integral and fractional components of a number.
+ *
+ * @param x Source value
+ * @param iret iret[0] will be set to the integral portion of the number.
+ * @return The fractional portion of the value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((overloadable))modf(float3 x, float3* iret);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the integral and fractional components of a number.
+ *
+ * @param x Source value
+ * @param iret iret[0] will be set to the integral portion of the number.
+ * @return The fractional portion of the value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((overloadable))modf(float4 x, float4* iret);
+#endif
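+
+/*
+ * Usage sketch (illustrative only; hypothetical variable names):
+ *
+ *   float ipart;
+ *   float frac = modf(3.75f, &ipart);  // ipart = 3.f, frac = 0.75f
+ */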
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Generate a NaN (Not a Number) value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))nan(uint);
+#endif
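+
+/*
+ * Usage sketch (illustrative only): nan() produces a quiet NaN; the uint
+ * argument may be used to select payload bits (implementation defined).
+ *
+ *   float q = nan(0);  // q != q evaluates to true, as NaN compares unequal to itself
+ */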
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse cosine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_acos(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse cosine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_acos(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse cosine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_acos(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse cosine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_acos(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse hyperbolic cosine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_acosh(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse hyperbolic cosine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_acosh(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse hyperbolic cosine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_acosh(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse hyperbolic cosine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_acosh(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse cosine divided by PI.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_acospi(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse cosine divided by PI.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_acospi(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse cosine divided by PI.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_acospi(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse cosine divided by PI.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_acospi(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse sine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_asin(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse sine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_asin(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse sine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_asin(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse sine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_asin(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse hyperbolic sine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_asinh(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse hyperbolic sine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_asinh(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse hyperbolic sine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_asinh(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse hyperbolic sine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_asinh(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse sine divided by PI.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_asinpi(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse sine divided by PI.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_asinpi(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse sine divided by PI.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_asinpi(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse sine divided by PI.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_asinpi(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse tangent.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_atan(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse tangent.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_atan(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse tangent.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_atan(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse tangent.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_atan(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse tangent of y / x.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_atan2(float y, float x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse tangent of y / x.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_atan2(float2 y, float2 x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse tangent of y / x.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_atan2(float3 y, float3 x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse tangent of y / x.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_atan2(float4 y, float4 x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse tangent of y / x, divided by PI.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_atan2pi(float y, float x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse tangent of y / x, divided by PI.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_atan2pi(float2 y, float2 x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse tangent of y / x, divided by PI.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_atan2pi(float3 y, float3 x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse tangent of y / x, divided by PI.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_atan2pi(float4 y, float4 x);
+#endif
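+
+/*
+ * Usage sketch (illustrative only; hypothetical variable names): the native_*
+ * variants are allowed to trade precision for speed, so they suit visual work
+ * more than numerically sensitive code.
+ *
+ *   float2 y = {1.f, -1.f};
+ *   float2 x = {1.f,  1.f};
+ *   float2 turns = native_atan2pi(y, x);  // approximately {0.25f, -0.25f}
+ */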
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse hyperbolic tangent.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_atanh(float in);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse hyperbolic tangent.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_atanh(float2 in);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse hyperbolic tangent.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_atanh(float3 in);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse hyperbolic tangent.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_atanh(float4 in);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse tangent divided by PI.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_atanpi(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse tangent divided by PI.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_atanpi(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse tangent divided by PI.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_atanpi(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse tangent divided by PI.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_atanpi(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the cube root.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_cbrt(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the cube root.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_cbrt(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the cube root.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_cbrt(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the cube root.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_cbrt(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the cosine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_cos(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the cosine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_cos(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the cosine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_cos(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the cosine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_cos(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the hyperbolic cosine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_cosh(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the hyperbolic cosine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_cosh(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the hyperbolic cosine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_cosh(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the hyperbolic cosine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_cosh(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the cosine of the value * PI.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_cospi(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the cosine of the value * PI.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_cospi(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the cosine of the value * PI.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_cospi(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the cosine of the value * PI.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_cospi(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Compute the approximate distance between two points.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_distance(float lhs, float rhs);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Compute the approximate distance between two points.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_distance(float2 lhs, float2 rhs);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Compute the approximate distance between two points.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_distance(float3 lhs, float3 rhs);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Compute the approximate distance between two points.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_distance(float4 lhs, float4 rhs);
+#endif
+
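+/*
+ * Illustrative usage sketch (editorial addition): native_distance trades
+ * precision for speed relative to distance(), which can be enough for
+ * coarse tests such as the hypothetical radius check below.
+ *
+ *   static int within_radius(float3 a, float3 b, float radius) {
+ *       return native_distance(a, b) <= radius;
+ *   }
+ */
+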
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Compute the approximate division result of two values.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_divide(float lhs, float rhs);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Compute the approximate division result of two values.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_divide(float2 lhs, float2 rhs);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Compute the approximate division result of two values.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_divide(float3 lhs, float3 rhs);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Compute the approximate division result of two values.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_divide(float4 lhs, float4 rhs);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+/*
+ * Fast approximate exp
+ * valid for inputs -86.f to 86.f
+ * Max 8192 ulps of error
+ *
+ * Supported by API versions 18 and newer.
+ */
+extern float __attribute__((const, overloadable))native_exp(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+/*
+ * Fast approximate exp
+ * valid for inputs -86.f to 86.f
+ * Max 8192 ulps of error
+ *
+ * Supported by API versions 18 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_exp(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+/*
+ * Fast approximate exp
+ * valid for inputs -86.f to 86.f
+ * Max 8192 ulps of error
+ *
+ * Supported by API versions 18 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_exp(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+/*
+ * Fast approximate exp
+ * valid for inputs -86.f to 86.f
+ * Max 8192 ulps of error
+ *
+ * Supported by API versions 18 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_exp(float4 v);
+#endif
+
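+/*
+ * Illustrative usage sketch (editorial addition): the comments above limit
+ * native_exp to inputs in [-86.f, 86.f], so a caller that cannot bound its
+ * input ahead of time might clamp first.  The helper name is hypothetical.
+ *
+ *   static float fast_exp_clamped(float v) {
+ *       return native_exp(clamp(v, -86.f, 86.f));
+ *   }
+ */
+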
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+/*
+ * Fast approximate exp10
+ * valid for inputs -37.f to 37.f
+ * Max 8192 ulps of error
+ *
+ * Supported by API versions 18 and newer.
+ */
+extern float __attribute__((const, overloadable))native_exp10(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+/*
+ * Fast approximate exp10
+ * valid for inputs -37.f to 37.f
+ * Max 8192 ulps of error
+ *
+ * Supported by API versions 18 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_exp10(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+/*
+ * Fast approximate exp10
+ * valid for inputs -37.f to 37.f
+ * Max 8192 ulps of error
+ *
+ * Supported by API versions 18 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_exp10(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+/*
+ * Fast approximate exp10
+ * valid for inputs -37.f to 37.f
+ * Max 8192 ulps of error
+ *
+ * Supported by API versions 18 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_exp10(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+/*
+ * Fast approximate exp2
+ * valid for inputs -125.f to 125.f
+ * Max 8192 ulps of error
+ *
+ * Supported by API versions 18 and newer.
+ */
+extern float __attribute__((const, overloadable))native_exp2(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+/*
+ * Fast approximate exp2
+ * valid for inputs -125.f to 125.f
+ * Max 8192 ulps of error
+ *
+ * Supported by API versions 18 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_exp2(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+/*
+ * Fast approximate exp2
+ * valid for inputs -125.f to 125.f
+ * Max 8192 ulps of error
+ *
+ * Supported by API versions 18 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_exp2(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+/*
+ * Fast approximate exp2
+ * valid for inputs -125.f to 125.f
+ * Max 8192 ulps of error
+ *
+ * Supported by API versions 18 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_exp2(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return (e ^ value) - 1.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_expm1(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return (e ^ value) - 1.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_expm1(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return (e ^ value) - 1.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_expm1(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return (e ^ value) - 1.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_expm1(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return native_sqrt(x*x + y*y)
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_hypot(float x, float y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return native_sqrt(x*x + y*y)
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_hypot(float2 x, float2 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return native_sqrt(x*x + y*y)
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_hypot(float3 x, float3 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return native_sqrt(x*x + y*y)
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_hypot(float4 x, float4 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Compute the approximate length of a vector.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_length(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Compute the approximate length of a vector.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_length(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Compute the approximate length of a vector.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_length(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Compute the approximate length of a vector.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_length(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+/*
+ * Fast approximate log
+ * It is not accurate for values very close to zero.
+ *
+ * Supported by API versions 18 and newer.
+ */
+extern float __attribute__((const, overloadable))native_log(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+/*
+ * Fast approximate log
+ * It is not accurate for values very close to zero.
+ *
+ * Supported by API versions 18 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_log(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+/*
+ * Fast approximate log
+ * It is not accurate for values very close to zero.
+ *
+ * Supported by API versions 18 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_log(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+/*
+ * Fast approximate log
+ * It is not accurate for values very close to zero.
+ *
+ * Supported by API versions 18 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_log(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+/*
+ * Fast approximate log10
+ * It is not accurate for values very close to zero.
+ *
+ * Supported by API versions 18 and newer.
+ */
+extern float __attribute__((const, overloadable))native_log10(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+/*
+ * Fast approximate log10
+ * It is not accurate for values very close to zero.
+ *
+ * Supported by API versions 18 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_log10(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+/*
+ * Fast approximate log10
+ * It is not accurate for values very close to zero.
+ *
+ * Supported by API versions 18 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_log10(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+/*
+ * Fast approximate log10
+ * It is not accurate for values very close to zero.
+ *
+ * Supported by API versions 18 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_log10(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the natural logarithm of (v + 1.0f)
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_log1p(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the natural logarithm of (v + 1.0f)
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_log1p(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the natural logarithm of (v + 1.0f)
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_log1p(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the natural logarithm of (v + 1.0f)
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_log1p(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+/*
+ * Fast approximate log2
+ * It is not accurate for values very close to zero.
+ *
+ * Supported by API versions 18 and newer.
+ */
+extern float __attribute__((const, overloadable))native_log2(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+/*
+ * Fast approximate log2
+ * It is not accurate for values very close to zero.
+ *
+ * Supported by API versions 18 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_log2(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+/*
+ * Fast approximate log2
+ * It is not accurate for values very close to zero.
+ *
+ * Supported by API versions 18 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_log2(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+/*
+ * Fast approximate log2
+ * It is not accurate for values very close to zero.
+ *
+ * Supported by API versions 18 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_log2(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Normalize a vector.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_normalize(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Normalize a vector.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_normalize(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Normalize a vector.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_normalize(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Normalize a vector.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_normalize(float4 v);
+#endif
+
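+/*
+ * Illustrative usage sketch (editorial addition): native_normalize and
+ * native_length are the approximate counterparts of normalize() and
+ * length().  A hypothetical direction helper:
+ *
+ *   static float3 direction(float3 from, float3 to) {
+ *       return native_normalize(to - from);
+ *   }
+ */
+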
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+/*
+ * Fast approximate v ^ y
+ * v must be between 0.f and 256.f
+ * y must be between -15.f and 15.f
+ * It is not accurate for values of v very close to zero.
+ *
+ * Supported by API versions 18 and newer.
+ */
+extern float __attribute__((const, overloadable))native_powr(float v, float y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+/*
+ * Fast approximate v ^ y
+ * v must be between 0.f and 256.f
+ * y must be between -15.f and 15.f
+ * It is not accurate for values of v very close to zero.
+ *
+ * Supported by API versions 18 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_powr(float2 v, float2 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+/*
+ * Fast approximate v ^ y
+ * v must be between 0.f and 256.f
+ * y must be between -15.f and 15.f
+ * It is not accurate for values of v very close to zero.
+ *
+ * Supported by API versions 18 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_powr(float3 v, float3 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+/*
+ * Fast approximate v ^ y
+ * v must be between 0.f and 256.f
+ * y must be between -15.f and 15.f
+ * It is not accurate for values of v very close to zero.
+ *
+ * Supported by API versions 18 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_powr(float4 v, float4 y);
+#endif
+
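+/*
+ * Illustrative usage sketch (editorial addition): the documented ranges of
+ * native_powr (v in [0.f, 256.f], y in [-15.f, 15.f]) comfortably cover
+ * gamma-style adjustments of normalized channel values.  The helper name
+ * is hypothetical.
+ *
+ *   static float apply_gamma(float channel, float gamma) {
+ *       return native_powr(channel, gamma);  // channel assumed in [0, 1]
+ *   }
+ */
+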
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the approximate reciprocal of a value.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_recip(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the approximate reciprocal of a value.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_recip(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the approximate reciprocal of a value.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_recip(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the approximate reciprocal of a value.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_recip(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Compute the Nth root of a value.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_rootn(float v, int n);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Compute the Nth root of a value.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_rootn(float2 v, int2 n);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Compute the Nth root of a value.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_rootn(float3 v, int3 n);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Compute the Nth root of a value.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_rootn(float4 v, int4 n);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return (1 / sqrt(value)).
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_rsqrt(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return (1 / sqrt(value)).
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_rsqrt(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return (1 / sqrt(value)).
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_rsqrt(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return (1 / sqrt(value)).
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_rsqrt(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the sine of a value specified in radians.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_sin(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the sine of a value specified in radians.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_sin(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the sine of a value specified in radians.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_sin(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the sine of a value specified in radians.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_sin(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the sine and cosine of a value.
+ *
+ * @return sine
+ * @param v The incoming value in radians
+ * @param *cosptr cosptr[0] will be set to the cosine value.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((overloadable))native_sincos(float v, float* cosptr);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the sine and cosine of a value.
+ *
+ * @return sine
+ * @param v The incoming value in radians
+ * @param *cosptr cosptr[0] will be set to the cosine value.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((overloadable))native_sincos(float2 v, float2* cosptr);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the sine and cosine of a value.
+ *
+ * @return sine
+ * @param v The incoming value in radians
+ * @param *cosptr cosptr[0] will be set to the cosine value.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((overloadable))native_sincos(float3 v, float3* cosptr);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the sine and cosine of a value.
+ *
+ * @return sine
+ * @param v The incoming value in radians
+ * @param *cosptr cosptr[0] will be set to the cosine value.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((overloadable))native_sincos(float4 v, float4* cosptr);
+#endif
+
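+/*
+ * Illustrative usage sketch (editorial addition): native_sincos returns the
+ * sine and writes the cosine through the pointer argument, avoiding two
+ * separate calls when both are needed.  Names below are hypothetical.
+ *
+ *   static float2 unit_circle_point(float angle) {
+ *       float c;
+ *       float s = native_sincos(angle, &c);
+ *       float2 p = { c, s };
+ *       return p;
+ *   }
+ */
+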
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the hyperbolic sine of a value specified in radians.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_sinh(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the hyperbolic sine of a value specified in radians.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_sinh(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the hyperbolic sine of a value specified in radians.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_sinh(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the hyperbolic sine of a value specified in radians.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_sinh(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return sin(v * PI).
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_sinpi(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return sin(v * PI).
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_sinpi(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return sin(v * PI).
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_sinpi(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return sin(v * PI).
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_sinpi(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the approximate sqrt(v).
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_sqrt(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the approximate sqrt(v).
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_sqrt(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the approximate sqrt(v).
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_sqrt(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the approximate sqrt(v).
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_sqrt(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the tangent of a value.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_tan(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the tangent of a value.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_tan(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the tangent of a value.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_tan(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the tangent of a value.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_tan(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the hyperbolic tangent of a value.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_tanh(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the hyperbolic tangent of a value.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_tanh(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the hyperbolic tangent of a value.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_tanh(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the hyperbolic tangent of a value.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_tanh(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return tan(v * PI)
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_tanpi(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return tan(v * PI)
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_tanpi(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return tan(v * PI)
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_tanpi(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return tan(v * PI)
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_tanpi(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the next floating point number from x towards y.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))nextafter(float x, float y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the next floating point number from x towards y.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))nextafter(float2 x, float2 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the next floating point number from x towards y.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))nextafter(float3 x, float3 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the next floating point number from x towards y.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))nextafter(float4 x, float4 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Normalize a vector.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))normalize(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Normalize a vector.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))normalize(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Normalize a vector.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))normalize(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Normalize a vector.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))normalize(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return x ^ y.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))pow(float x, float y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return x ^ y.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))pow(float2 x, float2 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return x ^ y.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))pow(float3 x, float3 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return x ^ y.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))pow(float4 x, float4 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return x ^ y.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))pown(float x, int y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return x ^ y.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))pown(float2 x, int2 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return x ^ y.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))pown(float3 x, int3 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return x ^ y.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))pown(float4 x, int4 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return x ^ y.
+ * x must be >= 0
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))powr(float x, float y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return x ^ y.
+ * x must be >= 0
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))powr(float2 x, float2 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return x ^ y.
+ * x must be >= 0
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))powr(float3 x, float3 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return x ^ y.
+ * x must be >= 0
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))powr(float4 x, float4 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Convert from degrees to radians.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))radians(float value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Convert from degrees to radians.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))radians(float2 value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Convert from degrees to radians.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))radians(float3 value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Convert from degrees to radians.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))radians(float4 value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Round x/y to the nearest integer, then compute the remainder.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))remainder(float x, float y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Round x/y to the nearest integer, then compute the remainder.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))remainder(float2 x, float2 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Round x/y to the nearest integer, then compute the remainder.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))remainder(float3 x, float3 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Round x/y to the nearest integer, then compute the remainder.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))remainder(float4 x, float4 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the quotient and the remainder of b/c.  Only the sign and lowest three bits of the quotient are guaranteed to be accurate.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((overloadable))remquo(float b, float c, int* d);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the quotient and the remainder of b/c.  Only the sign and lowest three bits of the quotient are guaranteed to be accurate.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((overloadable))remquo(float2 b, float2 c, int2* d);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the quotient and the remainder of b/c.  Only the sign and lowest three bits of the quotient are guaranteed to be accurate.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((overloadable))remquo(float3 b, float3 c, int3* d);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the quotient and the remainder of b/c.  Only the sign and lowest three bits of the quotient are guaranteed to be accurate.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((overloadable))remquo(float4 b, float4 c, int4* d);
+#endif
+
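+/*
+ * Illustrative usage sketch (editorial addition): remquo returns the
+ * remainder and stores the quotient through the int pointer; per the
+ * comment above, only its sign and lowest three bits are guaranteed.
+ * Names and constants below are hypothetical.
+ *
+ *   static float octant_remainder(float angle, int *octant) {
+ *       // For angle >= 0, the low three bits of *octant give the octant.
+ *       return remquo(angle, 0.78539816f, octant);  // 0.78539816f ~ PI/4
+ *   }
+ */
+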
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Round to the nearest integral value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))rint(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Round to the nearest integral value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))rint(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Round to the nearest integral value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))rint(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Round to the nearest integral value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))rint(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Compute the Nth root of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))rootn(float v, int n);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Compute the Nth root of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))rootn(float2 v, int2 n);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Compute the Nth root of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))rootn(float3 v, int3 n);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Compute the Nth root of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))rootn(float4 v, int4 n);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Round to the nearest integral value.  Half values are rounded away from zero.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))round(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Round to the nearest integral value.  Half values are rounded away from zero.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))round(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Round to the nearest integral value.  Half values are rounded away from zero.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))round(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Round to the nearest integral value.  Half values are rounded away from zero.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))round(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return (1 / sqrt(value)).
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))rsqrt(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return (1 / sqrt(value)).
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))rsqrt(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return (1 / sqrt(value)).
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))rsqrt(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return (1 / sqrt(value)).
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))rsqrt(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the sign of a value.
+ *
+ * if (v < 0) return -1.f;
+ * else if (v > 0) return 1.f;
+ * else return 0.f;
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))sign(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the sign of a value.
+ *
+ * if (v < 0) return -1.f;
+ * else if (v > 0) return 1.f;
+ * else return 0.f;
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))sign(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the sign of a value.
+ *
+ * if (v < 0) return -1.f;
+ * else if (v > 0) return 1.f;
+ * else return 0.f;
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))sign(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the sign of a value.
+ *
+ * if (v < 0) return -1.f;
+ * else if (v > 0) return 1.f;
+ * else return 0.f;
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))sign(float4 v);
+#endif
+
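+/*
+ * Illustrative usage sketch (editorial addition): since sign() maps a value
+ * to -1.f, 0.f, or 1.f per the rules above, multiplying by it transfers a
+ * sign without branching (note that sign(0.f) is 0.f, unlike copysign).
+ * The helper name is hypothetical.
+ *
+ *   static float copy_sign_like(float magnitude, float from) {
+ *       return fabs(magnitude) * sign(from);
+ *   }
+ */
+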
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the sine of a value specified in radians.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))sin(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the sine of a value specified in radians.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))sin(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the sine of a value specified in radians.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))sin(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the sine of a value specified in radians.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))sin(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the sine and cosine of a value.
+ *
+ * @return sine
+ * @param v The incoming value in radians
+ * @param *cosptr cosptr[0] will be set to the cosine value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((overloadable))sincos(float v, float* cosptr);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the sine and cosine of a value.
+ *
+ * @return sine
+ * @param v The incoming value in radians
+ * @param *cosptr cosptr[0] will be set to the cosine value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((overloadable))sincos(float2 v, float2* cosptr);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the sine and cosine of a value.
+ *
+ * @return sine
+ * @param v The incoming value in radians
+ * @param *cosptr cosptr[0] will be set to the cosine value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((overloadable))sincos(float3 v, float3* cosptr);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the sine and cosine of a value.
+ *
+ * @return sine
+ * @param v The incoming value in radians
+ * @param *cosptr cosptr[0] will be set to the cosine value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((overloadable))sincos(float4 v, float4* cosptr);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the hyperbolic sine of a value specified in radians.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))sinh(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the hyperbolic sine of a value specified in radians.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))sinh(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the hyperbolic sine of a value specified in radians.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))sinh(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the hyperbolic sine of a value specified in radians.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))sinh(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return sin(v * PI).
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))sinpi(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return sin(v * PI).
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))sinpi(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return sin(v * PI).
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))sinpi(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return sin(v * PI).
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))sinpi(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the square root of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))sqrt(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the square root of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))sqrt(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the square root of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))sqrt(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the square root of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))sqrt(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * if (v < edge)
+ * return 0.f;
+ * else
+ * return 1.f;
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))step(float edge, float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * if (v < edge)
+ * return 0.f;
+ * else
+ * return 1.f;
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))step(float2 edge, float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * if (v < edge)
+ * return 0.f;
+ * else
+ * return 1.f;
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))step(float3 edge, float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * if (v < edge)
+ * return 0.f;
+ * else
+ * return 1.f;
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))step(float4 edge, float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * if (v < edge)
+ * return 0.f;
+ * else
+ * return 1.f;
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))step(float2 edge, float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * if (v < edge)
+ * return 0.f;
+ * else
+ * return 1.f;
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))step(float3 edge, float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * if (v < edge)
+ * return 0.f;
+ * else
+ * return 1.f;
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))step(float4 edge, float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * if (v < edge)
+ * return 0.f;
+ * else
+ * return 1.f;
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))step(float edge, float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * if (v < edge)
+ * return 0.f;
+ * else
+ * return 1.f;
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))step(float edge, float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * if (v < edge)
+ * return 0.f;
+ * else
+ * return 1.f;
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))step(float edge, float4 v);
+#endif
+
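+/*
+ * Illustrative usage sketch (editorial addition): step() returns 0.f below
+ * the edge and 1.f otherwise, and the mixed scalar/vector overloads above
+ * let one threshold gate every component.  The helper name is hypothetical.
+ *
+ *   static float4 threshold_mask(float4 v, float edge) {
+ *       return step(edge, v);  // per-component 0.f or 1.f mask
+ *   }
+ */
+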
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the tangent of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))tan(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the tangent of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))tan(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the tangent of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))tan(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the tangent of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))tan(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the hyperbolic tangent of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))tanh(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the hyperbolic tangent of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))tanh(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the hyperbolic tangent of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))tanh(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the hyperbolic tangent of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))tanh(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return tan(v * PI)
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))tanpi(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return tan(v * PI)
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))tanpi(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return tan(v * PI)
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))tanpi(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return tan(v * PI)
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))tanpi(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Compute the gamma function of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))tgamma(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Compute the gamma function of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))tgamma(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Compute the gamma function of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))tgamma(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Compute the gamma function of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))tgamma(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Round to integral using truncation.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))trunc(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Round to integral using truncation.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))trunc(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Round to integral using truncation.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))trunc(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Round to integral using truncation.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))trunc(float4);
+#endif
+
+#endif // __rs_core_math_rsh__
diff --git a/21.1.2/include/rs_debug.rsh b/21.1.2/include/rs_debug.rsh
new file mode 100644
index 0000000..7a13c9d
--- /dev/null
+++ b/21.1.2/include/rs_debug.rsh
@@ -0,0 +1,267 @@
+/*
+ * Copyright (C) 2011 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/** @file rs_debug.rsh
+ *  \brief Utility debugging routines
+ *
+ *  Routines intended to be used during application development.  These should
+ *  not be used in shipping applications.  All print a string and value pair to
+ *  the standard log.
+ *
+ */
+
+#ifndef __RS_DEBUG_RSH__
+#define __RS_DEBUG_RSH__
+
+
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, float);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, float, float);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, float, float, float);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, float, float, float, float);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, float2);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, float3);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, float4);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, double);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, const rs_matrix4x4 *);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, const rs_matrix3x3 *);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, const rs_matrix2x2 *);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, int);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, uint);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, long);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, unsigned long);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, long long);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, unsigned long long);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, const void *);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, char);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, char2);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, char3);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, char4);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, unsigned char);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, uchar2);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, uchar3);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, uchar4);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, short);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, short2);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, short3);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, short4);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, unsigned short);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, ushort2);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, ushort3);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, ushort4);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, int2);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, int3);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, int4);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, uint2);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, uint3);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, uint4);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, long2);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, long3);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, long4);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, ulong2);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, ulong3);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, ulong4);
+#endif  // (defined(RS_VERSION) && (RS_VERSION >= 17))
+
+#define RS_DEBUG(a) rsDebug(#a, a)
+#define RS_DEBUG_MARKER rsDebug(__FILE__, __LINE__)
+
+#endif
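A minimal usage sketch for the logging declarations above; the function and variable names are illustrative only:

    // Illustrative only: tags each value with a label in the standard log.
    static void dump_state(float gain, float2 cursor) {
        rsDebug("gain", gain);        // scalar overload
        rsDebug("cursor", cursor);    // vector overload prints both components
        RS_DEBUG(gain);               // macro expands to rsDebug("gain", gain)
    }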
diff --git a/21.1.2/include/rs_element.rsh b/21.1.2/include/rs_element.rsh
new file mode 100644
index 0000000..0230f10
--- /dev/null
+++ b/21.1.2/include/rs_element.rsh
@@ -0,0 +1,143 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/** @file rs_element.rsh
+ *  \brief Element routines
+ *
+ *
+ */
+
+#ifndef __RS_ELEMENT_RSH__
+#define __RS_ELEMENT_RSH__
+
+// New APIs
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+
+/**
+ * Elements could be simple, such as an int or a float, or a
+ * structure with multiple sub-elements, such as a collection of
+ * floats, float2, float4. This function returns zero for simple
+ * elements or the number of sub-elements otherwise.
+ *
+ * @param e element to get data from
+ * @return number of sub-elements in this element
+ */
+extern uint32_t __attribute__((overloadable))
+    rsElementGetSubElementCount(rs_element e);
+
+/**
+ * For complex elements, this function will return the
+ * sub-element at index
+ *
+ * @param e element to get data from
+ * @param index index of the sub-element to return
+ * @return sub-element in this element at given index
+ */
+extern rs_element __attribute__((overloadable))
+    rsElementGetSubElement(rs_element, uint32_t index);
+
+/**
+ * For complex elements, this function will return the length of
+ * sub-element name at index
+ *
+ * @param e element to get data from
+ * @param index index of the sub-element to return
+ * @return length of the sub-element name including the null
+ *         terminator (size of buffer needed to write the name)
+ */
+extern uint32_t __attribute__((overloadable))
+    rsElementGetSubElementNameLength(rs_element e, uint32_t index);
+
+/**
+ * For complex elements, this function will return the
+ * sub-element name at index
+ *
+ * @param e element to get data from
+ * @param index index of the sub-element
+ * @param name array to store the name into
+ * @param nameLength length of the provided name array
+ * @return number of characters actually written, excluding the
+ *         null terminator
+ */
+extern uint32_t __attribute__((overloadable))
+    rsElementGetSubElementName(rs_element e, uint32_t index, char *name, uint32_t nameLength);
+
+/**
+ * For complex elements, some sub-elements could be statically
+ * sized arrays. This function will return the array size for
+ * sub-element at index
+ *
+ * @param e element to get data from
+ * @param index index of the sub-element
+ * @return array size of sub-element in this element at given
+ *         index
+ */
+extern uint32_t __attribute__((overloadable))
+    rsElementGetSubElementArraySize(rs_element e, uint32_t index);
+
+/**
+ * This function specifies the location of a sub-element within
+ * the element
+ *
+ * @param e element to get data from
+ * @param index index of the sub-element
+ * @return offset in bytes of sub-element in this element at
+ *         given index
+ */
+extern uint32_t __attribute__((overloadable))
+    rsElementGetSubElementOffsetBytes(rs_element e, uint32_t index);
+
+/**
+ * Returns the size of element in bytes
+ *
+ * @param e element to get data from
+ * @return total size of the element in bytes
+ */
+extern uint32_t __attribute__((overloadable))
+    rsElementGetBytesSize(rs_element e);
+
+/**
+ * Returns the element's data type
+ *
+ * @param e element to get data from
+ * @return element's data type
+ */
+extern rs_data_type __attribute__((overloadable))
+    rsElementGetDataType(rs_element e);
+
+/**
+ * Returns the element's data kind
+ *
+ * @param e element to get data from
+ * @return element's data kind
+ */
+extern rs_data_kind __attribute__((overloadable))
+    rsElementGetDataKind(rs_element e);
+
+/**
+ * Returns the element's vector size
+ *
+ * @param e element to get data from
+ * @return length of the element vector (for float2, float3,
+ *         etc.)
+ */
+extern uint32_t __attribute__((overloadable))
+    rsElementGetVectorSize(rs_element e);
+
+#endif // (defined(RS_VERSION) && (RS_VERSION >= 16))
+
+#endif // __RS_ELEMENT_RSH__
+
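A hedged sketch of how the sub-element queries above compose (API 16+ assumed; the helper name is hypothetical):

    // Walk the sub-elements of a struct element and log their layout.
    static void dump_element(rs_element e) {
        uint32_t count = rsElementGetSubElementCount(e);
        for (uint32_t i = 0; i < count; i++) {
            rs_element sub = rsElementGetSubElement(e, i);
            rsDebug("sub-element size (bytes)", rsElementGetBytesSize(sub));
            rsDebug("sub-element vector size", rsElementGetVectorSize(sub));
            rsDebug("offset in parent (bytes)", rsElementGetSubElementOffsetBytes(e, i));
        }
    }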
diff --git a/21.1.2/include/rs_graphics.rsh b/21.1.2/include/rs_graphics.rsh
new file mode 100644
index 0000000..1fcb7ed
--- /dev/null
+++ b/21.1.2/include/rs_graphics.rsh
@@ -0,0 +1,426 @@
+/*
+ * Copyright (C) 2011-2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/** @file rs_graphics.rsh
+ *  \brief RenderScript graphics API
+ *
+ *  A set of graphics functions used by RenderScript.
+ *
+ */
+#ifndef __RS_GRAPHICS_RSH__
+#define __RS_GRAPHICS_RSH__
+
+#ifdef __LP64__
+//#error "RenderScript graphics is deprecated and not supported in 64bit mode."
+#else
+
+#include "rs_mesh.rsh"
+#include "rs_program.rsh"
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+/**
+ * Set the color target used for all subsequent rendering calls
+ * @param colorTarget
+ * @param slot
+ */
+extern void __attribute__((overloadable))
+    rsgBindColorTarget(rs_allocation colorTarget, uint slot);
+
+/**
+ * Clear the previously set color target
+ * @param slot
+ */
+extern void __attribute__((overloadable))
+    rsgClearColorTarget(uint slot);
+
+/**
+ * Set the depth target used for all subsequent rendering calls
+ * @param depthTarget
+ */
+extern void __attribute__((overloadable))
+    rsgBindDepthTarget(rs_allocation depthTarget);
+
+/**
+ * Clear the previously set depth target
+ */
+extern void __attribute__((overloadable))
+    rsgClearDepthTarget(void);
+
+/**
+ * Clear all color and depth targets and resume rendering into
+ * the framebuffer
+ */
+extern void __attribute__((overloadable))
+    rsgClearAllRenderTargets(void);
+
+/**
+ * Force RenderScript to finish all rendering commands
+ */
+extern uint __attribute__((overloadable))
+    rsgFinish(void);
+
+#endif //defined(RS_VERSION) && (RS_VERSION >= 14)
+
+/**
+ * Bind a new ProgramFragment to the rendering context.
+ *
+ * @param pf
+ */
+extern void __attribute__((overloadable))
+    rsgBindProgramFragment(rs_program_fragment pf);
+
+/**
+ * Bind a new ProgramStore to the rendering context.
+ *
+ * @param ps
+ */
+extern void __attribute__((overloadable))
+    rsgBindProgramStore(rs_program_store ps);
+
+/**
+ * Bind a new ProgramVertex to the rendering context.
+ *
+ * @param pv
+ */
+extern void __attribute__((overloadable))
+    rsgBindProgramVertex(rs_program_vertex pv);
+
+/**
+ * Bind a new ProgramRaster to the rendering context.
+ *
+ * @param pr
+ */
+extern void __attribute__((overloadable))
+    rsgBindProgramRaster(rs_program_raster pr);
+
+/**
+ * Bind a new Sampler object to a ProgramFragment.  The sampler will
+ * operate on the texture bound at the matching slot.
+ *
+ * @param slot
+ */
+extern void __attribute__((overloadable))
+    rsgBindSampler(rs_program_fragment, uint slot, rs_sampler);
+
+/**
+ * Bind a new Allocation object to a ProgramFragment.  The
+ * Allocation must be a valid texture for the Program.  The sampling
+ * of the texture will be controlled by the Sampler bound at the
+ * matching slot.
+ *
+ * @param slot
+ */
+extern void __attribute__((overloadable))
+    rsgBindTexture(rs_program_fragment, uint slot, rs_allocation);
+
+/**
+ * Load the projection matrix for a currently bound fixed function
+ * vertex program. Calling this function with a custom vertex shader
+ * would result in an error.
+ * @param proj projection matrix
+ */
+extern void __attribute__((overloadable))
+    rsgProgramVertexLoadProjectionMatrix(const rs_matrix4x4 *proj);
+/**
+ * Load the model matrix for a currently bound fixed function
+ * vertex program. Calling this function with a custom vertex shader
+ * would result in an error.
+ * @param model model matrix
+ */
+extern void __attribute__((overloadable))
+    rsgProgramVertexLoadModelMatrix(const rs_matrix4x4 *model);
+/**
+ * Load the texture matrix for a currently bound fixed function
+ * vertex program. Calling this function with a custom vertex shader
+ * would result in an error.
+ * @param tex texture matrix
+ */
+extern void __attribute__((overloadable))
+    rsgProgramVertexLoadTextureMatrix(const rs_matrix4x4 *tex);
+/**
+ * Get the projection matrix for a currently bound fixed function
+ * vertex program. Calling this function with a custom vertex shader
+ * would result in an error.
+ * @param proj matrix to store the current projection matrix into
+ */
+extern void __attribute__((overloadable))
+    rsgProgramVertexGetProjectionMatrix(rs_matrix4x4 *proj);
+
+/**
+ * Set the constant color for a fixed function emulation program.
+ *
+ * @param pf
+ * @param r
+ * @param g
+ * @param b
+ * @param a
+ */
+extern void __attribute__((overloadable))
+    rsgProgramFragmentConstantColor(rs_program_fragment pf, float r, float g, float b, float a);
+
+/**
+ * Bind a new Allocation object to a ProgramFragment.  The
+ * Allocation must be a valid constant input for the Program.
+ *
+ * @param ps program object
+ * @param slot index of the constant buffer on the program
+ * @param c constants to bind
+ */
+extern void __attribute__((overloadable))
+    rsgBindConstant(rs_program_fragment ps, uint slot, rs_allocation c);
+
+/**
+ * Bind a new Allocation object to a ProgramVertex.  The
+ * Allocation must be a valid constant input for the Program.
+ *
+ * @param pv program object
+ * @param slot index of the constant buffer on the program
+ * @param c constants to bind
+ */
+extern void __attribute__((overloadable))
+    rsgBindConstant(rs_program_vertex pv, uint slot, rs_allocation c);
+
+/**
+ * Get the width of the current rendering surface.
+ *
+ * @return uint
+ */
+extern uint __attribute__((overloadable))
+    rsgGetWidth(void);
+
+/**
+ * Get the height of the current rendering surface.
+ *
+ * @return uint
+ */
+extern uint __attribute__((overloadable))
+    rsgGetHeight(void);
+
+
+/**
+ * Sync the contents of an allocation from its SCRIPT memory space to its HW
+ * memory spaces.
+ *
+ * @param alloc
+ */
+extern void __attribute__((overloadable))
+    rsgAllocationSyncAll(rs_allocation alloc);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+
+/**
+ * Sync the contents of an allocation from memory space
+ * specified by source.
+ *
+ * @param alloc
+ * @param source
+ */
+extern void __attribute__((overloadable))
+    rsgAllocationSyncAll(rs_allocation alloc,
+                         rs_allocation_usage_type source);
+
+#endif //defined(RS_VERSION) && (RS_VERSION >= 14)
+
+/**
+ * Low performance utility function for drawing a simple rectangle.  Not
+ * intended for drawing large quantities of geometry.
+ *
+ * @param x1
+ * @param y1
+ * @param x2
+ * @param y2
+ * @param z
+ */
+extern void __attribute__((overloadable))
+    rsgDrawRect(float x1, float y1, float x2, float y2, float z);
+
+/**
+ * Low performance utility function for drawing a simple quad.  Not intended for
+ * drawing large quantities of geometry.
+ *
+ * @param x1
+ * @param y1
+ * @param z1
+ * @param x2
+ * @param y2
+ * @param z2
+ * @param x3
+ * @param y3
+ * @param z3
+ * @param x4
+ * @param y4
+ * @param z4
+ */
+extern void __attribute__((overloadable))
+    rsgDrawQuad(float x1, float y1, float z1,
+                float x2, float y2, float z2,
+                float x3, float y3, float z3,
+                float x4, float y4, float z4);
+
+
+/**
+ * Low performance utility function for drawing a textured quad.  Not intended
+ * for drawing large quantities of geometry.
+ *
+ * @param x1
+ * @param y1
+ * @param z1
+ * @param u1
+ * @param v1
+ * @param x2
+ * @param y2
+ * @param z2
+ * @param u2
+ * @param v2
+ * @param x3
+ * @param y3
+ * @param z3
+ * @param u3
+ * @param v3
+ * @param x4
+ * @param y4
+ * @param z4
+ * @param u4
+ * @param v4
+ */
+extern void __attribute__((overloadable))
+    rsgDrawQuadTexCoords(float x1, float y1, float z1, float u1, float v1,
+                         float x2, float y2, float z2, float u2, float v2,
+                         float x3, float y3, float z3, float u3, float v3,
+                         float x4, float y4, float z4, float u4, float v4);
+
+
+/**
+ * Low performance function for drawing rectangles in screenspace.  This
+ * function uses the default passthrough ProgramVertex.  Any bound ProgramVertex
+ * is ignored.  This function has considerable overhead and should not be used
+ * for drawing in shipping applications.
+ *
+ * @param x
+ * @param y
+ * @param z
+ * @param w
+ * @param h
+ */
+extern void __attribute__((overloadable))
+    rsgDrawSpriteScreenspace(float x, float y, float z, float w, float h);
+
+extern void __attribute__((overloadable))
+    rsgDrawPath(rs_path p);
+
+/**
+ * Draw a mesh using the current context state.  The whole mesh is
+ * rendered.
+ *
+ * @param ism
+ */
+extern void __attribute__((overloadable))
+    rsgDrawMesh(rs_mesh ism);
+/**
+ * Draw part of a mesh using the current context state.
+ * @param ism mesh object to render
+ * @param primitiveIndex for meshes that contain multiple primitive groups
+ *        this parameter specifies the index of the group to draw.
+ */
+extern void __attribute__((overloadable))
+    rsgDrawMesh(rs_mesh ism, uint primitiveIndex);
+/**
+ * Draw specified index range of part of a mesh using the current context state.
+ * @param ism mesh object to render
+ * @param primitiveIndex for meshes that contain multiple primitive groups
+ *        this parameter specifies the index of the group to draw.
+ * @param start starting index in the range
+ * @param len number of indices to draw
+ */
+extern void __attribute__((overloadable))
+    rsgDrawMesh(rs_mesh ism, uint primitiveIndex, uint start, uint len);
+
+/**
+ * Clears the rendering surface to the specified color.
+ *
+ * @param r
+ * @param g
+ * @param b
+ * @param a
+ */
+extern void __attribute__((overloadable))
+    rsgClearColor(float r, float g, float b, float a);
+
+/**
+ * Clears the depth surface to the specified value.
+ */
+extern void __attribute__((overloadable))
+    rsgClearDepth(float value);
+/**
+ * Draws text given a string and location
+ */
+extern void __attribute__((overloadable))
+    rsgDrawText(const char *, int x, int y);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable))
+    rsgDrawText(rs_allocation, int x, int y);
+/**
+ * Binds the font object to be used for all subsequent font rendering calls
+ * @param font object to bind
+ */
+extern void __attribute__((overloadable))
+    rsgBindFont(rs_font font);
+/**
+ * Sets the font color for all subsequent rendering calls
+ * @param r red component
+ * @param g green component
+ * @param b blue component
+ * @param a alpha component
+ */
+extern void __attribute__((overloadable))
+    rsgFontColor(float r, float g, float b, float a);
+/**
+ * Returns the bounding box of the text relative to (0, 0)
+ * Any of left, right, top, bottom could be NULL
+ */
+extern void __attribute__((overloadable))
+    rsgMeasureText(const char *, int *left, int *right, int *top, int *bottom);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable))
+    rsgMeasureText(rs_allocation, int *left, int *right, int *top, int *bottom);
+/**
+ * Computes an axis aligned bounding box of a mesh object
+ */
+extern void __attribute__((overloadable))
+    rsgMeshComputeBoundingBox(rs_mesh mesh, float *minX, float *minY, float *minZ,
+                                                float *maxX, float *maxY, float *maxZ);
+/**
+ * \overload
+ */
+__inline__ static void __attribute__((overloadable, always_inline))
+rsgMeshComputeBoundingBox(rs_mesh mesh, float3 *bBoxMin, float3 *bBoxMax) {
+    float x1, y1, z1, x2, y2, z2;
+    rsgMeshComputeBoundingBox(mesh, &x1, &y1, &z1, &x2, &y2, &z2);
+    bBoxMin->x = x1;
+    bBoxMin->y = y1;
+    bBoxMin->z = z1;
+    bBoxMax->x = x2;
+    bBoxMax->y = y2;
+    bBoxMax->z = z2;
+}
+
+#endif //__LP64__
+#endif
+
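To show how these graphics calls fit together, a minimal root() sketch under the usual RenderScript graphics model; gProgFragment and gFont are hypothetical script globals, not part of this header:

    // Illustrative frame: clear, draw a full-screen rect, then some text.
    int root(void) {
        rsgClearColor(0.f, 0.f, 0.f, 1.f);
        rsgClearDepth(1.f);
        rsgBindProgramFragment(gProgFragment);
        rsgDrawRect(0.f, 0.f, rsgGetWidth(), rsgGetHeight(), 0.f);
        rsgBindFont(gFont);
        rsgFontColor(1.f, 1.f, 1.f, 1.f);
        rsgDrawText("frame", 16, 32);
        return 16;   // ms until the next redraw
    }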
diff --git a/21.1.2/include/rs_math.rsh b/21.1.2/include/rs_math.rsh
new file mode 100644
index 0000000..4d3124c
--- /dev/null
+++ b/21.1.2/include/rs_math.rsh
@@ -0,0 +1,251 @@
+/*
+ * Copyright (C) 2011 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/** @file rs_math.rsh
+ *  \brief todo-jsams
+ *
+ *  todo-jsams
+ *
+ */
+
+#ifndef __RS_MATH_RSH__
+#define __RS_MATH_RSH__
+
+
+/**
+ * Return a random value between 0 (or min_value) and max_value.
+ */
+extern int __attribute__((overloadable))
+    rsRand(int max_value);
+/**
+ * \overload
+ */
+extern int __attribute__((overloadable))
+    rsRand(int min_value, int max_value);
+/**
+ * \overload
+ */
+extern float __attribute__((overloadable))
+    rsRand(float max_value);
+/**
+ * \overload
+ */
+extern float __attribute__((overloadable))
+    rsRand(float min_value, float max_value);
+
+/**
+ * Returns the fractional part of a float
+ */
+extern float __attribute__((const, overloadable))
+    rsFrac(float);
+
+
+/////////////////////////////////////////////////////
+// int ops
+/////////////////////////////////////////////////////
+
+/**
+ * Clamp the value amount between low and high.
+ *
+ * @param amount  The value to clamp
+ * @param low
+ * @param high
+ */
+_RS_RUNTIME uint __attribute__((const, overloadable, always_inline)) rsClamp(uint amount, uint low, uint high);
+
+/**
+ * \overload
+ */
+_RS_RUNTIME int __attribute__((const, overloadable, always_inline)) rsClamp(int amount, int low, int high);
+/**
+ * \overload
+ */
+_RS_RUNTIME ushort __attribute__((const, overloadable, always_inline)) rsClamp(ushort amount, ushort low, ushort high);
+/**
+ * \overload
+ */
+_RS_RUNTIME short __attribute__((const, overloadable, always_inline)) rsClamp(short amount, short low, short high);
+/**
+ * \overload
+ */
+_RS_RUNTIME uchar __attribute__((const, overloadable, always_inline)) rsClamp(uchar amount, uchar low, uchar high);
+/**
+ * \overload
+ */
+_RS_RUNTIME char __attribute__((const, overloadable, always_inline)) rsClamp(char amount, char low, char high);
+
+
+/**
+ * Computes 6 frustum planes from the view projection matrix
+ * @param viewProj matrix to extract planes from
+ * @param left plane
+ * @param right plane
+ * @param top plane
+ * @param bottom plane
+ * @param near plane
+ * @param far plane
+ */
+__inline__ static void __attribute__((overloadable, always_inline))
+rsExtractFrustumPlanes(const rs_matrix4x4 *viewProj,
+                         float4 *left, float4 *right,
+                         float4 *top, float4 *bottom,
+                         float4 *near, float4 *far) {
+    // x y z w = a b c d in the plane equation
+    left->x = viewProj->m[3] + viewProj->m[0];
+    left->y = viewProj->m[7] + viewProj->m[4];
+    left->z = viewProj->m[11] + viewProj->m[8];
+    left->w = viewProj->m[15] + viewProj->m[12];
+
+    right->x = viewProj->m[3] - viewProj->m[0];
+    right->y = viewProj->m[7] - viewProj->m[4];
+    right->z = viewProj->m[11] - viewProj->m[8];
+    right->w = viewProj->m[15] - viewProj->m[12];
+
+    top->x = viewProj->m[3] - viewProj->m[1];
+    top->y = viewProj->m[7] - viewProj->m[5];
+    top->z = viewProj->m[11] - viewProj->m[9];
+    top->w = viewProj->m[15] - viewProj->m[13];
+
+    bottom->x = viewProj->m[3] + viewProj->m[1];
+    bottom->y = viewProj->m[7] + viewProj->m[5];
+    bottom->z = viewProj->m[11] + viewProj->m[9];
+    bottom->w = viewProj->m[15] + viewProj->m[13];
+
+    near->x = viewProj->m[3] + viewProj->m[2];
+    near->y = viewProj->m[7] + viewProj->m[6];
+    near->z = viewProj->m[11] + viewProj->m[10];
+    near->w = viewProj->m[15] + viewProj->m[14];
+
+    far->x = viewProj->m[3] - viewProj->m[2];
+    far->y = viewProj->m[7] - viewProj->m[6];
+    far->z = viewProj->m[11] - viewProj->m[10];
+    far->w = viewProj->m[15] - viewProj->m[14];
+
+    float len = length(left->xyz);
+    *left /= len;
+    len = length(right->xyz);
+    *right /= len;
+    len = length(top->xyz);
+    *top /= len;
+    len = length(bottom->xyz);
+    *bottom /= len;
+    len = length(near->xyz);
+    *near /= len;
+    len = length(far->xyz);
+    *far /= len;
+}
+
+/**
+ * Checks if a sphere is within the 6 frustum planes
+ * @param sphere float4 representing the sphere
+ * @param left plane
+ * @param right plane
+ * @param top plane
+ * @param bottom plane
+ * @param near plane
+ * @param far plane
+ */
+__inline__ static bool __attribute__((overloadable, always_inline))
+rsIsSphereInFrustum(float4 *sphere,
+                      float4 *left, float4 *right,
+                      float4 *top, float4 *bottom,
+                      float4 *near, float4 *far) {
+
+    float distToCenter = dot(left->xyz, sphere->xyz) + left->w;
+    if (distToCenter < -sphere->w) {
+        return false;
+    }
+    distToCenter = dot(right->xyz, sphere->xyz) + right->w;
+    if (distToCenter < -sphere->w) {
+        return false;
+    }
+    distToCenter = dot(top->xyz, sphere->xyz) + top->w;
+    if (distToCenter < -sphere->w) {
+        return false;
+    }
+    distToCenter = dot(bottom->xyz, sphere->xyz) + bottom->w;
+    if (distToCenter < -sphere->w) {
+        return false;
+    }
+    distToCenter = dot(near->xyz, sphere->xyz) + near->w;
+    if (distToCenter < -sphere->w) {
+        return false;
+    }
+    distToCenter = dot(far->xyz, sphere->xyz) + far->w;
+    if (distToCenter < -sphere->w) {
+        return false;
+    }
+    return true;
+}
+
+
+/**
+ * Pack floating point (0-1) RGB values into a uchar4.  The alpha component is
+ * set to 255 (1.0).
+ *
+ * @param r
+ * @param g
+ * @param b
+ *
+ * @return uchar4
+ */
+_RS_RUNTIME uchar4 __attribute__((const, overloadable)) rsPackColorTo8888(float r, float g, float b);
+
+/**
+ * Pack floating point (0-1) RGBA values into a uchar4.
+ *
+ * @param r
+ * @param g
+ * @param b
+ * @param a
+ *
+ * @return uchar4
+ */
+_RS_RUNTIME uchar4 __attribute__((const, overloadable)) rsPackColorTo8888(float r, float g, float b, float a);
+
+/**
+ * Pack floating point (0-1) RGB values into a uchar4.  The alpha component is
+ * set to 255 (1.0).
+ *
+ * @param color
+ *
+ * @return uchar4
+ */
+_RS_RUNTIME uchar4 __attribute__((const, overloadable)) rsPackColorTo8888(float3 color);
+
+/**
+ * Pack floating point (0-1) RGBA values into a uchar4.
+ *
+ * @param color
+ *
+ * @return uchar4
+ */
+_RS_RUNTIME uchar4 __attribute__((const, overloadable)) rsPackColorTo8888(float4 color);
+
+/**
+ * Unpack a uchar4 color to float4.  The resulting float range will be (0-1).
+ *
+ * @param c
+ *
+ * @return float4
+ */
+_RS_RUNTIME float4 __attribute__((const)) rsUnpackColor8888(uchar4 c);
+
+_RS_RUNTIME uchar4 __attribute__((const, overloadable)) rsYuvToRGBA_uchar4(uchar y, uchar u, uchar v);
+_RS_RUNTIME float4 __attribute__((const, overloadable)) rsYuvToRGBA_float4(uchar y, uchar u, uchar v);
+
+
+#endif
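The frustum helpers above are designed to be used as a pair; a hedged sketch, where gViewProj is a hypothetical rs_matrix4x4 script global:

    // Cull a bounding sphere (xyz = center, w = radius) against the view-projection.
    static bool sphere_visible(float4 sphere) {
        float4 left, right, top, bottom, near, far;
        rsExtractFrustumPlanes(&gViewProj, &left, &right, &top, &bottom, &near, &far);
        return rsIsSphereInFrustum(&sphere, &left, &right, &top, &bottom, &near, &far);
    }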
diff --git a/21.1.2/include/rs_matrix.rsh b/21.1.2/include/rs_matrix.rsh
new file mode 100644
index 0000000..34b9532
--- /dev/null
+++ b/21.1.2/include/rs_matrix.rsh
@@ -0,0 +1,532 @@
+/*
+ * Copyright (C) 2011 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/** @file rs_matrix.rsh
+ *  \brief Matrix functions.
+ *
+ * These functions let you manipulate square matrices of rank 2x2, 3x3, and 4x4.
+ * They are particularly useful for graphical transformations and are
+ * compatible with OpenGL.
+ *
+ * A few general notes:
+ *
+ * \li We use a zero-based index for rows and columns.  E.g. the last element of
+ * a \ref rs_matrix4x4 is found at (3, 3).
+ *
+ * \li RenderScript uses column-based vectors.  Transforming a vector is done by
+ * postmultiplying the vector, e.g. <em>(matrix * vector)</em>, as provided by
+ * \ref rsMatrixMultiply.
+ *
+ * \li To create a transformation matrix that performs two transformations at
+ * once, multiply the two source matrices, with the first transformation as the
+ * right argument.  E.g. to create a transformation matrix that applies the
+ * transformation \e s1 followed by \e s2, call
+ * <c>rsMatrixLoadMultiply(&combined, &s2, &s1)</c>.
+ * This derives from <em>s2 * (s1 * v)</em>, which is <em>(s2 * s1) * v</em>.
+ *
+ * \li We have two styles of functions to create transformation matrices:
+ * rsMatrixLoad<em>Transformation</em> and rsMatrix<em>Transformation</em>.  The
+ * former style simply stores the transformation matrix in the first argument.
+ * The latter modifies a pre-existing transformation matrix so that the new
+ * transformation happens first.  E.g. if you call \ref rsMatrixTranslate
+ * on a matrix that already does a scaling, the resulting matrix when applied
+ * to a vector will first do the translation then the scaling.
+ *
+ */
+
+#ifndef __RS_MATRIX_RSH__
+#define __RS_MATRIX_RSH__
+
+/**
+ * Set an element of a matrix.
+ *
+ * @param m The matrix that will be modified.
+ * @param col The zero-based column of the element to be set.
+ * @param row The zero-based row of the element to be set.
+ * @param v The value to set.
+ *
+ * \warning The order of the column and row parameters may be
+ * unexpected.
+ *
+ * @return void
+ */
+_RS_RUNTIME void __attribute__((overloadable))
+rsMatrixSet(rs_matrix4x4 *m, uint32_t col, uint32_t row, float v);
+/**
+ * \overload
+ */
+_RS_RUNTIME void __attribute__((overloadable))
+rsMatrixSet(rs_matrix3x3 *m, uint32_t col, uint32_t row, float v);
+/**
+ * \overload
+ */
+_RS_RUNTIME void __attribute__((overloadable))
+rsMatrixSet(rs_matrix2x2 *m, uint32_t col, uint32_t row, float v);
+
+/**
+ * Returns one element of a matrix.
+ *
+ * @param m The matrix to extract the element from.
+ * @param col The zero-based column of the element to be extracted.
+ * @param row The zero-based row of the element to extracted.
+ *
+ * \warning The order of the column and row parameters may be
+ * unexpected.
+ *
+ * @return float
+ */
+_RS_RUNTIME float __attribute__((overloadable))
+rsMatrixGet(const rs_matrix4x4 *m, uint32_t col, uint32_t row);
+/**
+ * \overload
+ */
+_RS_RUNTIME float __attribute__((overloadable))
+rsMatrixGet(const rs_matrix3x3 *m, uint32_t col, uint32_t row);
+/**
+ * \overload
+ */
+_RS_RUNTIME float __attribute__((overloadable))
+rsMatrixGet(const rs_matrix2x2 *m, uint32_t col, uint32_t row);
+
+/**
+ * Set the elements of a matrix to the identity matrix.
+ *
+ * @param m The matrix to set.
+ */
+extern void __attribute__((overloadable)) rsMatrixLoadIdentity(rs_matrix4x4 *m);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable)) rsMatrixLoadIdentity(rs_matrix3x3 *m);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable)) rsMatrixLoadIdentity(rs_matrix2x2 *m);
+
+/**
+ * Set the elements of a matrix from an array of floats.
+ *
+ * The array of floats should be in row-major order, i.e. the element at
+ * <em>row 0, column 0</em> should be first, followed by the element at
+ * <em>row 0, column 1</em>, etc.
+ *
+ * @param m The matrix to set.
+ * @param v The array of values to set the matrix to. These arrays should be
+ * 4, 9, or 16 floats long, depending on the matrix size.
+ */
+extern void __attribute__((overloadable)) rsMatrixLoad(rs_matrix4x4 *m, const float *v);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable)) rsMatrixLoad(rs_matrix3x3 *m, const float *v);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable)) rsMatrixLoad(rs_matrix2x2 *m, const float *v);
+/**
+ * Set the elements of a matrix from another matrix.
+ *
+ * If the source matrix is smaller than the destination, the rest of the
+ * destination is filled with elements of the identity matrix.  E.g.
+ * loading a rs_matrix2x2 into a rs_matrix4x4 will give:
+ *
+ * \htmlonly<table>
+ * <tr><td>m00</td><td>m01</td><td>0.0</td><td>0.0</td></tr>
+ * <tr><td>m10</td><td>m11</td><td>0.0</td><td>0.0</td></tr>
+ * <tr><td>0.0</td><td>0.0</td><td>1.0</td><td>0.0</td></tr>
+ * <tr><td>0.0</td><td>0.0</td><td>0.0</td><td>1.0</td></tr>
+ * </table>\endhtmlonly
+ *
+ * @param m The matrix to set.
+ * @param v The source matrix.
+ */
+extern void __attribute__((overloadable)) rsMatrixLoad(rs_matrix4x4 *m, const rs_matrix4x4 *v);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable)) rsMatrixLoad(rs_matrix4x4 *m, const rs_matrix3x3 *v);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable)) rsMatrixLoad(rs_matrix4x4 *m, const rs_matrix2x2 *v);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable)) rsMatrixLoad(rs_matrix3x3 *m, const rs_matrix3x3 *v);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable)) rsMatrixLoad(rs_matrix2x2 *m, const rs_matrix2x2 *v);
+
+/**
+ * Load a rotation matrix.
+ *
+ * This function creates a rotation matrix.  The axis of rotation is the
+ * <em>(x, y, z)</em> vector.
+ *
+ * To rotate a vector, multiply the vector by the created matrix
+ * using \ref rsMatrixMultiply.
+ *
+ * See http://en.wikipedia.org/wiki/Rotation_matrix .
+ *
+ * @param m The matrix to set.
+ * @param rot How much rotation to do, in degrees.
+ * @param x The x component of the vector that is the axis of rotation.
+ * @param y The y component of the vector that is the axis of rotation.
+ * @param z The z component of the vector that is the axis of rotation.
+ */
+extern void __attribute__((overloadable))
+rsMatrixLoadRotate(rs_matrix4x4 *m, float rot, float x, float y, float z);
+
+/**
+ * Load a scale matrix.
+ *
+ * This function creates a scaling matrix, where each component of a
+ * vector is multiplied by a number.  This number can be negative.
+ *
+ * To scale a vector, multiply the vector by the created matrix
+ * using \ref rsMatrixMultiply.
+ *
+ * @param m The matrix to set.
+ * @param x The multiple to scale the x components by.
+ * @param y The multiple to scale the y components by.
+ * @param z The multiple to scale the z components by.
+ */
+extern void __attribute__((overloadable))
+rsMatrixLoadScale(rs_matrix4x4 *m, float x, float y, float z);
+
+/**
+ * Load a translation matrix.
+ *
+ * This function creates a translation matrix, where a
+ * number is added to each element of a vector.
+ *
+ * To translate a vector, multiply the vector by the created matrix
+ * using \ref rsMatrixMultiply.
+ *
+ * @param m The matrix to set.
+ * @param x The number to add to each x component.
+ * @param y The number to add to each y component.
+ * @param z The number to add to each z component.
+ */
+extern void __attribute__((overloadable))
+rsMatrixLoadTranslate(rs_matrix4x4 *m, float x, float y, float z);
+
+/**
+ * Multiply two matrices.
+ *
+ * Sets \e m to the matrix product of <em>lhs * rhs</em>.
+ *
+ * To combine two 4x4 transformation matrices, multiply the second transformation matrix
+ * by the first transformation matrix.  E.g. to create a transformation matrix that applies
+ * the transformation \e s1 followed by \e s2, call
+ * <c>rsMatrixLoadMultiply(&combined, &s2, &s1)</c>.
+ *
+ * \warning Prior to version 21, storing the result back into the right matrix is not supported and
+ * will result in undefined behavior.  Use rsMatrixMultiply instead.  E.g. instead of doing
+ * rsMatrixLoadMultiply (&m2r, &m2r, &m2l), use rsMatrixMultiply (&m2r, &m2l).
+ * rsMatrixLoadMultiply (&m2l, &m2r, &m2l) works as expected.
+ *
+ * @param m The matrix to set.
+ * @param lhs The left matrix of the product.
+ * @param rhs The right matrix of the product.
+ */
+extern void __attribute__((overloadable))
+rsMatrixLoadMultiply(rs_matrix4x4 *m, const rs_matrix4x4 *lhs, const rs_matrix4x4 *rhs);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable))
+rsMatrixLoadMultiply(rs_matrix3x3 *m, const rs_matrix3x3 *lhs, const rs_matrix3x3 *rhs);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable))
+rsMatrixLoadMultiply(rs_matrix2x2 *m, const rs_matrix2x2 *lhs, const rs_matrix2x2 *rhs);
+
+/**
+ * Multiply a matrix into another one.
+ *
+ * Sets \e m to the matrix product <em>m * rhs</em>.
+ *
+ * When combining two 4x4 transformation matrices using this function, the resulting
+ * matrix will correspond to performing the \e rhs transformation first followed by
+ * the original \e m transformation.
+ *
+ * @param m The left matrix of the product and the matrix to be set.
+ * @param rhs The right matrix of the product.
+ */
+extern void __attribute__((overloadable))
+rsMatrixMultiply(rs_matrix4x4 *m, const rs_matrix4x4 *rhs);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable))
+rsMatrixMultiply(rs_matrix3x3 *m, const rs_matrix3x3 *rhs);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable))
+rsMatrixMultiply(rs_matrix2x2 *m, const rs_matrix2x2 *rhs);
+
+/**
+ * Multiply the matrix \e m with a rotation matrix.
+ *
+ * This function modifies a transformation matrix to first do a rotation.
+ * The axis of rotation is the <em>(x, y, z)</em> vector.
+ *
+ * To apply this combined transformation to a vector, multiply
+ * the vector by the created matrix using \ref rsMatrixMultiply.
+ *
+ * @param m The matrix to modify.
+ * @param rot How much rotation to do, in degrees.
+ * @param x The x component of the vector that is the axis of rotation.
+ * @param y The y component of the vector that is the axis of rotation.
+ * @param z The z component of the vector that is the axis of rotation.
+ */
+extern void __attribute__((overloadable))
+rsMatrixRotate(rs_matrix4x4 *m, float rot, float x, float y, float z);
+
+/**
+ * Multiply the matrix \e m with a scaling matrix.
+ *
+ * This function modifies a transformation matrix to first do a scaling.
+ * When scaling, each component of a vector is multiplied by a number.
+ * This number can be negative.
+ *
+ * To apply this combined transformation to a vector, multiply
+ * the vector by the created matrix using \ref rsMatrixMultiply.
+ *
+ * @param m The matrix to modify.
+ * @param x The multiple to scale the x components by.
+ * @param y The multiple to scale the y components by.
+ * @param z The multiple to scale the z components by.
+ */
+extern void __attribute__((overloadable))
+rsMatrixScale(rs_matrix4x4 *m, float x, float y, float z);
+
+/**
+ * Multiply the matrix \e m with a translation matrix.
+ *
+ * This function modifies a transformation matrix to first
+ * do a translation.  When translating, a number is added
+ * to each component of a vector.
+ *
+ * To apply this combined transformation to a vector, multiply
+ * the vector by the created matrix using \ref rsMatrixMultiply.
+ *
+ * @param m The matrix to modify.
+ * @param x The number to add to each x component.
+ * @param y The number to add to each y component.
+ * @param z The number to add to each z component.
+ */
+extern void __attribute__((overloadable))
+rsMatrixTranslate(rs_matrix4x4 *m, float x, float y, float z);
+
+/**
+ * Load an orthographic projection matrix.
+ *
+ * Constructs an orthographic projection matrix, transforming the box
+ * identified by the six clipping planes <em>left, right, bottom, top,
+ * near, far</em> into a unit cube with a corner at
+ * <em>(-1, -1, -1)</em> and the opposite at <em>(1, 1, 1)</em>.
+ *
+ * To apply this projection to a vector, multiply the vector by the
+ * created matrix using \ref rsMatrixMultiply.
+ *
+ * See https://en.wikipedia.org/wiki/Orthographic_projection .
+ *
+ * @param m The matrix to set.
+ * @param left
+ * @param right
+ * @param bottom
+ * @param top
+ * @param near
+ * @param far
+ */
+extern void __attribute__((overloadable))
+rsMatrixLoadOrtho(rs_matrix4x4 *m, float left, float right, float bottom, float top, float near, float far);
+
+/**
+ * Load a frustum projection matrix.
+ *
+ * Constructs a frustum projection matrix, transforming the box
+ * identified by the six clipping planes <em>left, right, bottom, top,
+ * near, far</em>.
+ *
+ * To apply this projection to a vector, multiply the vector by the
+ * created matrix using \ref rsMatrixMultiply.
+ *
+ * @param m The matrix to set.
+ * @param left
+ * @param right
+ * @param bottom
+ * @param top
+ * @param near
+ * @param far
+ */
+extern void __attribute__((overloadable))
+rsMatrixLoadFrustum(rs_matrix4x4 *m, float left, float right, float bottom, float top, float near, float far);
+
+/**
+ * Load a perspective projection matrix.
+ *
+ * Constructs a perspective projection matrix, assuming a symmetrical field of view.
+ *
+ * To apply this projection to a vector, multiply the vector by the
+ * created matrix using \ref rsMatrixMultiply.
+ *
+ * @param m The matrix to set.
+ * @param fovy Field of view, in degrees along the Y axis.
+ * @param aspect Ratio of x / y.
+ * @param near The near clipping plane.
+ * @param far The far clipping plane.
+ */
+extern void __attribute__((overloadable))
+rsMatrixLoadPerspective(rs_matrix4x4* m, float fovy, float aspect, float near, float far);
+
+#if !defined(RS_VERSION) || (RS_VERSION < 14)
+/**
+ * Multiply a vector by a matrix.
+ *
+ * Returns the post-multiplication of the vector by the matrix, i.e. <em>m * in</em>.
+ *
+ * When multiplying a \e float3 to a \e rs_matrix4x4, the vector is expanded with (1).
+ *
+ * When multiplying a \e float2 to a \e rs_matrix4x4, the vector is expanded with (0, 1).
+ *
+ * When multiplying a \e float2 to a \e rs_matrix3x3, the vector is expanded with (0).
+ *
+ * This function is available in API version 10-13.  Starting with API 14,
+ * the function takes a const matrix as the first argument.
+ */
+_RS_RUNTIME float4 __attribute__((overloadable))
+rsMatrixMultiply(rs_matrix4x4 *m, float4 in);
+
+/**
+ * \overload
+ */
+_RS_RUNTIME float4 __attribute__((overloadable))
+rsMatrixMultiply(rs_matrix4x4 *m, float3 in);
+
+/**
+ * \overload
+ */
+_RS_RUNTIME float4 __attribute__((overloadable))
+rsMatrixMultiply(rs_matrix4x4 *m, float2 in);
+
+/**
+ * \overload
+ */
+_RS_RUNTIME float3 __attribute__((overloadable))
+rsMatrixMultiply(rs_matrix3x3 *m, float3 in);
+
+/**
+ * \overload
+ */
+_RS_RUNTIME float3 __attribute__((overloadable))
+rsMatrixMultiply(rs_matrix3x3 *m, float2 in);
+
+/**
+ * \overload
+ */
+_RS_RUNTIME float2 __attribute__((overloadable))
+rsMatrixMultiply(rs_matrix2x2 *m, float2 in);
+#else
+/**
+ * Multiply a vector by a matrix.
+ *
+ * Returns the post-multiplication of the vector by the matrix, i.e. <em>m * in</em>.
+ *
+ * When multiplying a \e float3 to a \e rs_matrix4x4, the vector is expanded with (1).
+ *
+ * When multiplying a \e float2 to a \e rs_matrix4x4, the vector is expanded with (0, 1).
+ *
+ * When multiplying a \e float2 to a \e rs_matrix3x3, the vector is expanded with (0).
+ *
+ * This function is available starting with API version 14.
+ */
+_RS_RUNTIME float4 __attribute__((overloadable))
+rsMatrixMultiply(const rs_matrix4x4 *m, float4 in);
+
+/**
+ * \overload
+ */
+_RS_RUNTIME float4 __attribute__((overloadable))
+rsMatrixMultiply(const rs_matrix4x4 *m, float3 in);
+
+/**
+ * \overload
+ */
+_RS_RUNTIME float4 __attribute__((overloadable))
+rsMatrixMultiply(const rs_matrix4x4 *m, float2 in);
+
+/**
+ * \overload
+ */
+_RS_RUNTIME float3 __attribute__((overloadable))
+rsMatrixMultiply(const rs_matrix3x3 *m, float3 in);
+
+/**
+ * \overload
+ */
+_RS_RUNTIME float3 __attribute__((overloadable))
+rsMatrixMultiply(const rs_matrix3x3 *m, float2 in);
+
+/**
+ * \overload
+ */
+_RS_RUNTIME float2 __attribute__((overloadable))
+rsMatrixMultiply(const rs_matrix2x2 *m, float2 in);
+#endif
+
+
+/**
+ * Inverts a matrix in place.
+ *
+ * Returns true if the matrix was successfully inverted.
+ *
+ * @param m The matrix to invert.
+ */
+extern bool __attribute__((overloadable)) rsMatrixInverse(rs_matrix4x4 *m);
+
+/**
+ * Inverts and transposes a matrix in place.
+ *
+ * The matrix is first inverted then transposed.
+ * Returns true if the matrix was successfully inverted.
+ *
+ * @param m The matrix to modify.
+ */
+extern bool __attribute__((overloadable)) rsMatrixInverseTranspose(rs_matrix4x4 *m);
+
+/**
+ * Transpose the matrix m in place.
+ *
+ * @param m The matrix to transpose.
+ */
+extern void __attribute__((overloadable)) rsMatrixTranspose(rs_matrix4x4 *m);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable)) rsMatrixTranspose(rs_matrix3x3 *m);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable)) rsMatrixTranspose(rs_matrix2x2 *m);
+
+
+#endif
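A short sketch of the multiplication-order convention described in the header comment above (scale first, then translate); the function name is hypothetical:

    // combined = translate * scale, so a vector is scaled first, then translated.
    static float4 place_point(float3 p) {
        rs_matrix4x4 scale, translate, combined;
        rsMatrixLoadScale(&scale, 2.f, 2.f, 2.f);
        rsMatrixLoadTranslate(&translate, 10.f, 0.f, 0.f);
        rsMatrixLoadMultiply(&combined, &translate, &scale);
        return rsMatrixMultiply(&combined, p);   // p expanded with w = 1 (API 14+ const form)
    }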
diff --git a/21.1.2/include/rs_mesh.rsh b/21.1.2/include/rs_mesh.rsh
new file mode 100644
index 0000000..0ecd786
--- /dev/null
+++ b/21.1.2/include/rs_mesh.rsh
@@ -0,0 +1,88 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/** @file rs_mesh.rsh
+ *  \brief Mesh routines
+ *
+ *
+ */
+
+#ifndef __RS_MESH_RSH__
+#define __RS_MESH_RSH__
+
+// New APIs
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+
+/**
+ * Returns the number of allocations in the mesh that contain
+ * vertex data
+ *
+ * @param m mesh to get data from
+ * @return number of allocations in the mesh that contain vertex
+ *         data
+ */
+extern uint32_t __attribute__((overloadable))
+    rsgMeshGetVertexAllocationCount(rs_mesh m);
+
+/**
+ * Meshes can have multiple index sets; this function returns
+ * the number.
+ *
+ * @param m mesh to get data from
+ * @return number of primitive groups in the mesh. This would
+ *         include simple primitives as well as allocations
+ *         containing index data
+ */
+extern uint32_t __attribute__((overloadable))
+    rsgMeshGetPrimitiveCount(rs_mesh m);
+
+/**
+ * Returns an allocation that is part of the mesh and contains
+ * vertex data, e.g. positions, normals, texcoords
+ *
+ * @param m mesh to get data from
+ * @param index index of the vertex allocation
+ * @return allocation containing vertex data
+ */
+extern rs_allocation __attribute__((overloadable))
+    rsgMeshGetVertexAllocation(rs_mesh m, uint32_t index);
+
+/**
+ * Returns an allocation containing index data or a null
+ * allocation if only the primitive is specified
+ *
+ * @param m mesh to get data from
+ * @param index index of the index allocation
+ * @return allocation containing index data
+ */
+extern rs_allocation __attribute__((overloadable))
+    rsgMeshGetIndexAllocation(rs_mesh m, uint32_t index);
+
+/**
+ * Returns the primitive describing how a part of the mesh is
+ * rendered
+ *
+ * @param m mesh to get data from
+ * @param index index of the primitive
+ * @return primitive describing how the mesh is rendered
+ */
+extern rs_primitive __attribute__((overloadable))
+    rsgMeshGetPrimitive(rs_mesh m, uint32_t index);
+
+#endif // (defined(RS_VERSION) && (RS_VERSION >= 16))
+
+#endif // __RS_MESH_RSH__
+
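A hedged sketch of how the mesh queries compose (API 16+ assumed; the helper name is hypothetical):

    // Log what a mesh contains: vertex allocations and primitive groups.
    static void dump_mesh(rs_mesh m) {
        rsDebug("vertex allocations", rsgMeshGetVertexAllocationCount(m));
        uint32_t prims = rsgMeshGetPrimitiveCount(m);
        rsDebug("primitive groups", prims);
        for (uint32_t i = 0; i < prims; i++) {
            rs_allocation indices = rsgMeshGetIndexAllocation(m, i);
            rsDebug("has index data", rsIsObject(indices) ? 1 : 0);
        }
    }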
diff --git a/21.1.2/include/rs_object.rsh b/21.1.2/include/rs_object.rsh
new file mode 100644
index 0000000..ed6423b
--- /dev/null
+++ b/21.1.2/include/rs_object.rsh
@@ -0,0 +1,229 @@
+/*
+ * Copyright (C) 2011 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/** @file rs_object.rsh
+ *  \brief Object routines
+ *
+ *
+ */
+
+#ifndef __RS_OBJECT_RSH__
+#define __RS_OBJECT_RSH__
+
+
+/**
+ * Copy reference to the specified object.
+ *
+ * @param dst
+ * @param src
+ */
+extern void __attribute__((overloadable))
+    rsSetObject(rs_element *dst, rs_element src);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable))
+    rsSetObject(rs_type *dst, rs_type src);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable))
+    rsSetObject(rs_allocation *dst, rs_allocation src);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable))
+    rsSetObject(rs_sampler *dst, rs_sampler src);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable))
+    rsSetObject(rs_script *dst, rs_script src);
+
+#ifndef __LP64__
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable))
+    rsSetObject(rs_path *dst, rs_path src);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable))
+    rsSetObject(rs_mesh *dst, rs_mesh src);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable))
+    rsSetObject(rs_program_fragment *dst, rs_program_fragment src);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable))
+    rsSetObject(rs_program_vertex *dst, rs_program_vertex src);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable))
+    rsSetObject(rs_program_raster *dst, rs_program_raster src);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable))
+    rsSetObject(rs_program_store *dst, rs_program_store src);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable))
+    rsSetObject(rs_font *dst, rs_font src);
+#endif //__LP64__
+
+/**
+ * Sets the object to NULL.
+ *
+ * @param dst The object handle to clear.
+ */
+extern void __attribute__((overloadable))
+    rsClearObject(rs_element *dst);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable))
+    rsClearObject(rs_type *dst);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable))
+    rsClearObject(rs_allocation *dst);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable))
+    rsClearObject(rs_sampler *dst);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable))
+    rsClearObject(rs_script *dst);
+
+
+#ifndef __LP64__
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable))
+    rsClearObject(rs_path *dst);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable))
+    rsClearObject(rs_mesh *dst);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable))
+    rsClearObject(rs_program_fragment *dst);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable))
+    rsClearObject(rs_program_vertex *dst);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable))
+    rsClearObject(rs_program_raster *dst);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable))
+    rsClearObject(rs_program_store *dst);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable))
+    rsClearObject(rs_font *dst);
+#endif //__LP64__
+
+
+/**
+ * Tests if the object is valid.  Returns true if the object is valid, false if
+ * it is NULL.
+ *
+ * @return bool
+ */
+extern bool __attribute__((overloadable))
+    rsIsObject(rs_element);
+/**
+ * \overload
+ */
+extern bool __attribute__((overloadable))
+    rsIsObject(rs_type);
+/**
+ * \overload
+ */
+extern bool __attribute__((overloadable))
+    rsIsObject(rs_allocation);
+/**
+ * \overload
+ */
+extern bool __attribute__((overloadable))
+    rsIsObject(rs_sampler);
+/**
+ * \overload
+ */
+extern bool __attribute__((overloadable))
+    rsIsObject(rs_script);
+
+#ifndef __LP64__
+/**
+ * \overload
+ */
+extern bool __attribute__((overloadable))
+    rsIsObject(rs_path);
+/**
+ * \overload
+ */
+extern bool __attribute__((overloadable))
+    rsIsObject(rs_mesh);
+/**
+ * \overload
+ */
+extern bool __attribute__((overloadable))
+    rsIsObject(rs_program_fragment);
+/**
+ * \overload
+ */
+extern bool __attribute__((overloadable))
+    rsIsObject(rs_program_vertex);
+/**
+ * \overload
+ */
+extern bool __attribute__((overloadable))
+    rsIsObject(rs_program_raster);
+/**
+ * \overload
+ */
+extern bool __attribute__((overloadable))
+    rsIsObject(rs_program_store);
+/**
+ * \overload
+ */
+extern bool __attribute__((overloadable))
+    rsIsObject(rs_font);
+#endif //__LP64__
+
+#endif
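The declarations above imply a simple reference-handling pattern; a minimal sketch (gCached and the function name are hypothetical):

    rs_allocation gCached;   // hypothetical script global holding a cached reference

    void cache_allocation(rs_allocation a) {
        if (rsIsObject(gCached)) {
            rsClearObject(&gCached);   // release the previous reference
        }
        rsSetObject(&gCached, a);      // take a reference on the new object
    }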
diff --git a/21.1.2/include/rs_program.rsh b/21.1.2/include/rs_program.rsh
new file mode 100644
index 0000000..299aae6
--- /dev/null
+++ b/21.1.2/include/rs_program.rsh
@@ -0,0 +1,118 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/** @file rs_program.rsh
+ *  \brief Program object routines
+ *
+ *
+ */
+
+#ifndef __RS_PROGRAM_RSH__
+#define __RS_PROGRAM_RSH__
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+
+/**
+ * Get program store depth function
+ *
+ * @param ps program store to query
+ */
+extern rs_depth_func __attribute__((overloadable))
+    rsgProgramStoreGetDepthFunc(rs_program_store ps);
+
+/**
+ * Get program store depth mask
+ *
+ * @param ps program store to query
+ */
+extern bool __attribute__((overloadable))
+    rsgProgramStoreIsDepthMaskEnabled(rs_program_store ps);
+/**
+ * Get program store red component color mask
+ *
+ * @param ps program store to query
+ */
+extern bool __attribute__((overloadable))
+    rsgProgramStoreIsColorMaskRedEnabled(rs_program_store ps);
+
+/**
+ * Get program store green component color mask
+ *
+ * @param ps program store to query
+ */
+extern bool __attribute__((overloadable))
+    rsgProgramStoreIsColorMaskGreenEnabled(rs_program_store ps);
+
+/**
+ * Get program store blue component color mask
+ *
+ * @param ps program store to query
+ */
+extern bool __attribute__((overloadable))
+    rsgProgramStoreIsColorMaskBlueEnabled(rs_program_store ps);
+
+/**
+ * Get program store alpha component color mask
+ *
+ * @param ps program store to query
+ */
+extern bool __attribute__((overloadable))
+    rsgProgramStoreIsColorMaskAlphaEnabled(rs_program_store ps);
+
+/**
+ * Get program store blend source function
+ *
+ * @param ps program store to query
+ */
+extern rs_blend_src_func __attribute__((overloadable))
+        rsgProgramStoreGetBlendSrcFunc(rs_program_store ps);
+
+/**
+ * Get program store blend destination function
+ *
+ * @param ps program store to query
+ */
+extern rs_blend_dst_func __attribute__((overloadable))
+    rsgProgramStoreGetBlendDstFunc(rs_program_store ps);
+
+/**
+ * Get program store dither state
+ *
+ * @param ps program store to query
+ */
+extern bool __attribute__((overloadable))
+    rsgProgramStoreIsDitherEnabled(rs_program_store ps);
+
+/**
+ * Get program raster point sprite state
+ *
+ * @param pr program raster to query
+ */
+extern bool __attribute__((overloadable))
+    rsgProgramRasterIsPointSpriteEnabled(rs_program_raster pr);
+
+/**
+ * Get program raster cull mode
+ *
+ * @param pr program raster to query
+ */
+extern rs_cull_mode __attribute__((overloadable))
+    rsgProgramRasterGetCullMode(rs_program_raster pr);
+
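+/*
+ * Illustrative usage sketch, not part of the original header. gStore and
+ * gRaster are hypothetical globals bound from host code; rsDebug() is
+ * declared in rs_debug.rsh.
+ *
+ *   rs_program_store gStore;
+ *   rs_program_raster gRaster;
+ *
+ *   void dumpFixedFunctionState() {
+ *       rsDebug("depth func", (int)rsgProgramStoreGetDepthFunc(gStore));
+ *       rsDebug("dither",     (int)rsgProgramStoreIsDitherEnabled(gStore));
+ *       rsDebug("cull mode",  (int)rsgProgramRasterGetCullMode(gRaster));
+ *   }
+ */
+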
+#endif // (defined(RS_VERSION) && (RS_VERSION >= 16))
+
+#endif // __RS_PROGRAM_RSH__
+
diff --git a/21.1.2/include/rs_quaternion.rsh b/21.1.2/include/rs_quaternion.rsh
new file mode 100644
index 0000000..4e08d2f
--- /dev/null
+++ b/21.1.2/include/rs_quaternion.rsh
@@ -0,0 +1,253 @@
+/*
+ * Copyright (C) 2011 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/** @file rs_quaternion.rsh
+ *  \brief Quaternion routines
+ *
+ *
+ */
+
+#ifndef __RS_QUATERNION_RSH__
+#define __RS_QUATERNION_RSH__
+
+
+/**
+ * Set the quaternion components
+ * @param w component
+ * @param x component
+ * @param y component
+ * @param z component
+ */
+static void __attribute__((overloadable))
+rsQuaternionSet(rs_quaternion *q, float w, float x, float y, float z) {
+    q->w = w;
+    q->x = x;
+    q->y = y;
+    q->z = z;
+}
+
+/**
+ * Set the quaternion from another quaternion
+ * @param q destination quaternion
+ * @param rhs source quaternion
+ */
+static void __attribute__((overloadable))
+rsQuaternionSet(rs_quaternion *q, const rs_quaternion *rhs) {
+    q->w = rhs->w;
+    q->x = rhs->x;
+    q->y = rhs->y;
+    q->z = rhs->z;
+}
+
+/**
+ * Multiply quaternion by a scalar
+ * @param q quaternion to multiply
+ * @param s scalar
+ */
+static void __attribute__((overloadable))
+rsQuaternionMultiply(rs_quaternion *q, float s) {
+    q->w *= s;
+    q->x *= s;
+    q->y *= s;
+    q->z *= s;
+}
+
+/**
+ * Add two quaternions
+ * @param q destination quaternion to add to
+ * @param rhs right hand side quaternion to add
+ */
+static void
+rsQuaternionAdd(rs_quaternion *q, const rs_quaternion *rhs) {
+    q->w += rhs->w;
+    q->x += rhs->x;
+    q->y += rhs->y;
+    q->z += rhs->z;
+}
+
+/**
+ * Loads a quaternion that represents a rotation about an arbitrary unit vector
+ * @param q quaternion to set
+ * @param rot angle to rotate by
+ * @param x component of a vector
+ * @param y component of a vector
+ * @param z component of a vector
+ */
+static void
+rsQuaternionLoadRotateUnit(rs_quaternion *q, float rot, float x, float y, float z) {
+    rot *= (float)(M_PI / 180.0f) * 0.5f;
+    float c = cos(rot);
+    float s = sin(rot);
+
+    q->w = c;
+    q->x = x * s;
+    q->y = y * s;
+    q->z = z * s;
+}
+
+/**
+ * Loads a quaternion that represents a rotation about an arbitrary vector
+ * (doesn't have to be unit)
+ * @param q quaternion to set
+ * @param rot angle to rotate by
+ * @param x component of a vector
+ * @param y component of a vector
+ * @param z component of a vector
+ */
+static void
+rsQuaternionLoadRotate(rs_quaternion *q, float rot, float x, float y, float z) {
+    const float len = x*x + y*y + z*z;
+    if (len != 1) {
+        const float recipLen = 1.f / sqrt(len);
+        x *= recipLen;
+        y *= recipLen;
+        z *= recipLen;
+    }
+    rsQuaternionLoadRotateUnit(q, rot, x, y, z);
+}
+
+/**
+ * Conjugates the quaternion
+ * @param q quaternion to conjugate
+ */
+static void
+rsQuaternionConjugate(rs_quaternion *q) {
+    q->x = -q->x;
+    q->y = -q->y;
+    q->z = -q->z;
+}
+
+/**
+ * Dot product of two quaternions
+ * @param q0 first quaternion
+ * @param q1 second quaternion
+ * @return dot product between q0 and q1
+ */
+static float
+rsQuaternionDot(const rs_quaternion *q0, const rs_quaternion *q1) {
+    return q0->w*q1->w + q0->x*q1->x + q0->y*q1->y + q0->z*q1->z;
+}
+
+/**
+ * Normalizes the quaternion
+ * @param q quaternion to normalize
+ */
+static void
+rsQuaternionNormalize(rs_quaternion *q) {
+    const float len = rsQuaternionDot(q, q);
+    if (len != 1) {
+        const float recipLen = 1.f / sqrt(len);
+        rsQuaternionMultiply(q, recipLen);
+    }
+}
+
+/**
+ * Multiply quaternion by another quaternion
+ * @param q destination quaternion
+ * @param rhs right hand side quaternion to multiply by
+ */
+static void __attribute__((overloadable))
+rsQuaternionMultiply(rs_quaternion *q, const rs_quaternion *rhs) {
+    rs_quaternion qtmp;
+    rsQuaternionSet(&qtmp, q);
+
+    q->w = qtmp.w*rhs->w - qtmp.x*rhs->x - qtmp.y*rhs->y - qtmp.z*rhs->z;
+    q->x = qtmp.w*rhs->x + qtmp.x*rhs->w + qtmp.y*rhs->z - qtmp.z*rhs->y;
+    q->y = qtmp.w*rhs->y + qtmp.y*rhs->w + qtmp.z*rhs->x - qtmp.x*rhs->z;
+    q->z = qtmp.w*rhs->z + qtmp.z*rhs->w + qtmp.x*rhs->y - qtmp.y*rhs->x;
+    rsQuaternionNormalize(q);
+}
+
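+/*
+ * Illustrative sketch, not part of the original header: composing two axis
+ * rotations into a single quaternion.
+ *
+ *   rs_quaternion qx, qy;
+ *   rsQuaternionLoadRotate(&qx, 30.0f, 1.0f, 0.0f, 0.0f);  // 30 degrees about X
+ *   rsQuaternionLoadRotate(&qy, 45.0f, 0.0f, 1.0f, 0.0f);  // 45 degrees about Y
+ *   rsQuaternionMultiply(&qx, &qy);  // qx now holds the normalized product qx * qy
+ */
+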
+/**
+ * Performs spherical linear interpolation between two quaternions
+ * @param q result quaternion from interpolation
+ * @param q0 first param
+ * @param q1 second param
+ * @param t how much to interpolate by
+ */
+static void
+rsQuaternionSlerp(rs_quaternion *q, const rs_quaternion *q0, const rs_quaternion *q1, float t) {
+    if (t <= 0.0f) {
+        rsQuaternionSet(q, q0);
+        return;
+    }
+    if (t >= 1.0f) {
+        rsQuaternionSet(q, q1);
+        return;
+    }
+
+    rs_quaternion tempq0, tempq1;
+    rsQuaternionSet(&tempq0, q0);
+    rsQuaternionSet(&tempq1, q1);
+
+    float angle = rsQuaternionDot(q0, q1);
+    if (angle < 0) {
+        rsQuaternionMultiply(&tempq0, -1.0f);
+        angle *= -1.0f;
+    }
+
+    float scale, invScale;
+    if (angle + 1.0f > 0.05f) {
+        if (1.0f - angle >= 0.05f) {
+            float theta = acos(angle);
+            float invSinTheta = 1.0f / sin(theta);
+            scale = sin(theta * (1.0f - t)) * invSinTheta;
+            invScale = sin(theta * t) * invSinTheta;
+        } else {
+            scale = 1.0f - t;
+            invScale = t;
+        }
+    } else {
+        rsQuaternionSet(&tempq1, tempq0.z, -tempq0.y, tempq0.x, -tempq0.w);
+        scale = sin(M_PI * (0.5f - t));
+        invScale = sin(M_PI * t);
+    }
+
+    rsQuaternionSet(q, tempq0.w*scale + tempq1.w*invScale, tempq0.x*scale + tempq1.x*invScale,
+                        tempq0.y*scale + tempq1.y*invScale, tempq0.z*scale + tempq1.z*invScale);
+}
+
+/**
+ * Computes rotation matrix from the normalized quaternion
+ * @param m resulting matrix
+ * @param q normalized quaternion
+ */
+static void rsQuaternionGetMatrixUnit(rs_matrix4x4 *m, const rs_quaternion *q) {
+    float xx = q->x * q->x;
+    float xy = q->x * q->y;
+    float xz = q->x * q->z;
+    float xw = q->x * q->w;
+    float yy = q->y * q->y;
+    float yz = q->y * q->z;
+    float yw = q->y * q->w;
+    float zz = q->z * q->z;
+    float zw = q->z * q->w;
+
+    m->m[0]  = 1.0f - 2.0f * ( yy + zz );
+    m->m[4]  =        2.0f * ( xy - zw );
+    m->m[8]  =        2.0f * ( xz + yw );
+    m->m[1]  =        2.0f * ( xy + zw );
+    m->m[5]  = 1.0f - 2.0f * ( xx + zz );
+    m->m[9]  =        2.0f * ( yz - xw );
+    m->m[2]  =        2.0f * ( xz - yw );
+    m->m[6]  =        2.0f * ( yz + xw );
+    m->m[10] = 1.0f - 2.0f * ( xx + yy );
+    m->m[3]  = m->m[7] = m->m[11] = m->m[12] = m->m[13] = m->m[14] = 0.0f;
+    m->m[15] = 1.0f;
+}
+
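+/*
+ * Illustrative sketch, not part of the original header: interpolating between
+ * two orientations and converting the result to a matrix.
+ *
+ *   rs_quaternion qa, qb, q;
+ *   rs_matrix4x4 m;
+ *   rsQuaternionLoadRotateUnit(&qa,  0.0f, 0.0f, 0.0f, 1.0f);
+ *   rsQuaternionLoadRotateUnit(&qb, 90.0f, 0.0f, 0.0f, 1.0f);
+ *   rsQuaternionSlerp(&q, &qa, &qb, 0.5f);   // halfway: 45 degrees about Z
+ *   rsQuaternionGetMatrixUnit(&m, &q);       // q is already normalized
+ */
+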
+#endif
+
diff --git a/21.1.2/include/rs_sampler.rsh b/21.1.2/include/rs_sampler.rsh
new file mode 100644
index 0000000..2ff426c
--- /dev/null
+++ b/21.1.2/include/rs_sampler.rsh
@@ -0,0 +1,77 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/** @file rs_sampler.rsh
+ *  \brief Sampler routines
+ *
+ *
+ */
+
+#ifndef __RS_SAMPLER_RSH__
+#define __RS_SAMPLER_RSH__
+
+// New APIs
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+
+/**
+ * Get sampler minification value
+ *
+ * @param s sampler to query
+ * @return minification value
+ */
+extern rs_sampler_value __attribute__((overloadable))
+    rsSamplerGetMinification(rs_sampler s);
+
+/**
+ * Get sampler magnification value
+ *
+ * @param s sampler to query
+ * @return magnification value
+ */
+extern rs_sampler_value __attribute__((overloadable))
+    rsSamplerGetMagnification(rs_sampler s);
+
+/**
+ * Get sampler wrap S value
+ *
+ * @param s sampler to query
+ * @return wrap S value
+ */
+extern rs_sampler_value __attribute__((overloadable))
+    rsSamplerGetWrapS(rs_sampler s);
+
+/**
+ * Get sampler wrap T value
+ *
+ * @param s sampler to query
+ * @return wrap T value
+ */
+extern rs_sampler_value __attribute__((overloadable))
+    rsSamplerGetWrapT(rs_sampler s);
+
+/**
+ * Get sampler anisotropy
+ *
+ * @param s sampler to query
+ * @return anisotropy
+ */
+extern float __attribute__((overloadable))
+    rsSamplerGetAnisotropy(rs_sampler s);
+
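+/*
+ * Illustrative usage sketch, not part of the original header; gSampler is a
+ * hypothetical global set from host code, and RS_SAMPLER_WRAP comes from
+ * rs_types.rsh.
+ *
+ *   rs_sampler gSampler;
+ *
+ *   static bool usesWrap() {
+ *       return rsSamplerGetWrapS(gSampler) == RS_SAMPLER_WRAP ||
+ *              rsSamplerGetWrapT(gSampler) == RS_SAMPLER_WRAP;
+ *   }
+ */
+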
+#endif // (defined(RS_VERSION) && (RS_VERSION >= 16))
+
+#endif // __RS_SAMPLER_RSH__
+
diff --git a/21.1.2/include/rs_time.rsh b/21.1.2/include/rs_time.rsh
new file mode 100644
index 0000000..abcb88b
--- /dev/null
+++ b/21.1.2/include/rs_time.rsh
@@ -0,0 +1,115 @@
+/*
+ * Copyright (C) 2011 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/** @file rs_time.rsh
+ *  \brief RenderScript time routines
+ *
+ *  This file contains RenderScript functions relating to time and date
+ *  manipulation.
+ */
+
+#ifndef __RS_TIME_RSH__
+#define __RS_TIME_RSH__
+
+/**
+ * Calendar time interpreted as seconds elapsed since the Epoch (00:00:00 on
+ * January 1, 1970, Coordinated Universal Time (UTC)).
+ */
+#ifndef __LP64__
+typedef int rs_time_t;
+#else
+typedef long rs_time_t;
+#endif
+
+/**
+ * Data structure for broken-down time components.
+ *
+ * tm_sec   - Seconds after the minute. This ranges from 0 to 59, but possibly
+ *            up to 60 for leap seconds.
+ * tm_min   - Minutes after the hour. This ranges from 0 to 59.
+ * tm_hour  - Hours past midnight. This ranges from 0 to 23.
+ * tm_mday  - Day of the month. This ranges from 1 to 31.
+ * tm_mon   - Months since January. This ranges from 0 to 11.
+ * tm_year  - Years since 1900.
+ * tm_wday  - Days since Sunday. This ranges from 0 to 6.
+ * tm_yday  - Days since January 1. This ranges from 0 to 365.
+ * tm_isdst - Flag to indicate whether daylight saving time is in effect. The
+ *            value is positive if it is in effect, zero if it is not, and
+ *            negative if the information is not available.
+ */
+typedef struct {
+    int tm_sec;     ///< seconds
+    int tm_min;     ///< minutes
+    int tm_hour;    ///< hours
+    int tm_mday;    ///< day of the month
+    int tm_mon;     ///< month
+    int tm_year;    ///< year
+    int tm_wday;    ///< day of the week
+    int tm_yday;    ///< day of the year
+    int tm_isdst;   ///< daylight savings time
+} rs_tm;
+
+/**
+ * Returns the number of seconds since the Epoch (00:00:00 UTC, January 1,
+ * 1970). If @p timer is non-NULL, the result is also stored in the memory
+ * pointed to by this variable. If an error occurs, a value of -1 is returned.
+ *
+ * @param timer Location to also store the returned calendar time.
+ *
+ * @return Seconds since the Epoch.
+ */
+extern rs_time_t __attribute__((overloadable))
+    rsTime(rs_time_t *timer);
+
+/**
+ * Converts the time specified by @p timer into broken-down time and stores it
+ * in @p local. This function also returns a pointer to @p local. If @p local
+ * is NULL, this function does nothing and returns NULL.
+ *
+ * @param local Broken-down time.
+ * @param timer Input time as calendar time.
+ *
+ * @return Pointer to broken-down time (same as input @p local).
+ */
+extern rs_tm * __attribute__((overloadable))
+    rsLocaltime(rs_tm *local, const rs_time_t *timer);
+
+/**
+ * Returns the current system clock (uptime) in milliseconds.
+ *
+ * @return Uptime in milliseconds.
+ */
+extern int64_t __attribute__((overloadable))
+    rsUptimeMillis(void);
+
+/**
+ * Returns the current system clock (uptime) in nanoseconds.
+ *
+ * @return Uptime in nanoseconds.
+ */
+extern int64_t __attribute__((overloadable))
+    rsUptimeNanos(void);
+
+/**
+ * Returns the time in seconds since this function was last called in this
+ * script.
+ *
+ * @return Time in seconds.
+ */
+extern float __attribute__((overloadable))
+    rsGetDt(void);
+
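+/*
+ * Illustrative sketch, not part of the original header: reading calendar time
+ * and per-frame delta time. gAngle is a hypothetical global.
+ *
+ *   rs_time_t now = rsTime(NULL);
+ *   rs_tm tm;
+ *   rsLocaltime(&tm, &now);
+ *   rsDebug("hour", tm.tm_hour);
+ *
+ *   float dt = rsGetDt();      // seconds since the previous call in this script
+ *   gAngle += 90.0f * dt;      // advance an animation at 90 degrees per second
+ */
+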
+#endif
diff --git a/21.1.2/include/rs_types.rsh b/21.1.2/include/rs_types.rsh
new file mode 100644
index 0000000..f1fc60b
--- /dev/null
+++ b/21.1.2/include/rs_types.rsh
@@ -0,0 +1,651 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/** @file rs_types.rsh
+ *
+ *  Define the standard RenderScript types
+ *
+ *  Integers
+ *  8 bit: char, int8_t
+ *  16 bit: short, int16_t
+ *  32 bit: int, int32_t
+ *  64 bit: long, long long, int64_t
+ *
+ *  Unsigned Integers
+ *  8 bit: uchar, uint8_t
+ *  16 bit: ushort, uint16_t
+ *  32 bit: uint, uint32_t
+ *  64 bit: ulong, uint64_t
+ *
+ *  Floating point
+ *  32 bit: float
+ *  64 bit: double
+ *
+ *  Vectors of length 2, 3, and 4 are supported for all the types above.
+ *
+ */
+
+#ifndef __RS_TYPES_RSH__
+#define __RS_TYPES_RSH__
+
+/* Constants */
+#define M_E         2.718281828459045235360287471352662498f     /* e */
+#define M_LOG2E     1.442695040888963407359924681001892137f     /* log_2 e */
+#define M_LOG10E    0.434294481903251827651128918916605082f     /* log_10 e */
+#define M_LN2       0.693147180559945309417232121458176568f     /* log_e 2 */
+#define M_LN10      2.302585092994045684017991454684364208f     /* log_e 10 */
+#define M_PI        3.141592653589793238462643383279502884f     /* pi */
+#define M_PI_2      1.570796326794896619231321691639751442f     /* pi/2 */
+#define M_PI_4      0.785398163397448309615660845819875721f     /* pi/4 */
+#define M_1_PI      0.318309886183790671537767526745028724f     /* 1/pi */
+#define M_2_PIl     0.636619772367581343075535053490057448f     /* 2/pi */
+#define M_2_SQRTPI  1.128379167095512573896158903121545172f     /* 2/sqrt(pi) */
+#define M_SQRT2     1.414213562373095048801688724209698079f     /* sqrt(2) */
+#define M_SQRT1_2   0.707106781186547524400844362104849039f     /* 1/sqrt(2) */
+
+#include "stdbool.h"
+/**
+ * 8 bit integer type
+ */
+typedef char int8_t;
+/**
+ * 16 bit integer type
+ */
+typedef short int16_t;
+/**
+ * 32 bit integer type
+ */
+typedef int int32_t;
+/**
+ * 64 bit integer type
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+    typedef long int64_t;
+#else
+    typedef long long int64_t;
+#endif
+/**
+ * 8 bit unsigned integer type
+ */
+typedef unsigned char uint8_t;
+/**
+ * 16 bit unsigned integer type
+ */
+typedef unsigned short uint16_t;
+/**
+ * 32 bit unsigned integer type
+ */
+typedef unsigned int uint32_t;
+/**
+ * 64 bit unsigned integer type
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+    typedef unsigned long uint64_t;
+#else
+    typedef unsigned long long uint64_t;
+#endif
+/**
+ * 8 bit unsigned integer type
+ */
+typedef uint8_t uchar;
+/**
+ * 16 bit unsigned integer type
+ */
+typedef uint16_t ushort;
+/**
+ * 32 bit unsigned integer type
+ */
+typedef uint32_t uint;
+/**
+ * Typedef for unsigned long (use for 64-bit unsigned integers)
+ */
+typedef uint64_t ulong;
+/**
+ * Typedef for size_t
+ */
+#ifndef __LP64__
+typedef uint32_t size_t;
+typedef int32_t ssize_t;
+#else
+typedef uint64_t size_t;
+typedef int64_t ssize_t;
+#endif
+
+#ifndef __LP64__
+#define RS_BASE_OBJ typedef struct { const int* const p; } __attribute__((packed, aligned(4)))
+#else
+#define RS_BASE_OBJ typedef struct { const long* const p; const long* const r; const long* const v1; const long* const v2; }
+#endif
+
+/**
+ * \brief Opaque handle to a RenderScript element.
+ *
+ * See: android.renderscript.Element
+ */
+RS_BASE_OBJ rs_element;
+/**
+ * \brief Opaque handle to a RenderScript type.
+ *
+ * See: android.renderscript.Type
+ */
+RS_BASE_OBJ rs_type;
+/**
+ * \brief Opaque handle to a RenderScript allocation.
+ *
+ * See: android.renderscript.Allocation
+ */
+RS_BASE_OBJ rs_allocation;
+/**
+ * \brief Opaque handle to a RenderScript sampler object.
+ *
+ * See: android.renderscript.Sampler
+ */
+RS_BASE_OBJ rs_sampler;
+/**
+ * \brief Opaque handle to a RenderScript script object.
+ *
+ * See: android.renderscript.ScriptC
+ */
+RS_BASE_OBJ rs_script;
+
+#ifndef __LP64__
+/**
+ * \brief Opaque handle to a RenderScript mesh object.
+ *
+ * See: android.renderscript.Mesh
+ */
+typedef struct { const int* const p; } __attribute__((packed, aligned(4))) rs_mesh;
+/**
+ * \brief Opaque handle to a RenderScript Path object.
+ *
+ * See: android.renderscript.Path
+ */
+typedef struct { const int* const p; } __attribute__((packed, aligned(4))) rs_path;
+/**
+ * \brief Opaque handle to a RenderScript ProgramFragment object.
+ *
+ * See: android.renderscript.ProgramFragment
+ */
+typedef struct { const int* const p; } __attribute__((packed, aligned(4))) rs_program_fragment;
+/**
+ * \brief Opaque handle to a RenderScript ProgramVertex object.
+ *
+ * See: android.renderscript.ProgramVertex
+ */
+typedef struct { const int* const p; } __attribute__((packed, aligned(4))) rs_program_vertex;
+/**
+ * \brief Opaque handle to a RenderScript ProgramRaster object.
+ *
+ * See: android.renderscript.ProgramRaster
+ */
+typedef struct { const int* const p; } __attribute__((packed, aligned(4))) rs_program_raster;
+/**
+ * \brief Opaque handle to a RenderScript ProgramStore object.
+ *
+ * See: android.renderscript.ProgramStore
+ */
+typedef struct { const int* const p; } __attribute__((packed, aligned(4))) rs_program_store;
+/**
+ * \brief Opaque handle to a RenderScript font object.
+ *
+ * See: android.renderscript.Font
+ */
+typedef struct { const int* const p; } __attribute__((packed, aligned(4))) rs_font;
+#endif // __LP64__
+
+/**
+ * Vector version of the basic float type.
+ * Provides two float fields packed into a single 64 bit field with 64 bit
+ * alignment.
+ */
+typedef float float2 __attribute__((ext_vector_type(2)));
+/**
+ * Vector version of the basic float type. Provides three float fields packed
+ * into a single 128 bit field with 128 bit alignment.
+ */
+typedef float float3 __attribute__((ext_vector_type(3)));
+/**
+ * Vector version of the basic float type.
+ * Provides four float fields packed into a single 128 bit field with 128 bit
+ * alignment.
+ */
+typedef float float4 __attribute__((ext_vector_type(4)));
+
+/**
+ * Vector version of the basic double type. Provides two double fields packed
+ * into a single 128 bit field with 128 bit alignment.
+ */
+typedef double double2 __attribute__((ext_vector_type(2)));
+/**
+ * Vector version of the basic double type. Provides three double fields packed
+ * into a single 256 bit field with 256 bit alignment.
+ */
+typedef double double3 __attribute__((ext_vector_type(3)));
+/**
+ * Vector version of the basic double type. Provides four double fields packed
+ * into a single 256 bit field with 256 bit alignment.
+ */
+typedef double double4 __attribute__((ext_vector_type(4)));
+
+/**
+ * Vector version of the basic uchar type. Provides two uchar fields packed
+ * into a single 16 bit field with 16 bit alignment.
+ */
+typedef uchar uchar2 __attribute__((ext_vector_type(2)));
+/**
+ * Vector version of the basic uchar type. Provides three uchar fields packed
+ * into a single 32 bit field with 32 bit alignment.
+ */
+typedef uchar uchar3 __attribute__((ext_vector_type(3)));
+/**
+ * Vector version of the basic uchar type. Provides four uchar fields packed
+ * into a single 32 bit field with 32 bit alignment.
+ */
+typedef uchar uchar4 __attribute__((ext_vector_type(4)));
+
+/**
+ * Vector version of the basic ushort type. Provides two ushort fields packed
+ * into a single 32 bit field with 32 bit alignment.
+ */
+typedef ushort ushort2 __attribute__((ext_vector_type(2)));
+/**
+ * Vector version of the basic ushort type. Provides three ushort fields packed
+ * into a single 64 bit field with 64 bit alignment.
+ */
+typedef ushort ushort3 __attribute__((ext_vector_type(3)));
+/**
+ * Vector version of the basic ushort type. Provides four ushort fields packed
+ * into a single 64 bit field with 64 bit alignment.
+ */
+typedef ushort ushort4 __attribute__((ext_vector_type(4)));
+
+/**
+ * Vector version of the basic uint type. Provides two uint fields packed into a
+ * single 64 bit field with 64 bit alignment.
+ */
+typedef uint uint2 __attribute__((ext_vector_type(2)));
+/**
+ * Vector version of the basic uint type. Provides three uint fields packed into
+ * a single 128 bit field with 128 bit alignment.
+ */
+typedef uint uint3 __attribute__((ext_vector_type(3)));
+/**
+ * Vector version of the basic uint type. Provides four uint fields packed into
+ * a single 128 bit field with 128 bit alignment.
+ */
+typedef uint uint4 __attribute__((ext_vector_type(4)));
+
+/**
+ * Vector version of the basic ulong type. Provides two ulong fields packed into
+ * a single 128 bit field with 128 bit alignment.
+ */
+typedef ulong ulong2 __attribute__((ext_vector_type(2)));
+/**
+ * Vector version of the basic ulong type. Provides three ulong fields packed
+ * into a single 256 bit field with 256 bit alignment.
+ */
+typedef ulong ulong3 __attribute__((ext_vector_type(3)));
+/**
+ * Vector version of the basic ulong type. Provides four ulong fields packed
+ * into a single 256 bit field with 256 bit alignment.
+ */
+typedef ulong ulong4 __attribute__((ext_vector_type(4)));
+
+/**
+ * Vector version of the basic char type. Provides two char fields packed into a
+ * single 16 bit field with 16 bit alignment.
+ */
+typedef char char2 __attribute__((ext_vector_type(2)));
+/**
+ * Vector version of the basic char type. Provides three char fields packed into
+ * a single 32 bit field with 32 bit alignment.
+ */
+typedef char char3 __attribute__((ext_vector_type(3)));
+/**
+ * Vector version of the basic char type. Provides four char fields packed into
+ * a single 32 bit field with 32 bit alignment.
+ */
+typedef char char4 __attribute__((ext_vector_type(4)));
+
+/**
+ * Vector version of the basic short type. Provides two short fields packed into
+ * a single 32 bit field with 32 bit alignment.
+ */
+typedef short short2 __attribute__((ext_vector_type(2)));
+/**
+ * Vector version of the basic short type. Provides three short fields packed
+ * into a single 64 bit field with 64 bit alignment.
+ */
+typedef short short3 __attribute__((ext_vector_type(3)));
+/**
+ * Vector version of the basic short type. Provides four short fields packed
+ * into a single 64 bit field with 64 bit alignment.
+ */
+typedef short short4 __attribute__((ext_vector_type(4)));
+
+/**
+ * Vector version of the basic int type. Provides two int fields packed into a
+ * single 64 bit field with 64 bit alignment.
+ */
+typedef int int2 __attribute__((ext_vector_type(2)));
+/**
+ * Vector version of the basic int type. Provides three int fields packed into a
+ * single 128 bit field with 128 bit alignment.
+ */
+typedef int int3 __attribute__((ext_vector_type(3)));
+/**
+ * Vector version of the basic int type. Provides four int fields packed into a
+ * single 128 bit field with 128 bit alignment.
+ */
+typedef int int4 __attribute__((ext_vector_type(4)));
+
+/**
+ * Vector version of the basic long type. Provides two long fields packed into a
+ * single 128 bit field with 128 bit alignment.
+ */
+typedef long long2 __attribute__((ext_vector_type(2)));
+/**
+ * Vector version of the basic long type. Provides three long fields packed into
+ * a single 256 bit field with 256 bit alignment.
+ */
+typedef long long3 __attribute__((ext_vector_type(3)));
+/**
+ * Vector version of the basic long type. Provides four long fields packed into
+ * a single 256 bit field with 256 bit alignment.
+ */
+typedef long long4 __attribute__((ext_vector_type(4)));
+
+/**
+ * \brief 4x4 float matrix
+ *
+ * Native holder for RS matrix.  Elements are stored in the array at the
+ * location [row*4 + col]
+ */
+typedef struct {
+    float m[16];
+} rs_matrix4x4;
+/**
+ * \brief 3x3 float matrix
+ *
+ * Native holder for RS matrix.  Elements are stored in the array at the
+ * location [row*3 + col]
+ */
+typedef struct {
+    float m[9];
+} rs_matrix3x3;
+/**
+ * \brief 2x2 float matrix
+ *
+ * Native holder for RS matrix.  Elements are stored in the array at the
+ * location [row*2 + col]
+ */
+typedef struct {
+    float m[4];
+} rs_matrix2x2;
+
+/**
+ * quaternion type for use with the quaternion functions
+ */
+typedef float4 rs_quaternion;
+
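+/*
+ * Illustrative sketch, not part of the original header: the matrix structs are
+ * plain float arrays, addressed with the [row*4 + col] layout described above.
+ * rsMatrixLoadIdentity() is declared in rs_matrix.rsh.
+ *
+ *   rs_matrix4x4 m;
+ *   rsMatrixLoadIdentity(&m);
+ *   float d = m.m[1*4 + 1];    // element at row 1, col 1 (1.0f for identity)
+ *   m.m[0*4 + 3] = 5.0f;       // overwrite the element at row 0, col 3
+ */
+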
+#define RS_PACKED __attribute__((packed, aligned(4)))
+#define NULL ((void *)0)
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+
+/**
+ * \brief Enum for selecting cube map faces
+ */
+typedef enum {
+    RS_ALLOCATION_CUBEMAP_FACE_POSITIVE_X = 0,
+    RS_ALLOCATION_CUBEMAP_FACE_NEGATIVE_X = 1,
+    RS_ALLOCATION_CUBEMAP_FACE_POSITIVE_Y = 2,
+    RS_ALLOCATION_CUBEMAP_FACE_NEGATIVE_Y = 3,
+    RS_ALLOCATION_CUBEMAP_FACE_POSITIVE_Z = 4,
+    RS_ALLOCATION_CUBEMAP_FACE_NEGATIVE_Z = 5
+} rs_allocation_cubemap_face;
+
+/**
+ * \brief Bitfield to specify the usage types for an allocation.
+ *
+ * These values are ORed together to specify which usages or memory spaces are
+ * relevant to an allocation or an operation on an allocation.
+ */
+typedef enum {
+    RS_ALLOCATION_USAGE_SCRIPT = 0x0001,
+    RS_ALLOCATION_USAGE_GRAPHICS_TEXTURE = 0x0002,
+    RS_ALLOCATION_USAGE_GRAPHICS_VERTEX = 0x0004,
+    RS_ALLOCATION_USAGE_GRAPHICS_CONSTANTS = 0x0008,
+    RS_ALLOCATION_USAGE_GRAPHICS_RENDER_TARGET = 0x0010
+} rs_allocation_usage_type;
+
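+/*
+ * Illustrative sketch, not part of the original header: usage values are
+ * bitwise-ORed when an allocation serves several purposes.
+ *
+ *   uint32_t usage = RS_ALLOCATION_USAGE_SCRIPT |
+ *                    RS_ALLOCATION_USAGE_GRAPHICS_TEXTURE;
+ */
+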
+#endif //defined(RS_VERSION) && (RS_VERSION >= 14)
+
+// New APIs
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+
+#ifndef __LP64__
+/**
+ * Describes the way mesh vertex data is interpreted when rendering
+ *
+ **/
+typedef enum {
+    /**
+    * Vertex data will be rendered as a series of points
+    */
+    RS_PRIMITIVE_POINT              = 0,
+    /**
+    * Vertex pairs will be rendered as lines
+    */
+    RS_PRIMITIVE_LINE               = 1,
+    /**
+    * Vertex data will be rendered as a connected line strip
+    */
+    RS_PRIMITIVE_LINE_STRIP         = 2,
+    /**
+    * Vertices will be rendered as individual triangles
+    */
+    RS_PRIMITIVE_TRIANGLE           = 3,
+    /**
+    * Vertices will be rendered as a connected triangle strip
+    * defined by the first three vertices with each additional
+    * triangle defined by a new vertex
+    */
+    RS_PRIMITIVE_TRIANGLE_STRIP     = 4,
+    /**
+    * Vertices will be rendered as a sequence of triangles that all
+    * share the first vertex as the origin
+    */
+    RS_PRIMITIVE_TRIANGLE_FAN       = 5,
+
+    /**
+    * Invalid primitive
+    */
+    RS_PRIMITIVE_INVALID            = 100,
+} rs_primitive;
+#endif // __LP64__
+
+/**
+ * \brief Enumeration for possible element data types
+ *
+ * DataType represents the basic type information for a basic element.  The
+ * naming convention follows.  For numeric types it is FLOAT,
+ * SIGNED, or UNSIGNED followed by the _BITS where BITS is the
+ * size of the data.  BOOLEAN is a true / false (1,0)
+ * represented in an 8 bit container.  The UNSIGNED variants
+ * with multiple bit definitions are for packed graphical data
+ * formats and represent vectors with per vector member sizes
+ * which are treated as a single unit for packing and alignment
+ * purposes.
+ *
+ * MATRIX the three matrix types contain FLOAT_32 elements and are treated
+ * as 32 bits for alignment purposes.
+ *
+ * RS_* objects.  32 bit opaque handles.
+ */
+typedef enum {
+    RS_TYPE_NONE             = 0,
+    RS_TYPE_FLOAT_32         = 2,
+    RS_TYPE_FLOAT_64         = 3,
+    RS_TYPE_SIGNED_8         = 4,
+    RS_TYPE_SIGNED_16        = 5,
+    RS_TYPE_SIGNED_32        = 6,
+    RS_TYPE_SIGNED_64        = 7,
+    RS_TYPE_UNSIGNED_8       = 8,
+    RS_TYPE_UNSIGNED_16      = 9,
+    RS_TYPE_UNSIGNED_32      = 10,
+    RS_TYPE_UNSIGNED_64      = 11,
+
+    RS_TYPE_BOOLEAN          = 12,
+
+    RS_TYPE_UNSIGNED_5_6_5   = 13,
+    RS_TYPE_UNSIGNED_5_5_5_1 = 14,
+    RS_TYPE_UNSIGNED_4_4_4_4 = 15,
+
+    RS_TYPE_MATRIX_4X4       = 16,
+    RS_TYPE_MATRIX_3X3       = 17,
+    RS_TYPE_MATRIX_2X2       = 18,
+
+    RS_TYPE_ELEMENT          = 1000,
+    RS_TYPE_TYPE             = 1001,
+    RS_TYPE_ALLOCATION       = 1002,
+    RS_TYPE_SAMPLER          = 1003,
+    RS_TYPE_SCRIPT           = 1004,
+    RS_TYPE_MESH             = 1005,
+    RS_TYPE_PROGRAM_FRAGMENT = 1006,
+    RS_TYPE_PROGRAM_VERTEX   = 1007,
+    RS_TYPE_PROGRAM_RASTER   = 1008,
+    RS_TYPE_PROGRAM_STORE    = 1009,
+    RS_TYPE_FONT             = 1010,
+
+    RS_TYPE_INVALID          = 10000,
+} rs_data_type;
+
+/**
+ * \brief Enumeration for possible element data kind
+ *
+ * The special interpretation of the data if required.  This is primarily
+ * useful for graphical data.  USER indicates no special interpretation is
+ * expected.  PIXEL is used in conjunction with the standard data types for
+ * representing texture formats.
+ */
+typedef enum {
+    RS_KIND_USER         = 0,
+
+    RS_KIND_PIXEL_L      = 7,
+    RS_KIND_PIXEL_A      = 8,
+    RS_KIND_PIXEL_LA     = 9,
+    RS_KIND_PIXEL_RGB    = 10,
+    RS_KIND_PIXEL_RGBA   = 11,
+    RS_KIND_PIXEL_DEPTH  = 12,
+    RS_KIND_PIXEL_YUV    = 13,
+
+    RS_KIND_INVALID      = 100,
+} rs_data_kind;
+
+#ifndef __LP64__
+typedef enum {
+    /**
+    * Always drawn
+    */
+    RS_DEPTH_FUNC_ALWAYS        = 0,
+    /**
+    * Drawn if the incoming depth value is less than that in the
+    * depth buffer
+    */
+    RS_DEPTH_FUNC_LESS          = 1,
+    /**
+    * Drawn if the incoming depth value is less or equal to that in
+    * the depth buffer
+    */
+    RS_DEPTH_FUNC_LEQUAL        = 2,
+    /**
+    * Drawn if the incoming depth value is greater than that in the
+    * depth buffer
+    */
+    RS_DEPTH_FUNC_GREATER       = 3,
+    /**
+    * Drawn if the incoming depth value is greater or equal to that
+    * in the depth buffer
+    */
+    RS_DEPTH_FUNC_GEQUAL        = 4,
+    /**
+    * Drawn if the incoming depth value is equal to that in the
+    * depth buffer
+    */
+    RS_DEPTH_FUNC_EQUAL         = 5,
+    /**
+    * Drawn if the incoming depth value is not equal to that in the
+    * depth buffer
+    */
+    RS_DEPTH_FUNC_NOTEQUAL      = 6,
+    /**
+    * Invalid depth function
+    */
+    RS_DEPTH_FUNC_INVALID       = 100,
+} rs_depth_func;
+
+typedef enum {
+    RS_BLEND_SRC_ZERO                   = 0,
+    RS_BLEND_SRC_ONE                    = 1,
+    RS_BLEND_SRC_DST_COLOR              = 2,
+    RS_BLEND_SRC_ONE_MINUS_DST_COLOR    = 3,
+    RS_BLEND_SRC_SRC_ALPHA              = 4,
+    RS_BLEND_SRC_ONE_MINUS_SRC_ALPHA    = 5,
+    RS_BLEND_SRC_DST_ALPHA              = 6,
+    RS_BLEND_SRC_ONE_MINUS_DST_ALPHA    = 7,
+    RS_BLEND_SRC_SRC_ALPHA_SATURATE     = 8,
+
+    RS_BLEND_SRC_INVALID                = 100,
+} rs_blend_src_func;
+
+typedef enum {
+    RS_BLEND_DST_ZERO                   = 0,
+    RS_BLEND_DST_ONE                    = 1,
+    RS_BLEND_DST_SRC_COLOR              = 2,
+    RS_BLEND_DST_ONE_MINUS_SRC_COLOR    = 3,
+    RS_BLEND_DST_SRC_ALPHA              = 4,
+    RS_BLEND_DST_ONE_MINUS_SRC_ALPHA    = 5,
+    RS_BLEND_DST_DST_ALPHA              = 6,
+    RS_BLEND_DST_ONE_MINUS_DST_ALPHA    = 7,
+
+    RS_BLEND_DST_INVALID                = 100,
+} rs_blend_dst_func;
+
+typedef enum {
+    RS_CULL_BACK     = 0,
+    RS_CULL_FRONT    = 1,
+    RS_CULL_NONE     = 2,
+
+    RS_CULL_INVALID  = 100,
+} rs_cull_mode;
+#endif //__LP64__
+
+typedef enum {
+    RS_SAMPLER_NEAREST              = 0,
+    RS_SAMPLER_LINEAR               = 1,
+    RS_SAMPLER_LINEAR_MIP_LINEAR    = 2,
+    RS_SAMPLER_WRAP                 = 3,
+    RS_SAMPLER_CLAMP                = 4,
+    RS_SAMPLER_LINEAR_MIP_NEAREST   = 5,
+    RS_SAMPLER_MIRRORED_REPEAT      = 6,
+
+    RS_SAMPLER_INVALID              = 100,
+} rs_sampler_value;
+
+#endif // (defined(RS_VERSION) && (RS_VERSION >= 16))
+
+#endif // __RS_TYPES_RSH__
diff --git a/21.1.2/lib/libLLVM.so b/21.1.2/lib/libLLVM.so
new file mode 100755
index 0000000..c6e927f
--- /dev/null
+++ b/21.1.2/lib/libLLVM.so
Binary files differ
diff --git a/21.1.2/lib/libc++.so b/21.1.2/lib/libc++.so
new file mode 100755
index 0000000..723d71c
--- /dev/null
+++ b/21.1.2/lib/libc++.so
Binary files differ
diff --git a/21.1.2/lib/libclang.so b/21.1.2/lib/libclang.so
new file mode 100755
index 0000000..ce91323
--- /dev/null
+++ b/21.1.2/lib/libclang.so
Binary files differ
diff --git a/22.0.1/bin/llvm-rs-cc b/22.0.1/bin/llvm-rs-cc
new file mode 100755
index 0000000..030bbc9
--- /dev/null
+++ b/22.0.1/bin/llvm-rs-cc
Binary files differ
diff --git a/22.0.1/clang-include/CMakeLists.txt b/22.0.1/clang-include/CMakeLists.txt
new file mode 100644
index 0000000..edee7d7
--- /dev/null
+++ b/22.0.1/clang-include/CMakeLists.txt
@@ -0,0 +1,81 @@
+set(files
+  altivec.h
+  ammintrin.h
+  arm_acle.h
+  avxintrin.h
+  avx2intrin.h
+  bmiintrin.h
+  bmi2intrin.h
+  emmintrin.h
+  f16cintrin.h
+  float.h
+  fma4intrin.h
+  fmaintrin.h
+  ia32intrin.h
+  immintrin.h
+  iso646.h
+  Intrin.h
+  limits.h
+  lzcntintrin.h
+  mm3dnow.h
+  mmintrin.h
+  mm_malloc.h
+  nmmintrin.h
+  pmmintrin.h
+  popcntintrin.h
+  prfchwintrin.h
+  rdseedintrin.h
+  rtmintrin.h
+  shaintrin.h
+  smmintrin.h
+  stdalign.h
+  stdarg.h
+  stdbool.h
+  stddef.h
+  stdint.h
+  stdnoreturn.h
+  tbmintrin.h
+  tgmath.h
+  tmmintrin.h
+  varargs.h
+  wmmintrin.h
+  __wmmintrin_aes.h
+  __wmmintrin_pclmul.h
+  x86intrin.h
+  xmmintrin.h
+  xopintrin.h
+  cpuid.h
+  unwind.h
+  module.modulemap
+  )
+
+set(output_dir ${LLVM_LIBRARY_OUTPUT_INTDIR}/clang/${CLANG_VERSION}/include)
+
+# Generate arm_neon.h
+clang_tablegen(arm_neon.h -gen-arm-neon
+  SOURCE ${CLANG_SOURCE_DIR}/include/clang/Basic/arm_neon.td)
+
+set(out_files)
+foreach( f ${files} )
+  set( src ${CMAKE_CURRENT_SOURCE_DIR}/${f} )
+  set( dst ${output_dir}/${f} )
+  add_custom_command(OUTPUT ${dst}
+    DEPENDS ${src}
+    COMMAND ${CMAKE_COMMAND} -E copy_if_different ${src} ${dst}
+    COMMENT "Copying clang's ${f}...")
+  list(APPEND out_files ${dst})
+endforeach( f )
+
+add_custom_command(OUTPUT ${output_dir}/arm_neon.h 
+  DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/arm_neon.h
+  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_BINARY_DIR}/arm_neon.h ${output_dir}/arm_neon.h
+  COMMENT "Copying clang's arm_neon.h...")
+list(APPEND out_files ${output_dir}/arm_neon.h)
+
+add_custom_target(clang-headers ALL DEPENDS ${out_files})
+set_target_properties(clang-headers PROPERTIES FOLDER "Misc")
+
+install(
+  FILES ${files} ${CMAKE_CURRENT_BINARY_DIR}/arm_neon.h
+  PERMISSIONS OWNER_READ OWNER_WRITE GROUP_READ WORLD_READ
+  DESTINATION lib${LLVM_LIBDIR_SUFFIX}/clang/${CLANG_VERSION}/include)
diff --git a/22.0.1/clang-include/Intrin.h b/22.0.1/clang-include/Intrin.h
new file mode 100644
index 0000000..13e105e
--- /dev/null
+++ b/22.0.1/clang-include/Intrin.h
@@ -0,0 +1,1014 @@
+/* ===-------- Intrin.h ---------------------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/* Only include this if we're compiling for the windows platform. */
+#ifndef _MSC_VER
+#include_next <Intrin.h>
+#else
+
+#ifndef __INTRIN_H
+#define __INTRIN_H
+
+/* First include the standard intrinsics. */
+#if defined(__i386__) || defined(__x86_64__)
+#include <x86intrin.h>
+#endif
+
+/* For the definition of jmp_buf. */
+#if __STDC_HOSTED__
+#include <setjmp.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(__MMX__)
+/* And the random ones that aren't in those files. */
+__m64 _m_from_float(float);
+__m64 _m_from_int(int _l);
+void _m_prefetch(void *);
+float _m_to_float(__m64);
+int _m_to_int(__m64 _M);
+#endif
+
+/* Other assorted instruction intrinsics. */
+void __addfsbyte(unsigned long, unsigned char);
+void __addfsdword(unsigned long, unsigned long);
+void __addfsword(unsigned long, unsigned short);
+void __code_seg(const char *);
+static __inline__
+void __cpuid(int[4], int);
+static __inline__
+void __cpuidex(int[4], int, int);
+void __debugbreak(void);
+__int64 __emul(int, int);
+unsigned __int64 __emulu(unsigned int, unsigned int);
+void __cdecl __fastfail(unsigned int);
+unsigned int __getcallerseflags(void);
+static __inline__
+void __halt(void);
+unsigned char __inbyte(unsigned short);
+void __inbytestring(unsigned short, unsigned char *, unsigned long);
+void __incfsbyte(unsigned long);
+void __incfsdword(unsigned long);
+void __incfsword(unsigned long);
+unsigned long __indword(unsigned short);
+void __indwordstring(unsigned short, unsigned long *, unsigned long);
+void __int2c(void);
+void __invlpg(void *);
+unsigned short __inword(unsigned short);
+void __inwordstring(unsigned short, unsigned short *, unsigned long);
+void __lidt(void *);
+unsigned __int64 __ll_lshift(unsigned __int64, int);
+__int64 __ll_rshift(__int64, int);
+void __llwpcb(void *);
+unsigned char __lwpins32(unsigned int, unsigned int, unsigned int);
+void __lwpval32(unsigned int, unsigned int, unsigned int);
+unsigned int __lzcnt(unsigned int);
+unsigned short __lzcnt16(unsigned short);
+static __inline__
+void __movsb(unsigned char *, unsigned char const *, size_t);
+static __inline__
+void __movsd(unsigned long *, unsigned long const *, size_t);
+static __inline__
+void __movsw(unsigned short *, unsigned short const *, size_t);
+void __nop(void);
+void __nvreg_restore_fence(void);
+void __nvreg_save_fence(void);
+void __outbyte(unsigned short, unsigned char);
+void __outbytestring(unsigned short, unsigned char *, unsigned long);
+void __outdword(unsigned short, unsigned long);
+void __outdwordstring(unsigned short, unsigned long *, unsigned long);
+void __outword(unsigned short, unsigned short);
+void __outwordstring(unsigned short, unsigned short *, unsigned long);
+static __inline__
+unsigned int __popcnt(unsigned int);
+static __inline__
+unsigned short __popcnt16(unsigned short);
+unsigned long __readcr0(void);
+unsigned long __readcr2(void);
+static __inline__
+unsigned long __readcr3(void);
+unsigned long __readcr4(void);
+unsigned long __readcr8(void);
+unsigned int __readdr(unsigned int);
+#ifdef __i386__
+static __inline__
+unsigned char __readfsbyte(unsigned long);
+static __inline__
+unsigned long __readfsdword(unsigned long);
+static __inline__
+unsigned __int64 __readfsqword(unsigned long);
+static __inline__
+unsigned short __readfsword(unsigned long);
+#endif
+static __inline__
+unsigned __int64 __readmsr(unsigned long);
+unsigned __int64 __readpmc(unsigned long);
+unsigned long __segmentlimit(unsigned long);
+void __sidt(void *);
+void *__slwpcb(void);
+static __inline__
+void __stosb(unsigned char *, unsigned char, size_t);
+static __inline__
+void __stosd(unsigned long *, unsigned long, size_t);
+static __inline__
+void __stosw(unsigned short *, unsigned short, size_t);
+void __svm_clgi(void);
+void __svm_invlpga(void *, int);
+void __svm_skinit(int);
+void __svm_stgi(void);
+void __svm_vmload(size_t);
+void __svm_vmrun(size_t);
+void __svm_vmsave(size_t);
+void __ud2(void);
+unsigned __int64 __ull_rshift(unsigned __int64, int);
+void __vmx_off(void);
+void __vmx_vmptrst(unsigned __int64 *);
+void __wbinvd(void);
+void __writecr0(unsigned int);
+static __inline__
+void __writecr3(unsigned int);
+void __writecr4(unsigned int);
+void __writecr8(unsigned int);
+void __writedr(unsigned int, unsigned int);
+void __writefsbyte(unsigned long, unsigned char);
+void __writefsdword(unsigned long, unsigned long);
+void __writefsqword(unsigned long, unsigned __int64);
+void __writefsword(unsigned long, unsigned short);
+void __writemsr(unsigned long, unsigned __int64);
+static __inline__
+void *_AddressOfReturnAddress(void);
+unsigned int _andn_u32(unsigned int, unsigned int);
+unsigned int _bextr_u32(unsigned int, unsigned int, unsigned int);
+unsigned int _bextri_u32(unsigned int, unsigned int);
+static __inline__
+unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask);
+static __inline__
+unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask);
+static __inline__
+unsigned char _bittest(long const *, long);
+static __inline__
+unsigned char _bittestandcomplement(long *, long);
+static __inline__
+unsigned char _bittestandreset(long *, long);
+static __inline__
+unsigned char _bittestandset(long *, long);
+unsigned int _blcfill_u32(unsigned int);
+unsigned int _blci_u32(unsigned int);
+unsigned int _blcic_u32(unsigned int);
+unsigned int _blcmsk_u32(unsigned int);
+unsigned int _blcs_u32(unsigned int);
+unsigned int _blsfill_u32(unsigned int);
+unsigned int _blsi_u32(unsigned int);
+unsigned int _blsic_u32(unsigned int);
+unsigned int _blsmsk_u32(unsigned int);
+unsigned int _blsr_u32(unsigned int);
+unsigned __int64 __cdecl _byteswap_uint64(unsigned __int64);
+unsigned long __cdecl _byteswap_ulong(unsigned long);
+unsigned short __cdecl _byteswap_ushort(unsigned short);
+unsigned _bzhi_u32(unsigned int, unsigned int);
+void __cdecl _disable(void);
+void __cdecl _enable(void);
+void __cdecl _fxrstor(void const *);
+void __cdecl _fxsave(void *);
+long _InterlockedAddLargeStatistic(__int64 volatile *_Addend, long _Value);
+static __inline__
+long _InterlockedAnd(long volatile *_Value, long _Mask);
+static __inline__
+short _InterlockedAnd16(short volatile *_Value, short _Mask);
+static __inline__
+char _InterlockedAnd8(char volatile *_Value, char _Mask);
+unsigned char _interlockedbittestandreset(long volatile *, long);
+static __inline__
+unsigned char _interlockedbittestandset(long volatile *, long);
+static __inline__
+long __cdecl _InterlockedCompareExchange(long volatile *_Destination,
+                                         long _Exchange, long _Comparand);
+long _InterlockedCompareExchange_HLEAcquire(long volatile *, long, long);
+long _InterlockedCompareExchange_HLERelease(long volatile *, long, long);
+static __inline__
+short _InterlockedCompareExchange16(short volatile *_Destination,
+                                    short _Exchange, short _Comparand);
+static __inline__
+__int64 _InterlockedCompareExchange64(__int64 volatile *_Destination,
+                                      __int64 _Exchange, __int64 _Comparand);
+__int64 _InterlockedCompareExchange64_HLEAcquire(__int64 volatile *, __int64,
+                                                 __int64);
+__int64 _InterlockedCompareExchange64_HLERelease(__int64 volatile *, __int64,
+                                                 __int64);
+static __inline__
+char _InterlockedCompareExchange8(char volatile *_Destination, char _Exchange,
+                                  char _Comparand);
+void *_InterlockedCompareExchangePointer_HLEAcquire(void *volatile *, void *,
+                                                    void *);
+void *_InterlockedCompareExchangePointer_HLERelease(void *volatile *, void *,
+                                                    void *);
+static __inline__
+long __cdecl _InterlockedDecrement(long volatile *_Addend);
+static __inline__
+short _InterlockedDecrement16(short volatile *_Addend);
+long _InterlockedExchange(long volatile *_Target, long _Value);
+static __inline__
+short _InterlockedExchange16(short volatile *_Target, short _Value);
+static __inline__
+char _InterlockedExchange8(char volatile *_Target, char _Value);
+static __inline__
+long __cdecl _InterlockedExchangeAdd(long volatile *_Addend, long _Value);
+long _InterlockedExchangeAdd_HLEAcquire(long volatile *, long);
+long _InterlockedExchangeAdd_HLERelease(long volatile *, long);
+static __inline__
+short _InterlockedExchangeAdd16(short volatile *_Addend, short _Value);
+__int64 _InterlockedExchangeAdd64_HLEAcquire(__int64 volatile *, __int64);
+__int64 _InterlockedExchangeAdd64_HLERelease(__int64 volatile *, __int64);
+static __inline__
+char _InterlockedExchangeAdd8(char volatile *_Addend, char _Value);
+static __inline__
+long __cdecl _InterlockedIncrement(long volatile *_Addend);
+static __inline__
+short _InterlockedIncrement16(short volatile *_Addend);
+static __inline__
+long _InterlockedOr(long volatile *_Value, long _Mask);
+static __inline__
+short _InterlockedOr16(short volatile *_Value, short _Mask);
+static __inline__
+char _InterlockedOr8(char volatile *_Value, char _Mask);
+static __inline__
+long _InterlockedXor(long volatile *_Value, long _Mask);
+static __inline__
+short _InterlockedXor16(short volatile *_Value, short _Mask);
+static __inline__
+char _InterlockedXor8(char volatile *_Value, char _Mask);
+void __cdecl _invpcid(unsigned int, void *);
+static __inline__
+unsigned long __cdecl _lrotl(unsigned long, int);
+static __inline__
+unsigned long __cdecl _lrotr(unsigned long, int);
+static __inline__
+unsigned int _lzcnt_u32(unsigned int);
+static __inline__
+void _ReadBarrier(void);
+static __inline__
+void _ReadWriteBarrier(void);
+static __inline__
+void *_ReturnAddress(void);
+unsigned int _rorx_u32(unsigned int, const unsigned int);
+int __cdecl _rdrand16_step(unsigned short *);
+int __cdecl _rdrand32_step(unsigned int *);
+static __inline__
+unsigned int __cdecl _rotl(unsigned int _Value, int _Shift);
+static __inline__
+unsigned short _rotl16(unsigned short _Value, unsigned char _Shift);
+static __inline__
+unsigned __int64 __cdecl _rotl64(unsigned __int64 _Value, int _Shift);
+static __inline__
+unsigned char _rotl8(unsigned char _Value, unsigned char _Shift);
+static __inline__
+unsigned int __cdecl _rotr(unsigned int _Value, int _Shift);
+static __inline__
+unsigned short _rotr16(unsigned short _Value, unsigned char _Shift);
+static __inline__
+unsigned __int64 __cdecl _rotr64(unsigned __int64 _Value, int _Shift);
+static __inline__
+unsigned char _rotr8(unsigned char _Value, unsigned char _Shift);
+int _sarx_i32(int, unsigned int);
+#if __STDC_HOSTED__
+int __cdecl _setjmp(jmp_buf);
+#endif
+unsigned int _shlx_u32(unsigned int, unsigned int);
+unsigned int _shrx_u32(unsigned int, unsigned int);
+void _Store_HLERelease(long volatile *, long);
+void _Store64_HLERelease(__int64 volatile *, __int64);
+void _StorePointer_HLERelease(void *volatile *, void *);
+unsigned int _t1mskc_u32(unsigned int);
+unsigned int _tzcnt_u32(unsigned int);
+unsigned int _tzmsk_u32(unsigned int);
+static __inline__
+void _WriteBarrier(void);
+void _xabort(const unsigned int imm);
+unsigned __int32 _xbegin(void);
+void _xend(void);
+static __inline__
+unsigned __int64 __cdecl _xgetbv(unsigned int);
+void __cdecl _xrstor(void const *, unsigned __int64);
+void __cdecl _xsave(void *, unsigned __int64);
+void __cdecl _xsaveopt(void *, unsigned __int64);
+void __cdecl _xsetbv(unsigned int, unsigned __int64);
+unsigned char _xtest(void);
+
+/* These additional intrinsics are turned on in x64/amd64/x86_64 mode. */
+#ifdef __x86_64__
+void __addgsbyte(unsigned long, unsigned char);
+void __addgsdword(unsigned long, unsigned long);
+void __addgsqword(unsigned long, unsigned __int64);
+void __addgsword(unsigned long, unsigned short);
+static __inline__
+void __faststorefence(void);
+void __incgsbyte(unsigned long);
+void __incgsdword(unsigned long);
+void __incgsqword(unsigned long);
+void __incgsword(unsigned long);
+unsigned char __lwpins64(unsigned __int64, unsigned int, unsigned int);
+void __lwpval64(unsigned __int64, unsigned int, unsigned int);
+unsigned __int64 __lzcnt64(unsigned __int64);
+static __inline__
+void __movsq(unsigned long long *, unsigned long long const *, size_t);
+__int64 __mulh(__int64, __int64);
+static __inline__
+unsigned __int64 __popcnt64(unsigned __int64);
+static __inline__
+unsigned char __readgsbyte(unsigned long);
+static __inline__
+unsigned long __readgsdword(unsigned long);
+static __inline__
+unsigned __int64 __readgsqword(unsigned long);
+unsigned short __readgsword(unsigned long);
+unsigned __int64 __shiftleft128(unsigned __int64 _LowPart,
+                                unsigned __int64 _HighPart,
+                                unsigned char _Shift);
+unsigned __int64 __shiftright128(unsigned __int64 _LowPart,
+                                 unsigned __int64 _HighPart,
+                                 unsigned char _Shift);
+static __inline__
+void __stosq(unsigned __int64 *, unsigned __int64, size_t);
+unsigned __int64 __umulh(unsigned __int64, unsigned __int64);
+unsigned char __vmx_on(unsigned __int64 *);
+unsigned char __vmx_vmclear(unsigned __int64 *);
+unsigned char __vmx_vmlaunch(void);
+unsigned char __vmx_vmptrld(unsigned __int64 *);
+unsigned char __vmx_vmread(size_t, size_t *);
+unsigned char __vmx_vmresume(void);
+unsigned char __vmx_vmwrite(size_t, size_t);
+void __writegsbyte(unsigned long, unsigned char);
+void __writegsdword(unsigned long, unsigned long);
+void __writegsqword(unsigned long, unsigned __int64);
+void __writegsword(unsigned long, unsigned short);
+unsigned __int64 _andn_u64(unsigned __int64, unsigned __int64);
+unsigned __int64 _bextr_u64(unsigned __int64, unsigned int, unsigned int);
+unsigned __int64 _bextri_u64(unsigned __int64, unsigned int);
+static __inline__
+unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask);
+static __inline__
+unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask);
+static __inline__
+unsigned char _bittest64(__int64 const *, __int64);
+static __inline__
+unsigned char _bittestandcomplement64(__int64 *, __int64);
+static __inline__
+unsigned char _bittestandreset64(__int64 *, __int64);
+static __inline__
+unsigned char _bittestandset64(__int64 *, __int64);
+unsigned __int64 _blcfill_u64(unsigned __int64);
+unsigned __int64 _blci_u64(unsigned __int64);
+unsigned __int64 _blcic_u64(unsigned __int64);
+unsigned __int64 _blcmsk_u64(unsigned __int64);
+unsigned __int64 _blcs_u64(unsigned __int64);
+unsigned __int64 _blsfill_u64(unsigned __int64);
+unsigned __int64 _blsi_u64(unsigned __int64);
+unsigned __int64 _blsic_u64(unsigned __int64);
+unsigned __int64 _blsmsk_u64(unsigned __int64);
+unsigned __int64 _blsr_u64(unsigned __int64);
+unsigned __int64 __cdecl _byteswap_uint64(unsigned __int64);
+unsigned __int64 _bzhi_u64(unsigned __int64, unsigned int);
+void __cdecl _fxrstor64(void const *);
+void __cdecl _fxsave64(void *);
+long _InterlockedAnd_np(long volatile *_Value, long _Mask);
+short _InterlockedAnd16_np(short volatile *_Value, short _Mask);
+__int64 _InterlockedAnd64_np(__int64 volatile *_Value, __int64 _Mask);
+char _InterlockedAnd8_np(char volatile *_Value, char _Mask);
+unsigned char _interlockedbittestandreset64(__int64 volatile *, __int64);
+static __inline__
+unsigned char _interlockedbittestandset64(__int64 volatile *, __int64);
+long _InterlockedCompareExchange_np(long volatile *_Destination, long _Exchange,
+                                    long _Comparand);
+unsigned char _InterlockedCompareExchange128(__int64 volatile *_Destination,
+                                             __int64 _ExchangeHigh,
+                                             __int64 _ExchangeLow,
+                                             __int64 *_ComparandResult);
+unsigned char _InterlockedCompareExchange128_np(__int64 volatile *_Destination,
+                                                __int64 _ExchangeHigh,
+                                                __int64 _ExchangeLow,
+                                                __int64 *_ComparandResult);
+short _InterlockedCompareExchange16_np(short volatile *_Destination,
+                                       short _Exchange, short _Comparand);
+__int64 _InterlockedCompareExchange64_HLEAcquire(__int64 volatile *, __int64,
+                                                 __int64);
+__int64 _InterlockedCompareExchange64_HLERelease(__int64 volatile *, __int64,
+                                                 __int64);
+__int64 _InterlockedCompareExchange64_np(__int64 volatile *_Destination,
+                                         __int64 _Exchange, __int64 _Comparand);
+void *_InterlockedCompareExchangePointer(void *volatile *_Destination,
+                                         void *_Exchange, void *_Comparand);
+void *_InterlockedCompareExchangePointer_np(void *volatile *_Destination,
+                                            void *_Exchange, void *_Comparand);
+static __inline__
+__int64 _InterlockedDecrement64(__int64 volatile *_Addend);
+static __inline__
+__int64 _InterlockedExchange64(__int64 volatile *_Target, __int64 _Value);
+static __inline__
+__int64 _InterlockedExchangeAdd64(__int64 volatile *_Addend, __int64 _Value);
+void *_InterlockedExchangePointer(void *volatile *_Target, void *_Value);
+static __inline__
+__int64 _InterlockedIncrement64(__int64 volatile *_Addend);
+long _InterlockedOr_np(long volatile *_Value, long _Mask);
+short _InterlockedOr16_np(short volatile *_Value, short _Mask);
+static __inline__
+__int64 _InterlockedOr64(__int64 volatile *_Value, __int64 _Mask);
+__int64 _InterlockedOr64_np(__int64 volatile *_Value, __int64 _Mask);
+char _InterlockedOr8_np(char volatile *_Value, char _Mask);
+long _InterlockedXor_np(long volatile *_Value, long _Mask);
+short _InterlockedXor16_np(short volatile *_Value, short _Mask);
+static __inline__
+__int64 _InterlockedXor64(__int64 volatile *_Value, __int64 _Mask);
+__int64 _InterlockedXor64_np(__int64 volatile *_Value, __int64 _Mask);
+char _InterlockedXor8_np(char volatile *_Value, char _Mask);
+static __inline__
+unsigned __int64 _lzcnt_u64(unsigned __int64);
+__int64 _mul128(__int64 _Multiplier, __int64 _Multiplicand,
+                __int64 *_HighProduct);
+unsigned int __cdecl _readfsbase_u32(void);
+unsigned __int64 __cdecl _readfsbase_u64(void);
+unsigned int __cdecl _readgsbase_u32(void);
+unsigned __int64 __cdecl _readgsbase_u64(void);
+unsigned __int64 _rorx_u64(unsigned __int64, const unsigned int);
+__int64 _sarx_i64(__int64, unsigned int);
+#if __STDC_HOSTED__
+int __cdecl _setjmpex(jmp_buf);
+#endif
+unsigned __int64 _shlx_u64(unsigned __int64, unsigned int);
+unsigned __int64 _shrx_u64(unsigned __int64, unsigned int);
+unsigned __int64 _tzcnt_u64(unsigned __int64);
+unsigned __int64 _tzmsk_u64(unsigned __int64);
+unsigned __int64 _umul128(unsigned __int64 _Multiplier,
+                          unsigned __int64 _Multiplicand,
+                          unsigned __int64 *_HighProduct);
+void __cdecl _writefsbase_u32(unsigned int);
+void __cdecl _writefsbase_u64(unsigned __int64);
+void __cdecl _writegsbase_u32(unsigned int);
+void __cdecl _writegsbase_u64(unsigned __int64);
+void __cdecl _xrstor64(void const *, unsigned __int64);
+void __cdecl _xsave64(void *, unsigned __int64);
+void __cdecl _xsaveopt64(void *, unsigned __int64);
+
+#endif /* __x86_64__ */
+
+/*----------------------------------------------------------------------------*\
+|* Bit Twiddling
+\*----------------------------------------------------------------------------*/
+static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
+_rotl8(unsigned char _Value, unsigned char _Shift) {
+  _Shift &= 0x7;
+  return _Shift ? (_Value << _Shift) | (_Value >> (8 - _Shift)) : _Value;
+}
+static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
+_rotr8(unsigned char _Value, unsigned char _Shift) {
+  _Shift &= 0x7;
+  return _Shift ? (_Value >> _Shift) | (_Value << (8 - _Shift)) : _Value;
+}
+static __inline__ unsigned short __attribute__((__always_inline__, __nodebug__))
+_rotl16(unsigned short _Value, unsigned char _Shift) {
+  _Shift &= 0xf;
+  return _Shift ? (_Value << _Shift) | (_Value >> (16 - _Shift)) : _Value;
+}
+static __inline__ unsigned short __attribute__((__always_inline__, __nodebug__))
+_rotr16(unsigned short _Value, unsigned char _Shift) {
+  _Shift &= 0xf;
+  return _Shift ? (_Value >> _Shift) | (_Value << (16 - _Shift)) : _Value;
+}
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+_rotl(unsigned int _Value, int _Shift) {
+  _Shift &= 0x1f;
+  return _Shift ? (_Value << _Shift) | (_Value >> (32 - _Shift)) : _Value;
+}
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+_rotr(unsigned int _Value, int _Shift) {
+  _Shift &= 0x1f;
+  return _Shift ? (_Value >> _Shift) | (_Value << (32 - _Shift)) : _Value;
+}
+static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
+_lrotl(unsigned long _Value, int _Shift) {
+  _Shift &= 0x1f;
+  return _Shift ? (_Value << _Shift) | (_Value >> (32 - _Shift)) : _Value;
+}
+static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
+_lrotr(unsigned long _Value, int _Shift) {
+  _Shift &= 0x1f;
+  return _Shift ? (_Value >> _Shift) | (_Value << (32 - _Shift)) : _Value;
+}
+static
+__inline__ unsigned __int64 __attribute__((__always_inline__, __nodebug__))
+_rotl64(unsigned __int64 _Value, int _Shift) {
+  _Shift &= 0x3f;
+  return _Shift ? (_Value << _Shift) | (_Value >> (64 - _Shift)) : _Value;
+}
+static
+__inline__ unsigned __int64 __attribute__((__always_inline__, __nodebug__))
+_rotr64(unsigned __int64 _Value, int _Shift) {
+  _Shift &= 0x3f;
+  return _Shift ? (_Value >> _Shift) | (_Value << (64 - _Shift)) : _Value;
+}
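+/*
+ * Illustrative note (editorial, not part of the original header): each rotate
+ * above masks the shift count to the operand width and then ORs the two
+ * shifted halves together. For example, with the definitions above:
+ *
+ *   unsigned char r = _rotl8(0x96, 4);   // 1001 0110 -> 0110 1001 == 0x69
+ *   unsigned char s = _rotr8(0x69, 4);   // rotating back yields 0x96 again
+ */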
+/*----------------------------------------------------------------------------*\
+|* Bit Counting and Testing
+\*----------------------------------------------------------------------------*/
+static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
+_BitScanForward(unsigned long *_Index, unsigned long _Mask) {
+  if (!_Mask)
+    return 0;
+  *_Index = __builtin_ctzl(_Mask);
+  return 1;
+}
+static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
+_BitScanReverse(unsigned long *_Index, unsigned long _Mask) {
+  if (!_Mask)
+    return 0;
+  *_Index = 31 - __builtin_clzl(_Mask);
+  return 1;
+}
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+_lzcnt_u32(unsigned int a) {
+  if (!a)
+    return 32;
+  return __builtin_clzl(a);
+}
+static __inline__ unsigned short __attribute__((__always_inline__, __nodebug__))
+__popcnt16(unsigned short value) {
+  return __builtin_popcount((int)value);
+}
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+__popcnt(unsigned int value) {
+  return __builtin_popcount(value);
+}
+static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
+_bittest(long const *a, long b) {
+  return (*a >> b) & 1;
+}
+static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
+_bittestandcomplement(long *a, long b) {
+  unsigned char x = (*a >> b) & 1;
+  *a = *a ^ (1 << b);
+  return x;
+}
+static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
+_bittestandreset(long *a, long b) {
+  unsigned char x = (*a >> b) & 1;
+  *a = *a & ~(1 << b);
+  return x;
+}
+static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
+_bittestandset(long *a, long b) {
+  unsigned char x = (*a >> b) & 1;
+  *a = *a | (1 << b);
+  return x;
+}
+#if defined(__i386__) || defined(__x86_64__)
+static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
+_interlockedbittestandset(long volatile *__BitBase, long __BitPos) {
+  unsigned char __Res;
+  __asm__ ("xor %0, %0\n"
+           "lock bts %2, %1\n"
+           "setc %0\n"
+           : "=r" (__Res), "+m"(*__BitBase)
+           : "Ir"(__BitPos));
+  return __Res;
+}
+#endif
+#ifdef __x86_64__
+static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
+_BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask) {
+  if (!_Mask)
+    return 0;
+  *_Index = __builtin_ctzll(_Mask);
+  return 1;
+}
+static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
+_BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask) {
+  if (!_Mask)
+    return 0;
+  *_Index = 63 - __builtin_clzll(_Mask);
+  return 1;
+}
+static
+__inline__ unsigned __int64 __attribute__((__always_inline__, __nodebug__))
+_lzcnt_u64(unsigned __int64 a) {
+  if (!a)
+    return 64;
+  return __builtin_clzll(a);
+}
+static __inline__
+unsigned __int64 __attribute__((__always_inline__, __nodebug__))
+__popcnt64(unsigned __int64 value) {
+  return __builtin_popcountll(value);
+}
+static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
+_bittest64(__int64 const *a, __int64 b) {
+  return (*a >> b) & 1;
+}
+static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
+_bittestandcomplement64(__int64 *a, __int64 b) {
+  unsigned char x = (*a >> b) & 1;
+  *a = *a ^ (1ll << b);
+  return x;
+}
+static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
+_bittestandreset64(__int64 *a, __int64 b) {
+  unsigned char x = (*a >> b) & 1;
+  *a = *a & ~(1ll << b);
+  return x;
+}
+static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
+_bittestandset64(__int64 *a, __int64 b) {
+  unsigned char x = (*a >> b) & 1;
+  *a = *a | (1ll << b);
+  return x;
+}
+static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
+_interlockedbittestandset64(__int64 volatile *__BitBase, __int64 __BitPos) {
+  unsigned char __Res;
+  __asm__ ("xor %0, %0\n"
+           "lock bts %2, %1\n"
+           "setc %0\n"
+           : "=r" (__Res), "+m"(*__BitBase)
+           : "Ir"(__BitPos));
+  return __Res;
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Exchange Add
+\*----------------------------------------------------------------------------*/
+static __inline__ char __attribute__((__always_inline__, __nodebug__))
+_InterlockedExchangeAdd8(char volatile *_Addend, char _Value) {
+  return __atomic_add_fetch(_Addend, _Value, 0) - _Value;
+}
+static __inline__ short __attribute__((__always_inline__, __nodebug__))
+_InterlockedExchangeAdd16(short volatile *_Addend, short _Value) {
+  return __atomic_add_fetch(_Addend, _Value, 0) - _Value;
+}
+#ifdef __x86_64__
+static __inline__ __int64 __attribute__((__always_inline__, __nodebug__))
+_InterlockedExchangeAdd64(__int64 volatile *_Addend, __int64 _Value) {
+  return __atomic_add_fetch(_Addend, _Value, 0) - _Value;
+}
+#endif
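+/*
+ * Note (editorial, not from the original header): __atomic_add_fetch returns
+ * the value *after* the addition, so subtracting _Value above recovers the
+ * value the addend held before the operation, which is what the MSVC
+ * _InterlockedExchangeAdd family returns. A minimal sketch:
+ *
+ *   short counter = 5;
+ *   short old = _InterlockedExchangeAdd16(&counter, 3); // old == 5, counter == 8
+ */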
+/*----------------------------------------------------------------------------*\
+|* Interlocked Exchange Sub
+\*----------------------------------------------------------------------------*/
+static __inline__ char __attribute__((__always_inline__, __nodebug__))
+_InterlockedExchangeSub8(char volatile *_Subend, char _Value) {
+  return __atomic_sub_fetch(_Subend, _Value, 0) + _Value;
+}
+static __inline__ short __attribute__((__always_inline__, __nodebug__))
+_InterlockedExchangeSub16(short volatile *_Subend, short _Value) {
+  return __atomic_sub_fetch(_Subend, _Value, 0) + _Value;
+}
+static __inline__ long __attribute__((__always_inline__, __nodebug__))
+_InterlockedExchangeSub(long volatile *_Subend, long _Value) {
+  return __atomic_sub_fetch(_Subend, _Value, 0) + _Value;
+}
+#ifdef __x86_64__
+static __inline__ __int64 __attribute__((__always_inline__, __nodebug__))
+_InterlockedExchangeSub64(__int64 volatile *_Subend, __int64 _Value) {
+  return __atomic_sub_fetch(_Subend, _Value, 0) + _Value;
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Increment
+\*----------------------------------------------------------------------------*/
+static __inline__ short __attribute__((__always_inline__, __nodebug__))
+_InterlockedIncrement16(short volatile *_Value) {
+  return __atomic_add_fetch(_Value, 1, 0);
+}
+#ifdef __x86_64__
+static __inline__ __int64 __attribute__((__always_inline__, __nodebug__))
+_InterlockedIncrement64(__int64 volatile *_Value) {
+  return __atomic_add_fetch(_Value, 1, 0);
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Decrement
+\*----------------------------------------------------------------------------*/
+static __inline__ short __attribute__((__always_inline__, __nodebug__))
+_InterlockedDecrement16(short volatile *_Value) {
+  return __atomic_sub_fetch(_Value, 1, 0);
+}
+#ifdef __x86_64__
+static __inline__ __int64 __attribute__((__always_inline__, __nodebug__))
+_InterlockedDecrement64(__int64 volatile *_Value) {
+  return __atomic_sub_fetch(_Value, 1, 0);
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked And
+\*----------------------------------------------------------------------------*/
+static __inline__ char __attribute__((__always_inline__, __nodebug__))
+_InterlockedAnd8(char volatile *_Value, char _Mask) {
+  return __atomic_and_fetch(_Value, _Mask, 0);
+}
+static __inline__ short __attribute__((__always_inline__, __nodebug__))
+_InterlockedAnd16(short volatile *_Value, short _Mask) {
+  return __atomic_and_fetch(_Value, _Mask, 0);
+}
+static __inline__ long __attribute__((__always_inline__, __nodebug__))
+_InterlockedAnd(long volatile *_Value, long _Mask) {
+  return __atomic_and_fetch(_Value, _Mask, 0);
+}
+#ifdef __x86_64__
+static __inline__ __int64 __attribute__((__always_inline__, __nodebug__))
+_InterlockedAnd64(__int64 volatile *_Value, __int64 _Mask) {
+  return __atomic_and_fetch(_Value, _Mask, 0);
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Or
+\*----------------------------------------------------------------------------*/
+static __inline__ char __attribute__((__always_inline__, __nodebug__))
+_InterlockedOr8(char volatile *_Value, char _Mask) {
+  return __atomic_or_fetch(_Value, _Mask, 0);
+}
+static __inline__ short __attribute__((__always_inline__, __nodebug__))
+_InterlockedOr16(short volatile *_Value, short _Mask) {
+  return __atomic_or_fetch(_Value, _Mask, 0);
+}
+static __inline__ long __attribute__((__always_inline__, __nodebug__))
+_InterlockedOr(long volatile *_Value, long _Mask) {
+  return __atomic_or_fetch(_Value, _Mask, 0);
+}
+#ifdef __x86_64__
+static __inline__ __int64 __attribute__((__always_inline__, __nodebug__))
+_InterlockedOr64(__int64 volatile *_Value, __int64 _Mask) {
+  return __atomic_or_fetch(_Value, _Mask, 0);
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Xor
+\*----------------------------------------------------------------------------*/
+static __inline__ char __attribute__((__always_inline__, __nodebug__))
+_InterlockedXor8(char volatile *_Value, char _Mask) {
+  return __atomic_xor_fetch(_Value, _Mask, 0);
+}
+static __inline__ short __attribute__((__always_inline__, __nodebug__))
+_InterlockedXor16(short volatile *_Value, short _Mask) {
+  return __atomic_xor_fetch(_Value, _Mask, 0);
+}
+static __inline__ long __attribute__((__always_inline__, __nodebug__))
+_InterlockedXor(long volatile *_Value, long _Mask) {
+  return __atomic_xor_fetch(_Value, _Mask, 0);
+}
+#ifdef __x86_64__
+static __inline__ __int64 __attribute__((__always_inline__, __nodebug__))
+_InterlockedXor64(__int64 volatile *_Value, __int64 _Mask) {
+  return __atomic_xor_fetch(_Value, _Mask, 0);
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Exchange
+\*----------------------------------------------------------------------------*/
+static __inline__ char __attribute__((__always_inline__, __nodebug__))
+_InterlockedExchange8(char volatile *_Target, char _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, 0);
+  return _Value;
+}
+static __inline__ short __attribute__((__always_inline__, __nodebug__))
+_InterlockedExchange16(short volatile *_Target, short _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, 0);
+  return _Value;
+}
+#ifdef __x86_64__
+static __inline__ __int64 __attribute__((__always_inline__, __nodebug__))
+_InterlockedExchange64(__int64 volatile *_Target, __int64 _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, 0);
+  return _Value;
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Compare Exchange
+\*----------------------------------------------------------------------------*/
+static __inline__ char __attribute__((__always_inline__, __nodebug__))
+_InterlockedCompareExchange8(char volatile *_Destination,
+                             char _Exchange, char _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0, 0, 0);
+  return _Comparand;
+}
+static __inline__ short __attribute__((__always_inline__, __nodebug__))
+_InterlockedCompareExchange16(short volatile *_Destination,
+                              short _Exchange, short _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0, 0, 0);
+  return _Comparand;
+}
+static __inline__ __int64 __attribute__((__always_inline__, __nodebug__))
+_InterlockedCompareExchange64(__int64 volatile *_Destination,
+                              __int64 _Exchange, __int64 _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0, 0, 0);
+  return _Comparand;
+}
+/*----------------------------------------------------------------------------*\
+|* Barriers
+\*----------------------------------------------------------------------------*/
+#if defined(__i386__) || defined(__x86_64__)
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+__attribute__((deprecated("use other intrinsics or C++11 atomics instead")))
+_ReadWriteBarrier(void) {
+  __asm__ volatile ("" : : : "memory");
+}
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+__attribute__((deprecated("use other intrinsics or C++11 atomics instead")))
+_ReadBarrier(void) {
+  __asm__ volatile ("" : : : "memory");
+}
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+__attribute__((deprecated("use other intrinsics or C++11 atomics instead")))
+_WriteBarrier(void) {
+  __asm__ volatile ("" : : : "memory");
+}
+#endif
+#ifdef __x86_64__
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+__faststorefence(void) {
+  __asm__ volatile("lock orq $0, (%%rsp)" : : : "memory");
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* readfs, readgs
+|* (Pointers in address space #256 and #257 are relative to the GS and FS
+|* segment registers, respectively.)
+\*----------------------------------------------------------------------------*/
+#define __ptr_to_addr_space(__addr_space_nbr, __type, __offset)              \
+    ((volatile __type __attribute__((__address_space__(__addr_space_nbr)))*) \
+    (__offset))
+
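+/*
+ * Illustrative sketch (editorial addition): the macro above turns an offset
+ * into a pointer qualified with a Clang address space, so a plain dereference
+ * is emitted as a segment-relative load. For instance, on x86-64 Windows the
+ * 64-bit value at gs:[0x30] (the TEB self pointer) could be read as:
+ *
+ *   unsigned __int64 __teb = __readgsqword(0x30);
+ */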
+#ifdef __i386__
+static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
+__readfsbyte(unsigned long __offset) {
+  return *__ptr_to_addr_space(257, unsigned char, __offset);
+}
+static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
+__readfsdword(unsigned long __offset) {
+  return *__ptr_to_addr_space(257, unsigned long, __offset);
+}
+static __inline__ unsigned __int64 __attribute__((__always_inline__, __nodebug__))
+__readfsqword(unsigned long __offset) {
+  return *__ptr_to_addr_space(257, unsigned __int64, __offset);
+}
+static __inline__ unsigned short __attribute__((__always_inline__, __nodebug__))
+__readfsword(unsigned long __offset) {
+  return *__ptr_to_addr_space(257, unsigned short, __offset);
+}
+#endif
+#ifdef __x86_64__
+static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
+__readgsbyte(unsigned long __offset) {
+  return *__ptr_to_addr_space(256, unsigned char, __offset);
+}
+static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
+__readgsdword(unsigned long __offset) {
+  return *__ptr_to_addr_space(256, unsigned long, __offset);
+}
+static __inline__ unsigned __int64 __attribute__((__always_inline__, __nodebug__))
+__readgsqword(unsigned long __offset) {
+  return *__ptr_to_addr_space(256, unsigned __int64, __offset);
+}
+static __inline__ unsigned short __attribute__((__always_inline__, __nodebug__))
+__readgsword(unsigned long __offset) {
+  return *__ptr_to_addr_space(256, unsigned short, __offset);
+}
+#endif
+#undef __ptr_to_addr_space
+/*----------------------------------------------------------------------------*\
+|* movs, stos
+\*----------------------------------------------------------------------------*/
+#if defined(__i386__) || defined(__x86_64__)
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+__movsb(unsigned char *__dst, unsigned char const *__src, size_t __n) {
+  __asm__("rep movsb" : : "D"(__dst), "S"(__src), "c"(__n)
+                        : "%edi", "%esi", "%ecx");
+}
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+__movsd(unsigned long *__dst, unsigned long const *__src, size_t __n) {
+  __asm__("rep movsl" : : "D"(__dst), "S"(__src), "c"(__n)
+                        : "%edi", "%esi", "%ecx");
+}
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+__movsw(unsigned short *__dst, unsigned short const *__src, size_t __n) {
+  __asm__("rep movsh" : : "D"(__dst), "S"(__src), "c"(__n)
+                        : "%edi", "%esi", "%ecx");
+}
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+__stosb(unsigned char *__dst, unsigned char __x, size_t __n) {
+  __asm__("rep stosb" : : "D"(__dst), "a"(__x), "c"(__n)
+                        : "%edi", "%ecx");
+}
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+__stosd(unsigned long *__dst, unsigned long __x, size_t __n) {
+  __asm__("rep stosl" : : "D"(__dst), "a"(__x), "c"(__n)
+                        : "%edi", "%ecx");
+}
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+__stosw(unsigned short *__dst, unsigned short __x, size_t __n) {
+  __asm__("rep stosh" : : "D"(__dst), "a"(__x), "c"(__n)
+                        : "%edi", "%ecx");
+}
+#endif
+#ifdef __x86_64__
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+__movsq(unsigned long long *__dst, unsigned long long const *__src, size_t __n) {
+  __asm__("rep movsq" : : "D"(__dst), "S"(__src), "c"(__n)
+                        : "%edi", "%esi", "%ecx");
+}
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+__stosq(unsigned __int64 *__dst, unsigned __int64 __x, size_t __n) {
+  __asm__("rep stosq" : : "D"(__dst), "a"(__x), "c"(__n)
+                        : "%edi", "%ecx");
+}
+#endif
+
+/*----------------------------------------------------------------------------*\
+|* Misc
+\*----------------------------------------------------------------------------*/
+static __inline__ void * __attribute__((__always_inline__, __nodebug__))
+_AddressOfReturnAddress(void) {
+  return (void*)((char*)__builtin_frame_address(0) + sizeof(void*));
+}
+static __inline__ void * __attribute__((__always_inline__, __nodebug__))
+_ReturnAddress(void) {
+  return __builtin_return_address(0);
+}
+#if defined(__i386__) || defined(__x86_64__)
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+__cpuid(int __info[4], int __level) {
+  __asm__ ("cpuid" : "=a"(__info[0]), "=b" (__info[1]), "=c"(__info[2]), "=d"(__info[3])
+                   : "a"(__level));
+}
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+__cpuidex(int __info[4], int __level, int __ecx) {
+  __asm__ ("cpuid" : "=a"(__info[0]), "=b" (__info[1]), "=c"(__info[2]), "=d"(__info[3])
+                   : "a"(__level), "c"(__ecx));
+}
+static __inline__ unsigned __int64 __cdecl __attribute__((__always_inline__, __nodebug__))
+_xgetbv(unsigned int __xcr_no) {
+  unsigned int __eax, __edx;
+  __asm__ ("xgetbv" : "=a" (__eax), "=d" (__edx) : "c" (__xcr_no));
+  return ((unsigned __int64)__edx << 32) | __eax;
+}
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+__halt(void) {
+  __asm__ volatile ("hlt");
+}
+#endif
+
+/*----------------------------------------------------------------------------*\
+|* Privileged intrinsics
+\*----------------------------------------------------------------------------*/
+#if defined(__i386__) || defined(__x86_64__)
+static __inline__ unsigned __int64 __attribute__((__always_inline__, __nodebug__))
+__readmsr(unsigned long __register) {
+  // Loads the contents of a 64-bit model specific register (MSR) specified in
+  // the ECX register into registers EDX:EAX. The EDX register is loaded with
+  // the high-order 32 bits of the MSR and the EAX register is loaded with the
+  // low-order 32 bits. If less than 64 bits are implemented in the MSR being
+  // read, the values returned to EDX:EAX in unimplemented bit locations are
+  // undefined.
+  unsigned long __edx;
+  unsigned long __eax;
+  __asm__ ("rdmsr" : "=d"(__edx), "=a"(__eax) : "c"(__register));
+  return (((unsigned __int64)__edx) << 32) | (unsigned __int64)__eax;
+}
+
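+/*
+ * Usage sketch (editorial addition, assumes ring-0 execution): reading the
+ * IA32 time-stamp counter MSR (index 0x10) through the wrapper above takes
+ * the MSR index in ECX and returns the combined EDX:EAX value:
+ *
+ *   unsigned __int64 __tsc = __readmsr(0x10);
+ */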
+static __inline__ unsigned long __attribute__((always_inline, __nodebug__))
+__readcr3(void) {
+  unsigned long __cr3_val;
+  __asm__ __volatile__ ("mov %%cr3, %0" : "=q"(__cr3_val) : : "memory");
+  return __cr3_val;
+}
+
+static __inline__ void __attribute__((always_inline, __nodebug__))
+__writecr3(unsigned int __cr3_val) {
+  __asm__ ("mov %0, %%cr3" : : "q"(__cr3_val) : "memory");
+}
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __INTRIN_H */
+#endif /* _MSC_VER */
diff --git a/22.0.1/clang-include/LICENSE.TXT b/22.0.1/clang-include/LICENSE.TXT
new file mode 100644
index 0000000..3b1153d
--- /dev/null
+++ b/22.0.1/clang-include/LICENSE.TXT
@@ -0,0 +1,63 @@
+==============================================================================
+LLVM Release License
+==============================================================================
+University of Illinois/NCSA
+Open Source License
+
+Copyright (c) 2007-2014 University of Illinois at Urbana-Champaign.
+All rights reserved.
+
+Developed by:
+
+    LLVM Team
+
+    University of Illinois at Urbana-Champaign
+
+    http://llvm.org
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal with
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimers.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+      this list of conditions and the following disclaimers in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the names of the LLVM Team, University of Illinois at
+      Urbana-Champaign, nor the names of its contributors may be used to
+      endorse or promote products derived from this Software without specific
+      prior written permission.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+SOFTWARE.
+
+==============================================================================
+The LLVM software contains code written by third parties.  Such software will
+have its own individual LICENSE.TXT file in the directory in which it appears.
+This file will describe the copyrights, license, and restrictions which apply
+to that code.
+
+The disclaimer of warranty in the University of Illinois Open Source License
+applies to all code in the LLVM Distribution, and nothing in any of the
+other licenses gives permission to use the names of the LLVM Team or the
+University of Illinois to endorse or promote products derived from this
+Software.
+
+The following pieces of software have additional or alternate copyrights,
+licenses, and/or restrictions:
+
+Program             Directory
+-------             ---------
+<none yet>
+
diff --git a/22.0.1/clang-include/altivec.h b/22.0.1/clang-include/altivec.h
new file mode 100644
index 0000000..7a4a774
--- /dev/null
+++ b/22.0.1/clang-include/altivec.h
@@ -0,0 +1,12389 @@
+/*===---- altivec.h - Standard header for AltiVec/VMX intrinsics ----------===*\
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+\*===----------------------------------------------------------------------===*/
+
+#ifndef __ALTIVEC_H
+#define __ALTIVEC_H
+
+#ifndef __ALTIVEC__
+#error "AltiVec support not enabled"
+#endif
+
+/* constants for mapping CR6 bits to predicate result. */
+
+#define __CR6_EQ     0
+#define __CR6_EQ_REV 1
+#define __CR6_LT     2
+#define __CR6_LT_REV 3
+
+#define __ATTRS_o_ai __attribute__((__overloadable__, __always_inline__))
+
+static vector signed char __ATTRS_o_ai
+vec_perm(vector signed char __a, vector signed char __b, vector unsigned char __c);
+
+static vector unsigned char __ATTRS_o_ai
+vec_perm(vector unsigned char __a,
+         vector unsigned char __b,
+         vector unsigned char __c);
+
+static vector bool char __ATTRS_o_ai
+vec_perm(vector bool char __a, vector bool char __b, vector unsigned char __c);
+
+static vector short __ATTRS_o_ai
+vec_perm(vector short __a, vector short __b, vector unsigned char __c);
+
+static vector unsigned short __ATTRS_o_ai
+vec_perm(vector unsigned short __a,
+         vector unsigned short __b,
+         vector unsigned char __c);
+
+static vector bool short __ATTRS_o_ai
+vec_perm(vector bool short __a, vector bool short __b, vector unsigned char __c);
+
+static vector pixel __ATTRS_o_ai
+vec_perm(vector pixel __a, vector pixel __b, vector unsigned char __c);
+
+static vector int __ATTRS_o_ai
+vec_perm(vector int __a, vector int __b, vector unsigned char __c);
+
+static vector unsigned int __ATTRS_o_ai
+vec_perm(vector unsigned int __a, vector unsigned int __b, vector unsigned char __c);
+
+static vector bool int __ATTRS_o_ai
+vec_perm(vector bool int __a, vector bool int __b, vector unsigned char __c);
+
+static vector float __ATTRS_o_ai
+vec_perm(vector float __a, vector float __b, vector unsigned char __c);
+
+static vector unsigned char __ATTRS_o_ai
+vec_xor(vector unsigned char __a, vector unsigned char __b);
+
+/* vec_abs */
+
+#define __builtin_altivec_abs_v16qi vec_abs
+#define __builtin_altivec_abs_v8hi  vec_abs
+#define __builtin_altivec_abs_v4si  vec_abs
+
+static vector signed char __ATTRS_o_ai
+vec_abs(vector signed char __a)
+{
+  return __builtin_altivec_vmaxsb(__a, -__a);
+}
+
+static vector signed short __ATTRS_o_ai
+vec_abs(vector signed short __a)
+{
+  return __builtin_altivec_vmaxsh(__a, -__a);
+}
+
+static vector signed int __ATTRS_o_ai
+vec_abs(vector signed int __a)
+{
+  return __builtin_altivec_vmaxsw(__a, -__a);
+}
+
+static vector float __ATTRS_o_ai
+vec_abs(vector float __a)
+{
+  vector unsigned int __res = (vector unsigned int)__a
+                            & (vector unsigned int)(0x7FFFFFFF);
+  return (vector float)__res;
+}
+
+/* vec_abss */
+
+#define __builtin_altivec_abss_v16qi vec_abss
+#define __builtin_altivec_abss_v8hi  vec_abss
+#define __builtin_altivec_abss_v4si  vec_abss
+
+static vector signed char __ATTRS_o_ai
+vec_abss(vector signed char __a)
+{
+  return __builtin_altivec_vmaxsb
+           (__a, __builtin_altivec_vsubsbs((vector signed char)(0), __a));
+}
+
+static vector signed short __ATTRS_o_ai
+vec_abss(vector signed short __a)
+{
+  return __builtin_altivec_vmaxsh
+           (__a, __builtin_altivec_vsubshs((vector signed short)(0), __a));
+}
+
+static vector signed int __ATTRS_o_ai
+vec_abss(vector signed int __a)
+{
+  return __builtin_altivec_vmaxsw
+           (__a, __builtin_altivec_vsubsws((vector signed int)(0), __a));
+}
+
+/* vec_add */
+
+static vector signed char __ATTRS_o_ai
+vec_add(vector signed char __a, vector signed char __b)
+{
+  return __a + __b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_add(vector bool char __a, vector signed char __b)
+{
+  return (vector signed char)__a + __b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_add(vector signed char __a, vector bool char __b)
+{
+  return __a + (vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_add(vector unsigned char __a, vector unsigned char __b)
+{
+  return __a + __b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_add(vector bool char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)__a + __b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_add(vector unsigned char __a, vector bool char __b)
+{
+  return __a + (vector unsigned char)__b;
+}
+
+static vector short __ATTRS_o_ai
+vec_add(vector short __a, vector short __b)
+{
+  return __a + __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_add(vector bool short __a, vector short __b)
+{
+  return (vector short)__a + __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_add(vector short __a, vector bool short __b)
+{
+  return __a + (vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_add(vector unsigned short __a, vector unsigned short __b)
+{
+  return __a + __b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_add(vector bool short __a, vector unsigned short __b)
+{
+  return (vector unsigned short)__a + __b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_add(vector unsigned short __a, vector bool short __b)
+{
+  return __a + (vector unsigned short)__b;
+}
+
+static vector int __ATTRS_o_ai
+vec_add(vector int __a, vector int __b)
+{
+  return __a + __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_add(vector bool int __a, vector int __b)
+{
+  return (vector int)__a + __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_add(vector int __a, vector bool int __b)
+{
+  return __a + (vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_add(vector unsigned int __a, vector unsigned int __b)
+{
+  return __a + __b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_add(vector bool int __a, vector unsigned int __b)
+{
+  return (vector unsigned int)__a + __b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_add(vector unsigned int __a, vector bool int __b)
+{
+  return __a + (vector unsigned int)__b;
+}
+
+static vector float __ATTRS_o_ai
+vec_add(vector float __a, vector float __b)
+{
+  return __a + __b;
+}
+
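+/*
+ * Illustrative example (editorial, not part of the original header): the
+ * vec_add overloads above all reduce to element-wise addition, e.g.
+ *
+ *   vector signed int __x = (vector signed int)(1, 2, 3, 4);
+ *   vector signed int __y = (vector signed int)(10, 20, 30, 40);
+ *   vector signed int __z = vec_add(__x, __y);   // (11, 22, 33, 44)
+ */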
+/* vec_vaddubm */
+
+#define __builtin_altivec_vaddubm vec_vaddubm
+
+static vector signed char __ATTRS_o_ai
+vec_vaddubm(vector signed char __a, vector signed char __b)
+{
+  return __a + __b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vaddubm(vector bool char __a, vector signed char __b)
+{
+  return (vector signed char)__a + __b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vaddubm(vector signed char __a, vector bool char __b)
+{
+  return __a + (vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vaddubm(vector unsigned char __a, vector unsigned char __b)
+{
+  return __a + __b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vaddubm(vector bool char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)__a + __b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vaddubm(vector unsigned char __a, vector bool char __b)
+{
+  return __a + (vector unsigned char)__b;
+}
+
+/* vec_vadduhm */
+
+#define __builtin_altivec_vadduhm vec_vadduhm
+
+static vector short __ATTRS_o_ai
+vec_vadduhm(vector short __a, vector short __b)
+{
+  return __a + __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_vadduhm(vector bool short __a, vector short __b)
+{
+  return (vector short)__a + __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_vadduhm(vector short __a, vector bool short __b)
+{
+  return __a + (vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vadduhm(vector unsigned short __a, vector unsigned short __b)
+{
+  return __a + __b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vadduhm(vector bool short __a, vector unsigned short __b)
+{
+  return (vector unsigned short)__a + __b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vadduhm(vector unsigned short __a, vector bool short __b)
+{
+  return __a + (vector unsigned short)__b;
+}
+
+/* vec_vadduwm */
+
+#define __builtin_altivec_vadduwm vec_vadduwm
+
+static vector int __ATTRS_o_ai
+vec_vadduwm(vector int __a, vector int __b)
+{
+  return __a + __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_vadduwm(vector bool int __a, vector int __b)
+{
+  return (vector int)__a + __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_vadduwm(vector int __a, vector bool int __b)
+{
+  return __a + (vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vadduwm(vector unsigned int __a, vector unsigned int __b)
+{
+  return __a + __b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vadduwm(vector bool int __a, vector unsigned int __b)
+{
+  return (vector unsigned int)__a + __b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vadduwm(vector unsigned int __a, vector bool int __b)
+{
+  return __a + (vector unsigned int)__b;
+}
+
+/* vec_vaddfp */
+
+#define __builtin_altivec_vaddfp  vec_vaddfp
+
+static vector float __attribute__((__always_inline__))
+vec_vaddfp(vector float __a, vector float __b)
+{
+  return __a + __b;
+}
+
+/* vec_addc */
+
+static vector unsigned int __attribute__((__always_inline__))
+vec_addc(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vaddcuw(__a, __b);
+}
+
+/* vec_vaddcuw */
+
+static vector unsigned int __attribute__((__always_inline__))
+vec_vaddcuw(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vaddcuw(__a, __b);
+}
+
+/* vec_adds */
+
+static vector signed char __ATTRS_o_ai
+vec_adds(vector signed char __a, vector signed char __b)
+{
+  return __builtin_altivec_vaddsbs(__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_adds(vector bool char __a, vector signed char __b)
+{
+  return __builtin_altivec_vaddsbs((vector signed char)__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_adds(vector signed char __a, vector bool char __b)
+{
+  return __builtin_altivec_vaddsbs(__a, (vector signed char)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_adds(vector unsigned char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vaddubs(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_adds(vector bool char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vaddubs((vector unsigned char)__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_adds(vector unsigned char __a, vector bool char __b)
+{
+  return __builtin_altivec_vaddubs(__a, (vector unsigned char)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_adds(vector short __a, vector short __b)
+{
+  return __builtin_altivec_vaddshs(__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_adds(vector bool short __a, vector short __b)
+{
+  return __builtin_altivec_vaddshs((vector short)__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_adds(vector short __a, vector bool short __b)
+{
+  return __builtin_altivec_vaddshs(__a, (vector short)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_adds(vector unsigned short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vadduhs(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_adds(vector bool short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vadduhs((vector unsigned short)__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_adds(vector unsigned short __a, vector bool short __b)
+{
+  return __builtin_altivec_vadduhs(__a, (vector unsigned short)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_adds(vector int __a, vector int __b)
+{
+  return __builtin_altivec_vaddsws(__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_adds(vector bool int __a, vector int __b)
+{
+  return __builtin_altivec_vaddsws((vector int)__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_adds(vector int __a, vector bool int __b)
+{
+  return __builtin_altivec_vaddsws(__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_adds(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vadduws(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_adds(vector bool int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vadduws((vector unsigned int)__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_adds(vector unsigned int __a, vector bool int __b)
+{
+  return __builtin_altivec_vadduws(__a, (vector unsigned int)__b);
+}
+
+/* vec_vaddsbs */
+
+static vector signed char __ATTRS_o_ai
+vec_vaddsbs(vector signed char __a, vector signed char __b)
+{
+  return __builtin_altivec_vaddsbs(__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vaddsbs(vector bool char __a, vector signed char __b)
+{
+  return __builtin_altivec_vaddsbs((vector signed char)__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vaddsbs(vector signed char __a, vector bool char __b)
+{
+  return __builtin_altivec_vaddsbs(__a, (vector signed char)__b);
+}
+
+/* vec_vaddubs */
+
+static vector unsigned char __ATTRS_o_ai
+vec_vaddubs(vector unsigned char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vaddubs(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vaddubs(vector bool char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vaddubs((vector unsigned char)__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vaddubs(vector unsigned char __a, vector bool char __b)
+{
+  return __builtin_altivec_vaddubs(__a, (vector unsigned char)__b);
+}
+
+/* vec_vaddshs */
+
+static vector short __ATTRS_o_ai
+vec_vaddshs(vector short __a, vector short __b)
+{
+  return __builtin_altivec_vaddshs(__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_vaddshs(vector bool short __a, vector short __b)
+{
+  return __builtin_altivec_vaddshs((vector short)__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_vaddshs(vector short __a, vector bool short __b)
+{
+  return __builtin_altivec_vaddshs(__a, (vector short)__b);
+}
+
+/* vec_vadduhs */
+
+static vector unsigned short __ATTRS_o_ai
+vec_vadduhs(vector unsigned short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vadduhs(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vadduhs(vector bool short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vadduhs((vector unsigned short)__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vadduhs(vector unsigned short __a, vector bool short __b)
+{
+  return __builtin_altivec_vadduhs(__a, (vector unsigned short)__b);
+}
+
+/* vec_vaddsws */
+
+static vector int __ATTRS_o_ai
+vec_vaddsws(vector int __a, vector int __b)
+{
+  return __builtin_altivec_vaddsws(__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_vaddsws(vector bool int __a, vector int __b)
+{
+  return __builtin_altivec_vaddsws((vector int)__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_vaddsws(vector int __a, vector bool int __b)
+{
+  return __builtin_altivec_vaddsws(__a, (vector int)__b);
+}
+
+/* vec_vadduws */
+
+static vector unsigned int __ATTRS_o_ai
+vec_vadduws(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vadduws(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vadduws(vector bool int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vadduws((vector unsigned int)__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vadduws(vector unsigned int __a, vector bool int __b)
+{
+  return __builtin_altivec_vadduws(__a, (vector unsigned int)__b);
+}
+
+/* vec_and */
+
+#define __builtin_altivec_vand vec_and
+
+static vector signed char __ATTRS_o_ai
+vec_and(vector signed char __a, vector signed char __b)
+{
+  return __a & __b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_and(vector bool char __a, vector signed char __b)
+{
+  return (vector signed char)__a & __b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_and(vector signed char __a, vector bool char __b)
+{
+  return __a & (vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_and(vector unsigned char __a, vector unsigned char __b)
+{
+  return __a & __b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_and(vector bool char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)__a & __b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_and(vector unsigned char __a, vector bool char __b)
+{
+  return __a & (vector unsigned char)__b;
+}
+
+static vector bool char __ATTRS_o_ai
+vec_and(vector bool char __a, vector bool char __b)
+{
+  return __a & __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_and(vector short __a, vector short __b)
+{
+  return __a & __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_and(vector bool short __a, vector short __b)
+{
+  return (vector short)__a & __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_and(vector short __a, vector bool short __b)
+{
+  return __a & (vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_and(vector unsigned short __a, vector unsigned short __b)
+{
+  return __a & __b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_and(vector bool short __a, vector unsigned short __b)
+{
+  return (vector unsigned short)__a & __b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_and(vector unsigned short __a, vector bool short __b)
+{
+  return __a & (vector unsigned short)__b;
+}
+
+static vector bool short __ATTRS_o_ai
+vec_and(vector bool short __a, vector bool short __b)
+{
+  return __a & __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_and(vector int __a, vector int __b)
+{
+  return __a & __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_and(vector bool int __a, vector int __b)
+{
+  return (vector int)__a & __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_and(vector int __a, vector bool int __b)
+{
+  return __a & (vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_and(vector unsigned int __a, vector unsigned int __b)
+{
+  return __a & __b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_and(vector bool int __a, vector unsigned int __b)
+{
+  return (vector unsigned int)__a & __b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_and(vector unsigned int __a, vector bool int __b)
+{
+  return __a & (vector unsigned int)__b;
+}
+
+static vector bool int __ATTRS_o_ai
+vec_and(vector bool int __a, vector bool int __b)
+{
+  return __a & __b;
+}
+
+static vector float __ATTRS_o_ai
+vec_and(vector float __a, vector float __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a & (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai
+vec_and(vector bool int __a, vector float __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a & (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai
+vec_and(vector float __a, vector bool int __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a & (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+/* vec_vand */
+
+static vector signed char __ATTRS_o_ai
+vec_vand(vector signed char __a, vector signed char __b)
+{
+  return __a & __b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vand(vector bool char __a, vector signed char __b)
+{
+  return (vector signed char)__a & __b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vand(vector signed char __a, vector bool char __b)
+{
+  return __a & (vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vand(vector unsigned char __a, vector unsigned char __b)
+{
+  return __a & __b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vand(vector bool char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)__a & __b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vand(vector unsigned char __a, vector bool char __b)
+{
+  return __a & (vector unsigned char)__b;
+}
+
+static vector bool char __ATTRS_o_ai
+vec_vand(vector bool char __a, vector bool char __b)
+{
+  return __a & __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_vand(vector short __a, vector short __b)
+{
+  return __a & __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_vand(vector bool short __a, vector short __b)
+{
+  return (vector short)__a & __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_vand(vector short __a, vector bool short __b)
+{
+  return __a & (vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vand(vector unsigned short __a, vector unsigned short __b)
+{
+  return __a & __b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vand(vector bool short __a, vector unsigned short __b)
+{
+  return (vector unsigned short)__a & __b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vand(vector unsigned short __a, vector bool short __b)
+{
+  return __a & (vector unsigned short)__b;
+}
+
+static vector bool short __ATTRS_o_ai
+vec_vand(vector bool short __a, vector bool short __b)
+{
+  return __a & __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_vand(vector int __a, vector int __b)
+{
+  return __a & __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_vand(vector bool int __a, vector int __b)
+{
+  return (vector int)__a & __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_vand(vector int __a, vector bool int __b)
+{
+  return __a & (vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vand(vector unsigned int __a, vector unsigned int __b)
+{
+  return __a & __b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vand(vector bool int __a, vector unsigned int __b)
+{
+  return (vector unsigned int)__a & __b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vand(vector unsigned int __a, vector bool int __b)
+{
+  return __a & (vector unsigned int)__b;
+}
+
+static vector bool int __ATTRS_o_ai
+vec_vand(vector bool int __a, vector bool int __b)
+{
+  return __a & __b;
+}
+
+static vector float __ATTRS_o_ai
+vec_vand(vector float __a, vector float __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a & (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai
+vec_vand(vector bool int __a, vector float __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a & (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai
+vec_vand(vector float __a, vector bool int __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a & (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+/* vec_andc */
+
+#define __builtin_altivec_vandc vec_andc
+
+static vector signed char __ATTRS_o_ai
+vec_andc(vector signed char __a, vector signed char __b)
+{
+  return __a & ~__b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_andc(vector bool char __a, vector signed char __b)
+{
+  return (vector signed char)__a & ~__b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_andc(vector signed char __a, vector bool char __b)
+{
+  return __a & ~(vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_andc(vector unsigned char __a, vector unsigned char __b)
+{
+  return __a & ~__b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_andc(vector bool char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)__a & ~__b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_andc(vector unsigned char __a, vector bool char __b)
+{
+  return __a & ~(vector unsigned char)__b;
+}
+
+static vector bool char __ATTRS_o_ai
+vec_andc(vector bool char __a, vector bool char __b)
+{
+  return __a & ~__b;
+}
+
+static vector short __ATTRS_o_ai
+vec_andc(vector short __a, vector short __b)
+{
+  return __a & ~__b;
+}
+
+static vector short __ATTRS_o_ai
+vec_andc(vector bool short __a, vector short __b)
+{
+  return (vector short)__a & ~__b;
+}
+
+static vector short __ATTRS_o_ai
+vec_andc(vector short __a, vector bool short __b)
+{
+  return __a & ~(vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_andc(vector unsigned short __a, vector unsigned short __b)
+{
+  return __a & ~__b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_andc(vector bool short __a, vector unsigned short __b)
+{
+  return (vector unsigned short)__a & ~__b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_andc(vector unsigned short __a, vector bool short __b)
+{
+  return __a & ~(vector unsigned short)__b;
+}
+
+static vector bool short __ATTRS_o_ai
+vec_andc(vector bool short __a, vector bool short __b)
+{
+  return __a & ~__b;
+}
+
+static vector int __ATTRS_o_ai
+vec_andc(vector int __a, vector int __b)
+{
+  return __a & ~__b;
+}
+
+static vector int __ATTRS_o_ai
+vec_andc(vector bool int __a, vector int __b)
+{
+  return (vector int)__a & ~__b;
+}
+
+static vector int __ATTRS_o_ai
+vec_andc(vector int __a, vector bool int __b)
+{
+  return __a & ~(vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_andc(vector unsigned int __a, vector unsigned int __b)
+{
+  return __a & ~__b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_andc(vector bool int __a, vector unsigned int __b)
+{
+  return (vector unsigned int)__a & ~__b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_andc(vector unsigned int __a, vector bool int __b)
+{
+  return __a & ~(vector unsigned int)__b;
+}
+
+static vector bool int __ATTRS_o_ai
+vec_andc(vector bool int __a, vector bool int __b)
+{
+  return __a & ~__b;
+}
+
+static vector float __ATTRS_o_ai
+vec_andc(vector float __a, vector float __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a & ~(vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai
+vec_andc(vector bool int __a, vector float __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a & ~(vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai
+vec_andc(vector float __a, vector bool int __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a & ~(vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+/* vec_vandc */
+
+static vector signed char __ATTRS_o_ai
+vec_vandc(vector signed char __a, vector signed char __b)
+{
+  return __a & ~__b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vandc(vector bool char __a, vector signed char __b)
+{
+  return (vector signed char)__a & ~__b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vandc(vector signed char __a, vector bool char __b)
+{
+  return __a & ~(vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vandc(vector unsigned char __a, vector unsigned char __b)
+{
+  return __a & ~__b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vandc(vector bool char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)__a & ~__b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vandc(vector unsigned char __a, vector bool char __b)
+{
+  return __a & ~(vector unsigned char)__b;
+}
+
+static vector bool char __ATTRS_o_ai
+vec_vandc(vector bool char __a, vector bool char __b)
+{
+  return __a & ~__b;
+}
+
+static vector short __ATTRS_o_ai
+vec_vandc(vector short __a, vector short __b)
+{
+  return __a & ~__b;
+}
+
+static vector short __ATTRS_o_ai
+vec_vandc(vector bool short __a, vector short __b)
+{
+  return (vector short)__a & ~__b;
+}
+
+static vector short __ATTRS_o_ai
+vec_vandc(vector short __a, vector bool short __b)
+{
+  return __a & ~(vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vandc(vector unsigned short __a, vector unsigned short __b)
+{
+  return __a & ~__b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vandc(vector bool short __a, vector unsigned short __b)
+{
+  return (vector unsigned short)__a & ~__b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vandc(vector unsigned short __a, vector bool short __b)
+{
+  return __a & ~(vector unsigned short)__b;
+}
+
+static vector bool short __ATTRS_o_ai
+vec_vandc(vector bool short __a, vector bool short __b)
+{
+  return __a & ~__b;
+}
+
+static vector int __ATTRS_o_ai
+vec_vandc(vector int __a, vector int __b)
+{
+  return __a & ~__b;
+}
+
+static vector int __ATTRS_o_ai
+vec_vandc(vector bool int __a, vector int __b)
+{
+  return (vector int)__a & ~__b;
+}
+
+static vector int __ATTRS_o_ai
+vec_vandc(vector int __a, vector bool int __b)
+{
+  return __a & ~(vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vandc(vector unsigned int __a, vector unsigned int __b)
+{
+  return __a & ~__b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vandc(vector bool int __a, vector unsigned int __b)
+{
+  return (vector unsigned int)__a & ~__b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vandc(vector unsigned int __a, vector bool int __b)
+{
+  return __a & ~(vector unsigned int)__b;
+}
+
+static vector bool int __ATTRS_o_ai
+vec_vandc(vector bool int __a, vector bool int __b)
+{
+  return __a & ~__b;
+}
+
+static vector float __ATTRS_o_ai
+vec_vandc(vector float __a, vector float __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a & ~(vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai
+vec_vandc(vector bool int __a, vector float __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a & ~(vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai
+vec_vandc(vector float __a, vector bool int __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a & ~(vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+/* vec_avg */
+
+static vector signed char __ATTRS_o_ai
+vec_avg(vector signed char __a, vector signed char __b)
+{
+  return __builtin_altivec_vavgsb(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_avg(vector unsigned char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vavgub(__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_avg(vector short __a, vector short __b)
+{
+  return __builtin_altivec_vavgsh(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_avg(vector unsigned short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vavguh(__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_avg(vector int __a, vector int __b)
+{
+  return __builtin_altivec_vavgsw(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_avg(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vavguw(__a, __b);
+}
+
+/* vec_vavgsb */
+
+static vector signed char __attribute__((__always_inline__))
+vec_vavgsb(vector signed char __a, vector signed char __b)
+{
+  return __builtin_altivec_vavgsb(__a, __b);
+}
+
+/* vec_vavgub */
+
+static vector unsigned char __attribute__((__always_inline__))
+vec_vavgub(vector unsigned char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vavgub(__a, __b);
+}
+
+/* vec_vavgsh */
+
+static vector short __attribute__((__always_inline__))
+vec_vavgsh(vector short __a, vector short __b)
+{
+  return __builtin_altivec_vavgsh(__a, __b);
+}
+
+/* vec_vavguh */
+
+static vector unsigned short __attribute__((__always_inline__))
+vec_vavguh(vector unsigned short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vavguh(__a, __b);
+}
+
+/* vec_vavgsw */
+
+static vector int __attribute__((__always_inline__))
+vec_vavgsw(vector int __a, vector int __b)
+{
+  return __builtin_altivec_vavgsw(__a, __b);
+}
+
+/* vec_vavguw */
+
+static vector unsigned int __attribute__((__always_inline__))
+vec_vavguw(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vavguw(__a, __b);
+}
+
+/* vec_ceil */
+
+static vector float __attribute__((__always_inline__))
+vec_ceil(vector float __a)
+{
+  return __builtin_altivec_vrfip(__a);
+}
+
+/* vec_vrfip */
+
+static vector float __attribute__((__always_inline__))
+vec_vrfip(vector float __a)
+{
+  return __builtin_altivec_vrfip(__a);
+}
+
+/* vec_cmpb */
+
+static vector int __attribute__((__always_inline__))
+vec_cmpb(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpbfp(__a, __b);
+}
+
+/* vec_vcmpbfp */
+
+static vector int __attribute__((__always_inline__))
+vec_vcmpbfp(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpbfp(__a, __b);
+}
+
+/* vec_cmpeq */
+
+static vector bool char __ATTRS_o_ai
+vec_cmpeq(vector signed char __a, vector signed char __b)
+{
+  return (vector bool char)
+    __builtin_altivec_vcmpequb((vector char)__a, (vector char)__b);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_cmpeq(vector unsigned char __a, vector unsigned char __b)
+{
+  return (vector bool char)
+    __builtin_altivec_vcmpequb((vector char)__a, (vector char)__b);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_cmpeq(vector short __a, vector short __b)
+{
+  return (vector bool short)__builtin_altivec_vcmpequh(__a, __b);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_cmpeq(vector unsigned short __a, vector unsigned short __b)
+{
+  return (vector bool short)
+    __builtin_altivec_vcmpequh((vector short)__a, (vector short)__b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_cmpeq(vector int __a, vector int __b)
+{
+  return (vector bool int)__builtin_altivec_vcmpequw(__a, __b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_cmpeq(vector unsigned int __a, vector unsigned int __b)
+{
+  return (vector bool int)
+    __builtin_altivec_vcmpequw((vector int)__a, (vector int)__b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_cmpeq(vector float __a, vector float __b)
+{
+  return (vector bool int)__builtin_altivec_vcmpeqfp(__a, __b);
+}
+
+/* vec_cmpge */
+
+static vector bool int __attribute__((__always_inline__))
+vec_cmpge(vector float __a, vector float __b)
+{
+  return (vector bool int)__builtin_altivec_vcmpgefp(__a, __b);
+}
+
+/* vec_vcmpgefp */
+
+static vector bool int __attribute__((__always_inline__))
+vec_vcmpgefp(vector float __a, vector float __b)
+{
+  return (vector bool int)__builtin_altivec_vcmpgefp(__a, __b);
+}
+
+/* vec_cmpgt */
+
+static vector bool char __ATTRS_o_ai
+vec_cmpgt(vector signed char __a, vector signed char __b)
+{
+  return (vector bool char)__builtin_altivec_vcmpgtsb(__a, __b);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_cmpgt(vector unsigned char __a, vector unsigned char __b)
+{
+  return (vector bool char)__builtin_altivec_vcmpgtub(__a, __b);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_cmpgt(vector short __a, vector short __b)
+{
+  return (vector bool short)__builtin_altivec_vcmpgtsh(__a, __b);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_cmpgt(vector unsigned short __a, vector unsigned short __b)
+{
+  return (vector bool short)__builtin_altivec_vcmpgtuh(__a, __b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_cmpgt(vector int __a, vector int __b)
+{
+  return (vector bool int)__builtin_altivec_vcmpgtsw(__a, __b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_cmpgt(vector unsigned int __a, vector unsigned int __b)
+{
+  return (vector bool int)__builtin_altivec_vcmpgtuw(__a, __b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_cmpgt(vector float __a, vector float __b)
+{
+  return (vector bool int)__builtin_altivec_vcmpgtfp(__a, __b);
+}
+
+/* vec_vcmpgtsb */
+
+static vector bool char __attribute__((__always_inline__))
+vec_vcmpgtsb(vector signed char __a, vector signed char __b)
+{
+  return (vector bool char)__builtin_altivec_vcmpgtsb(__a, __b);
+}
+
+/* vec_vcmpgtub */
+
+static vector bool char __attribute__((__always_inline__))
+vec_vcmpgtub(vector unsigned char __a, vector unsigned char __b)
+{
+  return (vector bool char)__builtin_altivec_vcmpgtub(__a, __b);
+}
+
+/* vec_vcmpgtsh */
+
+static vector bool short __attribute__((__always_inline__))
+vec_vcmpgtsh(vector short __a, vector short __b)
+{
+  return (vector bool short)__builtin_altivec_vcmpgtsh(__a, __b);
+}
+
+/* vec_vcmpgtuh */
+
+static vector bool short __attribute__((__always_inline__))
+vec_vcmpgtuh(vector unsigned short __a, vector unsigned short __b)
+{
+  return (vector bool short)__builtin_altivec_vcmpgtuh(__a, __b);
+}
+
+/* vec_vcmpgtsw */
+
+static vector bool int __attribute__((__always_inline__))
+vec_vcmpgtsw(vector int __a, vector int __b)
+{
+  return (vector bool int)__builtin_altivec_vcmpgtsw(__a, __b);
+}
+
+/* vec_vcmpgtuw */
+
+static vector bool int __attribute__((__always_inline__))
+vec_vcmpgtuw(vector unsigned int __a, vector unsigned int __b)
+{
+  return (vector bool int)__builtin_altivec_vcmpgtuw(__a, __b);
+}
+
+/* vec_vcmpgtfp */
+
+static vector bool int __attribute__((__always_inline__))
+vec_vcmpgtfp(vector float __a, vector float __b)
+{
+  return (vector bool int)__builtin_altivec_vcmpgtfp(__a, __b);
+}
+
+/* vec_cmple */
+
+static vector bool int __attribute__((__always_inline__))
+vec_cmple(vector float __a, vector float __b)
+{
+  return (vector bool int)__builtin_altivec_vcmpgefp(__b, __a);
+}
+
+/* vec_cmplt */
+
+static vector bool char __ATTRS_o_ai
+vec_cmplt(vector signed char __a, vector signed char __b)
+{
+  return (vector bool char)__builtin_altivec_vcmpgtsb(__b, __a);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_cmplt(vector unsigned char __a, vector unsigned char __b)
+{
+  return (vector bool char)__builtin_altivec_vcmpgtub(__b, __a);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_cmplt(vector short __a, vector short __b)
+{
+  return (vector bool short)__builtin_altivec_vcmpgtsh(__b, __a);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_cmplt(vector unsigned short __a, vector unsigned short __b)
+{
+  return (vector bool short)__builtin_altivec_vcmpgtuh(__b, __a);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_cmplt(vector int __a, vector int __b)
+{
+  return (vector bool int)__builtin_altivec_vcmpgtsw(__b, __a);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_cmplt(vector unsigned int __a, vector unsigned int __b)
+{
+  return (vector bool int)__builtin_altivec_vcmpgtuw(__b, __a);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_cmplt(vector float __a, vector float __b)
+{
+  return (vector bool int)__builtin_altivec_vcmpgtfp(__b, __a);
+}
+
+/* vec_ctf */
+
+static vector float __ATTRS_o_ai
+vec_ctf(vector int __a, int __b)
+{
+  return __builtin_altivec_vcfsx(__a, __b);
+}
+
+static vector float __ATTRS_o_ai
+vec_ctf(vector unsigned int __a, int __b)
+{
+  return __builtin_altivec_vcfux((vector int)__a, __b);
+}
+
+/* vec_vcfsx */
+
+static vector float __attribute__((__always_inline__))
+vec_vcfsx(vector int __a, int __b)
+{
+  return __builtin_altivec_vcfsx(__a, __b);
+}
+
+/* vec_vcfux */
+
+static vector float __attribute__((__always_inline__))
+vec_vcfux(vector unsigned int __a, int __b)
+{
+  return __builtin_altivec_vcfux((vector int)__a, __b);
+}
+
+/* vec_cts */
+
+static vector int __attribute__((__always_inline__))
+vec_cts(vector float __a, int __b)
+{
+  return __builtin_altivec_vctsxs(__a, __b);
+}
+
+/* vec_vctsxs */
+
+static vector int __attribute__((__always_inline__))
+vec_vctsxs(vector float __a, int __b)
+{
+  return __builtin_altivec_vctsxs(__a, __b);
+}
+
+/* vec_ctu */
+
+static vector unsigned int __attribute__((__always_inline__))
+vec_ctu(vector float __a, int __b)
+{
+  return __builtin_altivec_vctuxs(__a, __b);
+}
+
+/* vec_vctuxs */
+
+static vector unsigned int __attribute__((__always_inline__))
+vec_vctuxs(vector float __a, int __b)
+{
+  return __builtin_altivec_vctuxs(__a, __b);
+}
+
+/* vec_dss */
+
+static void __attribute__((__always_inline__))
+vec_dss(int __a)
+{
+  __builtin_altivec_dss(__a);
+}
+
+/* vec_dssall */
+
+static void __attribute__((__always_inline__))
+vec_dssall(void)
+{
+  __builtin_altivec_dssall();
+}
+
+/* vec_dst */
+
+static void __attribute__((__always_inline__))
+vec_dst(const void *__a, int __b, int __c)
+{
+  __builtin_altivec_dst(__a, __b, __c);
+}
+
+/* vec_dstst */
+
+static void __attribute__((__always_inline__))
+vec_dstst(const void *__a, int __b, int __c)
+{
+  __builtin_altivec_dstst(__a, __b, __c);
+}
+
+/* vec_dststt */
+
+static void __attribute__((__always_inline__))
+vec_dststt(const void *__a, int __b, int __c)
+{
+  __builtin_altivec_dststt(__a, __b, __c);
+}
+
+/* vec_dstt */
+
+static void __attribute__((__always_inline__))
+vec_dstt(const void *__a, int __b, int __c)
+{
+  __builtin_altivec_dstt(__a, __b, __c);
+}
+
+/* vec_expte */
+
+static vector float __attribute__((__always_inline__))
+vec_expte(vector float __a)
+{
+  return __builtin_altivec_vexptefp(__a);
+}
+
+/* vec_vexptefp */
+
+static vector float __attribute__((__always_inline__))
+vec_vexptefp(vector float __a)
+{
+  return __builtin_altivec_vexptefp(__a);
+}
+
+/* vec_floor */
+
+static vector float __attribute__((__always_inline__))
+vec_floor(vector float __a)
+{
+  return __builtin_altivec_vrfim(__a);
+}
+
+/* vec_vrfim */
+
+static vector float __attribute__((__always_inline__))
+vec_vrfim(vector float __a)
+{
+  return __builtin_altivec_vrfim(__a);
+}
+
+/* vec_ld */
+
+static vector signed char __ATTRS_o_ai
+vec_ld(int __a, const vector signed char *__b)
+{
+  return (vector signed char)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_ld(int __a, const signed char *__b)
+{
+  return (vector signed char)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_ld(int __a, const vector unsigned char *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_ld(int __a, const unsigned char *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_ld(int __a, const vector bool char *__b)
+{
+  return (vector bool char)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_ld(int __a, const vector short *__b)
+{
+  return (vector short)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_ld(int __a, const short *__b)
+{
+  return (vector short)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_ld(int __a, const vector unsigned short *__b)
+{
+  return (vector unsigned short)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_ld(int __a, const unsigned short *__b)
+{
+  return (vector unsigned short)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_ld(int __a, const vector bool short *__b)
+{
+  return (vector bool short)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_ld(int __a, const vector pixel *__b)
+{
+  return (vector pixel)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_ld(int __a, const vector int *__b)
+{
+  return (vector int)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_ld(int __a, const int *__b)
+{
+  return (vector int)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_ld(int __a, const vector unsigned int *__b)
+{
+  return (vector unsigned int)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_ld(int __a, const unsigned int *__b)
+{
+  return (vector unsigned int)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_ld(int __a, const vector bool int *__b)
+{
+  return (vector bool int)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector float __ATTRS_o_ai
+vec_ld(int __a, const vector float *__b)
+{
+  return (vector float)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector float __ATTRS_o_ai
+vec_ld(int __a, const float *__b)
+{
+  return (vector float)__builtin_altivec_lvx(__a, __b);
+}
+
+/* vec_lvx */
+
+static vector signed char __ATTRS_o_ai
+vec_lvx(int __a, const vector signed char *__b)
+{
+  return (vector signed char)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_lvx(int __a, const signed char *__b)
+{
+  return (vector signed char)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvx(int __a, const vector unsigned char *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvx(int __a, const unsigned char *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_lvx(int __a, const vector bool char *__b)
+{
+  return (vector bool char)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_lvx(int __a, const vector short *__b)
+{
+  return (vector short)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_lvx(int __a, const short *__b)
+{
+  return (vector short)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_lvx(int __a, const vector unsigned short *__b)
+{
+  return (vector unsigned short)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_lvx(int __a, const unsigned short *__b)
+{
+  return (vector unsigned short)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_lvx(int __a, const vector bool short *__b)
+{
+  return (vector bool short)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_lvx(int __a, const vector pixel *__b)
+{
+  return (vector pixel)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_lvx(int __a, const vector int *__b)
+{
+  return (vector int)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_lvx(int __a, const int *__b)
+{
+  return (vector int)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_lvx(int __a, const vector unsigned int *__b)
+{
+  return (vector unsigned int)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_lvx(int __a, const unsigned int *__b)
+{
+  return (vector unsigned int)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_lvx(int __a, const vector bool int *__b)
+{
+  return (vector bool int)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector float __ATTRS_o_ai
+vec_lvx(int __a, const vector float *__b)
+{
+  return (vector float)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector float __ATTRS_o_ai
+vec_lvx(int __a, const float *__b)
+{
+  return (vector float)__builtin_altivec_lvx(__a, __b);
+}
+
+/* vec_lde */
+
+static vector signed char __ATTRS_o_ai
+vec_lde(int __a, const signed char *__b)
+{
+  return (vector signed char)__builtin_altivec_lvebx(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lde(int __a, const unsigned char *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvebx(__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_lde(int __a, const short *__b)
+{
+  return (vector short)__builtin_altivec_lvehx(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_lde(int __a, const unsigned short *__b)
+{
+  return (vector unsigned short)__builtin_altivec_lvehx(__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_lde(int __a, const int *__b)
+{
+  return (vector int)__builtin_altivec_lvewx(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_lde(int __a, const unsigned int *__b)
+{
+  return (vector unsigned int)__builtin_altivec_lvewx(__a, __b);
+}
+
+static vector float __ATTRS_o_ai
+vec_lde(int __a, const float *__b)
+{
+  return (vector float)__builtin_altivec_lvewx(__a, __b);
+}
+
+/* vec_lvebx */
+
+static vector signed char __ATTRS_o_ai
+vec_lvebx(int __a, const signed char *__b)
+{
+  return (vector signed char)__builtin_altivec_lvebx(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvebx(int __a, const unsigned char *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvebx(__a, __b);
+}
+
+/* vec_lvehx */
+
+static vector short __ATTRS_o_ai
+vec_lvehx(int __a, const short *__b)
+{
+  return (vector short)__builtin_altivec_lvehx(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_lvehx(int __a, const unsigned short *__b)
+{
+  return (vector unsigned short)__builtin_altivec_lvehx(__a, __b);
+}
+
+/* vec_lvewx */
+
+static vector int __ATTRS_o_ai
+vec_lvewx(int __a, const int *__b)
+{
+  return (vector int)__builtin_altivec_lvewx(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_lvewx(int __a, const unsigned int *__b)
+{
+  return (vector unsigned int)__builtin_altivec_lvewx(__a, __b);
+}
+
+static vector float __ATTRS_o_ai
+vec_lvewx(int __a, const float *__b)
+{
+  return (vector float)__builtin_altivec_lvewx(__a, __b);
+}
+
+/* vec_ldl */
+
+static vector signed char __ATTRS_o_ai
+vec_ldl(int __a, const vector signed char *__b)
+{
+  return (vector signed char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_ldl(int __a, const signed char *__b)
+{
+  return (vector signed char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_ldl(int __a, const vector unsigned char *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_ldl(int __a, const unsigned char *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_ldl(int __a, const vector bool char *__b)
+{
+  return (vector bool char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_ldl(int __a, const vector short *__b)
+{
+  return (vector short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_ldl(int __a, const short *__b)
+{
+  return (vector short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_ldl(int __a, const vector unsigned short *__b)
+{
+  return (vector unsigned short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_ldl(int __a, const unsigned short *__b)
+{
+  return (vector unsigned short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_ldl(int __a, const vector bool short *__b)
+{
+  return (vector bool short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_ldl(int __a, const vector pixel *__b)
+{
+  return (vector pixel)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_ldl(int __a, const vector int *__b)
+{
+  return (vector int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_ldl(int __a, const int *__b)
+{
+  return (vector int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_ldl(int __a, const vector unsigned int *__b)
+{
+  return (vector unsigned int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_ldl(int __a, const unsigned int *__b)
+{
+  return (vector unsigned int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_ldl(int __a, const vector bool int *__b)
+{
+  return (vector bool int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector float __ATTRS_o_ai
+vec_ldl(int __a, const vector float *__b)
+{
+  return (vector float)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector float __ATTRS_o_ai
+vec_ldl(int __a, const float *__b)
+{
+  return (vector float)__builtin_altivec_lvxl(__a, __b);
+}
+
+/* vec_lvxl */
+
+static vector signed char __ATTRS_o_ai
+vec_lvxl(int __a, const vector signed char *__b)
+{
+  return (vector signed char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_lvxl(int __a, const signed char *__b)
+{
+  return (vector signed char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvxl(int __a, const vector unsigned char *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvxl(int __a, const unsigned char *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_lvxl(int __a, const vector bool char *__b)
+{
+  return (vector bool char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_lvxl(int __a, const vector short *__b)
+{
+  return (vector short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_lvxl(int __a, const short *__b)
+{
+  return (vector short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_lvxl(int __a, const vector unsigned short *__b)
+{
+  return (vector unsigned short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_lvxl(int __a, const unsigned short *__b)
+{
+  return (vector unsigned short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_lvxl(int __a, const vector bool short *__b)
+{
+  return (vector bool short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_lvxl(int __a, const vector pixel *__b)
+{
+  return (vector pixel)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_lvxl(int __a, const vector int *__b)
+{
+  return (vector int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_lvxl(int __a, const int *__b)
+{
+  return (vector int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_lvxl(int __a, const vector unsigned int *__b)
+{
+  return (vector unsigned int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_lvxl(int __a, const unsigned int *__b)
+{
+  return (vector unsigned int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_lvxl(int __a, const vector bool int *__b)
+{
+  return (vector bool int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector float __ATTRS_o_ai
+vec_lvxl(int __a, const vector float *__b)
+{
+  return (vector float)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector float __ATTRS_o_ai
+vec_lvxl(int __a, const float *__b)
+{
+  return (vector float)__builtin_altivec_lvxl(__a, __b);
+}
+
+/* vec_loge */
+
+static vector float __attribute__((__always_inline__))
+vec_loge(vector float __a)
+{
+  return __builtin_altivec_vlogefp(__a);
+}
+
+/* vec_vlogefp */
+
+static vector float __attribute__((__always_inline__))
+vec_vlogefp(vector float __a)
+{
+  return __builtin_altivec_vlogefp(__a);
+}
+
+/* vec_lvsl */
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvsl(int __a, const signed char *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvsl(int __a, const unsigned char *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvsl(int __a, const short *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvsl(int __a, const unsigned short *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvsl(int __a, const int *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvsl(int __a, const unsigned int *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvsl(int __a, const float *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+}
+
+/* vec_lvsr */
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvsr(int __a, const signed char *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvsr(int __a, const unsigned char *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvsr(int __a, const short *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvsr(int __a, const unsigned short *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvsr(int __a, const int *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvsr(int __a, const unsigned int *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvsr(int __a, const float *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+}
+
+/* vec_madd */
+
+static vector float __attribute__((__always_inline__))
+vec_madd(vector float __a, vector float __b, vector float __c)
+{
+  return __builtin_altivec_vmaddfp(__a, __b, __c);
+}
+
+/* vec_vmaddfp */
+
+static vector float __attribute__((__always_inline__))
+vec_vmaddfp(vector float __a, vector float __b, vector float __c)
+{
+  return __builtin_altivec_vmaddfp(__a, __b, __c);
+}
+
+/* vec_madds */
+
+static vector signed short __attribute__((__always_inline__))
+vec_madds(vector signed short __a, vector signed short __b, vector signed short __c)
+{
+  return __builtin_altivec_vmhaddshs(__a, __b, __c);
+}
+
+/* vec_vmhaddshs */
+
+static vector signed short __attribute__((__always_inline__))
+vec_vmhaddshs(vector signed short __a,
+              vector signed short __b,
+              vector signed short __c)
+{
+  return __builtin_altivec_vmhaddshs(__a, __b, __c);
+}
+
+/* vec_max */
+
+static vector signed char __ATTRS_o_ai
+vec_max(vector signed char __a, vector signed char __b)
+{
+  return __builtin_altivec_vmaxsb(__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_max(vector bool char __a, vector signed char __b)
+{
+  return __builtin_altivec_vmaxsb((vector signed char)__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_max(vector signed char __a, vector bool char __b)
+{
+  return __builtin_altivec_vmaxsb(__a, (vector signed char)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_max(vector unsigned char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vmaxub(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_max(vector bool char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vmaxub((vector unsigned char)__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_max(vector unsigned char __a, vector bool char __b)
+{
+  return __builtin_altivec_vmaxub(__a, (vector unsigned char)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_max(vector short __a, vector short __b)
+{
+  return __builtin_altivec_vmaxsh(__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_max(vector bool short __a, vector short __b)
+{
+  return __builtin_altivec_vmaxsh((vector short)__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_max(vector short __a, vector bool short __b)
+{
+  return __builtin_altivec_vmaxsh(__a, (vector short)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_max(vector unsigned short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vmaxuh(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_max(vector bool short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vmaxuh((vector unsigned short)__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_max(vector unsigned short __a, vector bool short __b)
+{
+  return __builtin_altivec_vmaxuh(__a, (vector unsigned short)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_max(vector int __a, vector int __b)
+{
+  return __builtin_altivec_vmaxsw(__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_max(vector bool int __a, vector int __b)
+{
+  return __builtin_altivec_vmaxsw((vector int)__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_max(vector int __a, vector bool int __b)
+{
+  return __builtin_altivec_vmaxsw(__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_max(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vmaxuw(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_max(vector bool int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vmaxuw((vector unsigned int)__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_max(vector unsigned int __a, vector bool int __b)
+{
+  return __builtin_altivec_vmaxuw(__a, (vector unsigned int)__b);
+}
+
+static vector float __ATTRS_o_ai
+vec_max(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vmaxfp(__a, __b);
+}
+
+/* vec_vmaxsb */
+
+static vector signed char __ATTRS_o_ai
+vec_vmaxsb(vector signed char __a, vector signed char __b)
+{
+  return __builtin_altivec_vmaxsb(__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vmaxsb(vector bool char __a, vector signed char __b)
+{
+  return __builtin_altivec_vmaxsb((vector signed char)__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vmaxsb(vector signed char __a, vector bool char __b)
+{
+  return __builtin_altivec_vmaxsb(__a, (vector signed char)__b);
+}
+
+/* vec_vmaxub */
+
+static vector unsigned char __ATTRS_o_ai
+vec_vmaxub(vector unsigned char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vmaxub(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vmaxub(vector bool char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vmaxub((vector unsigned char)__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vmaxub(vector unsigned char __a, vector bool char __b)
+{
+  return __builtin_altivec_vmaxub(__a, (vector unsigned char)__b);
+}
+
+/* vec_vmaxsh */
+
+static vector short __ATTRS_o_ai
+vec_vmaxsh(vector short __a, vector short __b)
+{
+  return __builtin_altivec_vmaxsh(__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_vmaxsh(vector bool short __a, vector short __b)
+{
+  return __builtin_altivec_vmaxsh((vector short)__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_vmaxsh(vector short __a, vector bool short __b)
+{
+  return __builtin_altivec_vmaxsh(__a, (vector short)__b);
+}
+
+/* vec_vmaxuh */
+
+static vector unsigned short __ATTRS_o_ai
+vec_vmaxuh(vector unsigned short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vmaxuh(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vmaxuh(vector bool short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vmaxuh((vector unsigned short)__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vmaxuh(vector unsigned short __a, vector bool short __b)
+{
+  return __builtin_altivec_vmaxuh(__a, (vector unsigned short)__b);
+}
+
+/* vec_vmaxsw */
+
+static vector int __ATTRS_o_ai
+vec_vmaxsw(vector int __a, vector int __b)
+{
+  return __builtin_altivec_vmaxsw(__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_vmaxsw(vector bool int __a, vector int __b)
+{
+  return __builtin_altivec_vmaxsw((vector int)__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_vmaxsw(vector int __a, vector bool int __b)
+{
+  return __builtin_altivec_vmaxsw(__a, (vector int)__b);
+}
+
+/* vec_vmaxuw */
+
+static vector unsigned int __ATTRS_o_ai
+vec_vmaxuw(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vmaxuw(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vmaxuw(vector bool int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vmaxuw((vector unsigned int)__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vmaxuw(vector unsigned int __a, vector bool int __b)
+{
+  return __builtin_altivec_vmaxuw(__a, (vector unsigned int)__b);
+}
+
+/* vec_vmaxfp */
+
+static vector float __attribute__((__always_inline__))
+vec_vmaxfp(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vmaxfp(__a, __b);
+}
+
+/* vec_mergeh */
+
+static vector signed char __ATTRS_o_ai
+vec_mergeh(vector signed char __a, vector signed char __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x10, 0x01, 0x11, 0x02, 0x12, 0x03, 0x13, 
+     0x04, 0x14, 0x05, 0x15, 0x06, 0x16, 0x07, 0x17));
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_mergeh(vector unsigned char __a, vector unsigned char __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x10, 0x01, 0x11, 0x02, 0x12, 0x03, 0x13, 
+     0x04, 0x14, 0x05, 0x15, 0x06, 0x16, 0x07, 0x17));
+}
+
+static vector bool char __ATTRS_o_ai
+vec_mergeh(vector bool char __a, vector bool char __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x10, 0x01, 0x11, 0x02, 0x12, 0x03, 0x13, 
+     0x04, 0x14, 0x05, 0x15, 0x06, 0x16, 0x07, 0x17));
+}
+
+static vector short __ATTRS_o_ai
+vec_mergeh(vector short __a, vector short __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13,
+     0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17));
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_mergeh(vector unsigned short __a, vector unsigned short __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13,
+     0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17));
+}
+
+static vector bool short __ATTRS_o_ai
+vec_mergeh(vector bool short __a, vector bool short __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13,
+     0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17));
+}
+
+static vector pixel __ATTRS_o_ai
+vec_mergeh(vector pixel __a, vector pixel __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13,
+     0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17));
+}
+
+static vector int __ATTRS_o_ai
+vec_mergeh(vector int __a, vector int __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+     0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17));
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_mergeh(vector unsigned int __a, vector unsigned int __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+     0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17));
+}
+
+static vector bool int __ATTRS_o_ai
+vec_mergeh(vector bool int __a, vector bool int __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+     0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17));
+}
+
+static vector float __ATTRS_o_ai
+vec_mergeh(vector float __a, vector float __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+     0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17));
+}
+
+/* vec_vmrghb */
+
+#define __builtin_altivec_vmrghb vec_vmrghb
+
+static vector signed char __ATTRS_o_ai
+vec_vmrghb(vector signed char __a, vector signed char __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x10, 0x01, 0x11, 0x02, 0x12, 0x03, 0x13, 
+     0x04, 0x14, 0x05, 0x15, 0x06, 0x16, 0x07, 0x17));
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vmrghb(vector unsigned char __a, vector unsigned char __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x10, 0x01, 0x11, 0x02, 0x12, 0x03, 0x13, 
+     0x04, 0x14, 0x05, 0x15, 0x06, 0x16, 0x07, 0x17));
+}
+
+static vector bool char __ATTRS_o_ai
+vec_vmrghb(vector bool char __a, vector bool char __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x10, 0x01, 0x11, 0x02, 0x12, 0x03, 0x13, 
+     0x04, 0x14, 0x05, 0x15, 0x06, 0x16, 0x07, 0x17));
+}
+
+/* vec_vmrghh */
+
+#define __builtin_altivec_vmrghh vec_vmrghh
+
+static vector short __ATTRS_o_ai
+vec_vmrghh(vector short __a, vector short __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13,
+     0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17));
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vmrghh(vector unsigned short __a, vector unsigned short __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13,
+     0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17));
+}
+
+static vector bool short __ATTRS_o_ai
+vec_vmrghh(vector bool short __a, vector bool short __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13,
+     0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17));
+}
+
+static vector pixel __ATTRS_o_ai
+vec_vmrghh(vector pixel __a, vector pixel __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13,
+     0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17));
+}
+
+/* vec_vmrghw */
+
+#define __builtin_altivec_vmrghw vec_vmrghw
+
+static vector int __ATTRS_o_ai
+vec_vmrghw(vector int __a, vector int __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+     0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17));
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vmrghw(vector unsigned int __a, vector unsigned int __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+     0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17));
+}
+
+static vector bool int __ATTRS_o_ai
+vec_vmrghw(vector bool int __a, vector bool int __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+     0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17));
+}
+
+static vector float __ATTRS_o_ai
+vec_vmrghw(vector float __a, vector float __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+     0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17));
+}
+
+/* vec_mergel */
+
+static vector signed char __ATTRS_o_ai
+vec_mergel(vector signed char __a, vector signed char __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x08, 0x18, 0x09, 0x19, 0x0A, 0x1A, 0x0B, 0x1B, 
+     0x0C, 0x1C, 0x0D, 0x1D, 0x0E, 0x1E, 0x0F, 0x1F));
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_mergel(vector unsigned char __a, vector unsigned char __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x08, 0x18, 0x09, 0x19, 0x0A, 0x1A, 0x0B, 0x1B, 
+     0x0C, 0x1C, 0x0D, 0x1D, 0x0E, 0x1E, 0x0F, 0x1F));
+}
+
+static vector bool char __ATTRS_o_ai
+vec_mergel(vector bool char __a, vector bool char __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x08, 0x18, 0x09, 0x19, 0x0A, 0x1A, 0x0B, 0x1B, 
+     0x0C, 0x1C, 0x0D, 0x1D, 0x0E, 0x1E, 0x0F, 0x1F));
+}
+
+static vector short __ATTRS_o_ai
+vec_mergel(vector short __a, vector short __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x08, 0x09, 0x18, 0x19, 0x0A, 0x0B, 0x1A, 0x1B,
+     0x0C, 0x0D, 0x1C, 0x1D, 0x0E, 0x0F, 0x1E, 0x1F));
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_mergel(vector unsigned short __a, vector unsigned short __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x08, 0x09, 0x18, 0x19, 0x0A, 0x0B, 0x1A, 0x1B,
+     0x0C, 0x0D, 0x1C, 0x1D, 0x0E, 0x0F, 0x1E, 0x1F));
+}
+
+static vector bool short __ATTRS_o_ai
+vec_mergel(vector bool short __a, vector bool short __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x08, 0x09, 0x18, 0x19, 0x0A, 0x0B, 0x1A, 0x1B,
+     0x0C, 0x0D, 0x1C, 0x1D, 0x0E, 0x0F, 0x1E, 0x1F));
+}
+
+static vector pixel __ATTRS_o_ai
+vec_mergel(vector pixel __a, vector pixel __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x08, 0x09, 0x18, 0x19, 0x0A, 0x0B, 0x1A, 0x1B,
+     0x0C, 0x0D, 0x1C, 0x1D, 0x0E, 0x0F, 0x1E, 0x1F));
+}
+
+static vector int __ATTRS_o_ai
+vec_mergel(vector int __a, vector int __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B,
+     0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_mergel(vector unsigned int __a, vector unsigned int __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B,
+     0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+static vector bool int __ATTRS_o_ai
+vec_mergel(vector bool int __a, vector bool int __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B,
+     0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+static vector float __ATTRS_o_ai
+vec_mergel(vector float __a, vector float __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B,
+     0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+/* vec_vmrglb */
+
+#define __builtin_altivec_vmrglb vec_vmrglb
+
+static vector signed char __ATTRS_o_ai
+vec_vmrglb(vector signed char __a, vector signed char __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x08, 0x18, 0x09, 0x19, 0x0A, 0x1A, 0x0B, 0x1B, 
+     0x0C, 0x1C, 0x0D, 0x1D, 0x0E, 0x1E, 0x0F, 0x1F));
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vmrglb(vector unsigned char __a, vector unsigned char __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x08, 0x18, 0x09, 0x19, 0x0A, 0x1A, 0x0B, 0x1B, 
+     0x0C, 0x1C, 0x0D, 0x1D, 0x0E, 0x1E, 0x0F, 0x1F));
+}
+
+static vector bool char __ATTRS_o_ai
+vec_vmrglb(vector bool char __a, vector bool char __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x08, 0x18, 0x09, 0x19, 0x0A, 0x1A, 0x0B, 0x1B, 
+     0x0C, 0x1C, 0x0D, 0x1D, 0x0E, 0x1E, 0x0F, 0x1F));
+}
+
+/* vec_vmrglh */
+
+#define __builtin_altivec_vmrglh vec_vmrglh
+
+static vector short __ATTRS_o_ai
+vec_vmrglh(vector short __a, vector short __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x08, 0x09, 0x18, 0x19, 0x0A, 0x0B, 0x1A, 0x1B,
+     0x0C, 0x0D, 0x1C, 0x1D, 0x0E, 0x0F, 0x1E, 0x1F));
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vmrglh(vector unsigned short __a, vector unsigned short __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x08, 0x09, 0x18, 0x19, 0x0A, 0x0B, 0x1A, 0x1B,
+     0x0C, 0x0D, 0x1C, 0x1D, 0x0E, 0x0F, 0x1E, 0x1F));
+}
+
+static vector bool short __ATTRS_o_ai
+vec_vmrglh(vector bool short __a, vector bool short __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x08, 0x09, 0x18, 0x19, 0x0A, 0x0B, 0x1A, 0x1B,
+     0x0C, 0x0D, 0x1C, 0x1D, 0x0E, 0x0F, 0x1E, 0x1F));
+}
+
+static vector pixel __ATTRS_o_ai
+vec_vmrglh(vector pixel __a, vector pixel __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x08, 0x09, 0x18, 0x19, 0x0A, 0x0B, 0x1A, 0x1B,
+     0x0C, 0x0D, 0x1C, 0x1D, 0x0E, 0x0F, 0x1E, 0x1F));
+}
+
+/* vec_vmrglw */
+
+#define __builtin_altivec_vmrglw vec_vmrglw
+
+static vector int __ATTRS_o_ai
+vec_vmrglw(vector int __a, vector int __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B,
+     0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vmrglw(vector unsigned int __a, vector unsigned int __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B,
+     0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+static vector bool int __ATTRS_o_ai
+vec_vmrglw(vector bool int __a, vector bool int __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B,
+     0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+static vector float __ATTRS_o_ai
+vec_vmrglw(vector float __a, vector float __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B,
+     0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+/* vec_mfvscr */
+
+static vector unsigned short __attribute__((__always_inline__))
+vec_mfvscr(void)
+{
+  return __builtin_altivec_mfvscr();
+}
+
+/* vec_min */
+
+static vector signed char __ATTRS_o_ai
+vec_min(vector signed char __a, vector signed char __b)
+{
+  return __builtin_altivec_vminsb(__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_min(vector bool char __a, vector signed char __b)
+{
+  return __builtin_altivec_vminsb((vector signed char)__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_min(vector signed char __a, vector bool char __b)
+{
+  return __builtin_altivec_vminsb(__a, (vector signed char)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_min(vector unsigned char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vminub(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_min(vector bool char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vminub((vector unsigned char)__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_min(vector unsigned char __a, vector bool char __b)
+{
+  return __builtin_altivec_vminub(__a, (vector unsigned char)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_min(vector short __a, vector short __b)
+{
+  return __builtin_altivec_vminsh(__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_min(vector bool short __a, vector short __b)
+{
+  return __builtin_altivec_vminsh((vector short)__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_min(vector short __a, vector bool short __b)
+{
+  return __builtin_altivec_vminsh(__a, (vector short)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_min(vector unsigned short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vminuh(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_min(vector bool short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vminuh((vector unsigned short)__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_min(vector unsigned short __a, vector bool short __b)
+{
+  return __builtin_altivec_vminuh(__a, (vector unsigned short)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_min(vector int __a, vector int __b)
+{
+  return __builtin_altivec_vminsw(__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_min(vector bool int __a, vector int __b)
+{
+  return __builtin_altivec_vminsw((vector int)__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_min(vector int __a, vector bool int __b)
+{
+  return __builtin_altivec_vminsw(__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_min(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vminuw(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_min(vector bool int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vminuw((vector unsigned int)__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_min(vector unsigned int __a, vector bool int __b)
+{
+  return __builtin_altivec_vminuw(__a, (vector unsigned int)__b);
+}
+
+static vector float __ATTRS_o_ai
+vec_min(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vminfp(__a, __b);
+}
+
+/* vec_vminsb */
+
+static vector signed char __ATTRS_o_ai
+vec_vminsb(vector signed char __a, vector signed char __b)
+{
+  return __builtin_altivec_vminsb(__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vminsb(vector bool char __a, vector signed char __b)
+{
+  return __builtin_altivec_vminsb((vector signed char)__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vminsb(vector signed char __a, vector bool char __b)
+{
+  return __builtin_altivec_vminsb(__a, (vector signed char)__b);
+}
+
+/* vec_vminub */
+
+static vector unsigned char __ATTRS_o_ai
+vec_vminub(vector unsigned char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vminub(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vminub(vector bool char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vminub((vector unsigned char)__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vminub(vector unsigned char __a, vector bool char __b)
+{
+  return __builtin_altivec_vminub(__a, (vector unsigned char)__b);
+}
+
+/* vec_vminsh */
+
+static vector short __ATTRS_o_ai
+vec_vminsh(vector short __a, vector short __b)
+{
+  return __builtin_altivec_vminsh(__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_vminsh(vector bool short __a, vector short __b)
+{
+  return __builtin_altivec_vminsh((vector short)__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_vminsh(vector short __a, vector bool short __b)
+{
+  return __builtin_altivec_vminsh(__a, (vector short)__b);
+}
+
+/* vec_vminuh */
+
+static vector unsigned short __ATTRS_o_ai
+vec_vminuh(vector unsigned short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vminuh(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vminuh(vector bool short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vminuh((vector unsigned short)__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vminuh(vector unsigned short __a, vector bool short __b)
+{
+  return __builtin_altivec_vminuh(__a, (vector unsigned short)__b);
+}
+
+/* vec_vminsw */
+
+static vector int __ATTRS_o_ai
+vec_vminsw(vector int __a, vector int __b)
+{
+  return __builtin_altivec_vminsw(__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_vminsw(vector bool int __a, vector int __b)
+{
+  return __builtin_altivec_vminsw((vector int)__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_vminsw(vector int __a, vector bool int __b)
+{
+  return __builtin_altivec_vminsw(__a, (vector int)__b);
+}
+
+/* vec_vminuw */
+
+static vector unsigned int __ATTRS_o_ai
+vec_vminuw(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vminuw(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vminuw(vector bool int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vminuw((vector unsigned int)__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vminuw(vector unsigned int __a, vector bool int __b)
+{
+  return __builtin_altivec_vminuw(__a, (vector unsigned int)__b);
+}
+
+/* vec_vminfp */
+
+static vector float __attribute__((__always_inline__))
+vec_vminfp(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vminfp(__a, __b);
+}
+
+/* vec_mladd */
+
+#define __builtin_altivec_vmladduhm vec_mladd
+
+static vector short __ATTRS_o_ai
+vec_mladd(vector short __a, vector short __b, vector short __c)
+{
+  return __a * __b + __c;
+}
+
+static vector short __ATTRS_o_ai
+vec_mladd(vector short __a, vector unsigned short __b, vector unsigned short __c)
+{
+  return __a * (vector short)__b + (vector short)__c;
+}
+
+static vector short __ATTRS_o_ai
+vec_mladd(vector unsigned short __a, vector short __b, vector short __c)
+{
+  return (vector short)__a * __b + __c;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_mladd(vector unsigned short __a,
+          vector unsigned short __b,
+          vector unsigned short __c)
+{
+  return __a * __b + __c;
+}
+
+/* vec_vmladduhm */
+
+static vector short __ATTRS_o_ai
+vec_vmladduhm(vector short __a, vector short __b, vector short __c)
+{
+  return __a * __b + __c;
+}
+
+static vector short __ATTRS_o_ai
+vec_vmladduhm(vector short __a, vector unsigned short __b, vector unsigned short __c)
+{
+  return __a * (vector short)__b + (vector short)__c;
+}
+
+static vector short __ATTRS_o_ai
+vec_vmladduhm(vector unsigned short __a, vector short __b, vector short __c)
+{
+  return (vector short)__a * __b + __c;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vmladduhm(vector unsigned short __a,
+              vector unsigned short __b,
+              vector unsigned short __c)
+{
+  return __a * __b + __c;
+}
+
+/* vec_mradds */
+
+static vector short __attribute__((__always_inline__))
+vec_mradds(vector short __a, vector short __b, vector short __c)
+{
+  return __builtin_altivec_vmhraddshs(__a, __b, __c);
+}
+
+/* vec_vmhraddshs */
+
+static vector short __attribute__((__always_inline__))
+vec_vmhraddshs(vector short __a, vector short __b, vector short __c)
+{
+  return __builtin_altivec_vmhraddshs(__a, __b, __c);
+}
+
+/* vec_msum */
+
+static vector int __ATTRS_o_ai
+vec_msum(vector signed char __a, vector unsigned char __b, vector int __c)
+{
+  return __builtin_altivec_vmsummbm(__a, __b, __c);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_msum(vector unsigned char __a, vector unsigned char __b, vector unsigned int __c)
+{
+  return __builtin_altivec_vmsumubm(__a, __b, __c);
+}
+
+static vector int __ATTRS_o_ai
+vec_msum(vector short __a, vector short __b, vector int __c)
+{
+  return __builtin_altivec_vmsumshm(__a, __b, __c);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_msum(vector unsigned short __a,
+         vector unsigned short __b,
+         vector unsigned int __c)
+{
+  return __builtin_altivec_vmsumuhm(__a, __b, __c);
+}
+
+/* vec_vmsummbm */
+
+static vector int __attribute__((__always_inline__))
+vec_vmsummbm(vector signed char __a, vector unsigned char __b, vector int __c)
+{
+  return __builtin_altivec_vmsummbm(__a, __b, __c);
+}
+
+/* vec_vmsumubm */
+
+static vector unsigned int __attribute__((__always_inline__))
+vec_vmsumubm(vector unsigned char __a,
+             vector unsigned char __b,
+             vector unsigned int __c)
+{
+  return __builtin_altivec_vmsumubm(__a, __b, __c);
+}
+
+/* vec_vmsumshm */
+
+static vector int __attribute__((__always_inline__))
+vec_vmsumshm(vector short __a, vector short __b, vector int __c)
+{
+  return __builtin_altivec_vmsumshm(__a, __b, __c);
+}
+
+/* vec_vmsumuhm */
+
+static vector unsigned int __attribute__((__always_inline__))
+vec_vmsumuhm(vector unsigned short __a,
+             vector unsigned short __b,
+             vector unsigned int __c)
+{
+  return __builtin_altivec_vmsumuhm(__a, __b, __c);
+}
+
+/* vec_msums */
+
+static vector int __ATTRS_o_ai
+vec_msums(vector short __a, vector short __b, vector int __c)
+{
+  return __builtin_altivec_vmsumshs(__a, __b, __c);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_msums(vector unsigned short __a,
+          vector unsigned short __b,
+          vector unsigned int __c)
+{
+  return __builtin_altivec_vmsumuhs(__a, __b, __c);
+}
+
+/* vec_vmsumshs */
+
+static vector int __attribute__((__always_inline__))
+vec_vmsumshs(vector short __a, vector short __b, vector int __c)
+{
+  return __builtin_altivec_vmsumshs(__a, __b, __c);
+}
+
+/* vec_vmsumuhs */
+
+static vector unsigned int __attribute__((__always_inline__))
+vec_vmsumuhs(vector unsigned short __a,
+             vector unsigned short __b,
+             vector unsigned int __c)
+{
+  return __builtin_altivec_vmsumuhs(__a, __b, __c);
+}
+
+/* vec_mtvscr */
+
+static void __ATTRS_o_ai
+vec_mtvscr(vector signed char __a)
+{
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static void __ATTRS_o_ai
+vec_mtvscr(vector unsigned char __a)
+{
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static void __ATTRS_o_ai
+vec_mtvscr(vector bool char __a)
+{
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static void __ATTRS_o_ai
+vec_mtvscr(vector short __a)
+{
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static void __ATTRS_o_ai
+vec_mtvscr(vector unsigned short __a)
+{
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static void __ATTRS_o_ai
+vec_mtvscr(vector bool short __a)
+{
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static void __ATTRS_o_ai
+vec_mtvscr(vector pixel __a)
+{
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static void __ATTRS_o_ai
+vec_mtvscr(vector int __a)
+{
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static void __ATTRS_o_ai
+vec_mtvscr(vector unsigned int __a)
+{
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static void __ATTRS_o_ai
+vec_mtvscr(vector bool int __a)
+{
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static void __ATTRS_o_ai
+vec_mtvscr(vector float __a)
+{
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+/* The vmulos* and vmules* instructions have a big-endian bias, so
+   we must reverse the meaning of "even" and "odd" for little endian.  */
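+/* A minimal usage sketch (an editor's illustration, not part of the
+ * original header): the operand swap below only changes which hardware
+ * instruction implements the selection; the elements chosen are the
+ * same in either byte order.
+ *
+ *   vector short a = {1, 2, 3, 4, 5, 6, 7, 8};
+ *   vector short b = {10, 20, 30, 40, 50, 60, 70, 80};
+ *   vector int e = vec_mule(a, b);  // {1*10, 3*30, 5*50, 7*70}
+ *   vector int o = vec_mulo(a, b);  // {2*20, 4*40, 6*60, 8*80}
+ */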
+
+/* vec_mule */
+
+static vector short __ATTRS_o_ai
+vec_mule(vector signed char __a, vector signed char __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulosb(__a, __b);
+#else
+  return __builtin_altivec_vmulesb(__a, __b);
+#endif
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_mule(vector unsigned char __a, vector unsigned char __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmuloub(__a, __b);
+#else
+  return __builtin_altivec_vmuleub(__a, __b);
+#endif
+}
+
+static vector int __ATTRS_o_ai
+vec_mule(vector short __a, vector short __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulosh(__a, __b);
+#else
+  return __builtin_altivec_vmulesh(__a, __b);
+#endif
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_mule(vector unsigned short __a, vector unsigned short __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulouh(__a, __b);
+#else
+  return __builtin_altivec_vmuleuh(__a, __b);
+#endif
+}
+
+/* vec_vmulesb */
+
+static vector short __attribute__((__always_inline__))
+vec_vmulesb(vector signed char __a, vector signed char __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulosb(__a, __b);
+#else
+  return __builtin_altivec_vmulesb(__a, __b);
+#endif
+}
+
+/* vec_vmuleub */
+
+static vector unsigned short __attribute__((__always_inline__))
+vec_vmuleub(vector unsigned char __a, vector unsigned char __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmuloub(__a, __b);
+#else
+  return __builtin_altivec_vmuleub(__a, __b);
+#endif
+}
+
+/* vec_vmulesh */
+
+static vector int __attribute__((__always_inline__))
+vec_vmulesh(vector short __a, vector short __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulosh(__a, __b);
+#else
+  return __builtin_altivec_vmulesh(__a, __b);
+#endif
+}
+
+/* vec_vmuleuh */
+
+static vector unsigned int __attribute__((__always_inline__))
+vec_vmuleuh(vector unsigned short __a, vector unsigned short __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulouh(__a, __b);
+#else
+  return __builtin_altivec_vmuleuh(__a, __b);
+#endif
+}
+
+/* vec_mulo */
+
+static vector short __ATTRS_o_ai
+vec_mulo(vector signed char __a, vector signed char __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulesb(__a, __b);
+#else
+  return __builtin_altivec_vmulosb(__a, __b);
+#endif
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_mulo(vector unsigned char __a, vector unsigned char __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmuleub(__a, __b);
+#else
+  return __builtin_altivec_vmuloub(__a, __b);
+#endif
+}
+
+static vector int __ATTRS_o_ai
+vec_mulo(vector short __a, vector short __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulesh(__a, __b);
+#else
+  return __builtin_altivec_vmulosh(__a, __b);
+#endif
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_mulo(vector unsigned short __a, vector unsigned short __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmuleuh(__a, __b);
+#else
+  return __builtin_altivec_vmulouh(__a, __b);
+#endif
+}
+
+/* vec_vmulosb */
+
+static vector short __attribute__((__always_inline__))
+vec_vmulosb(vector signed char __a, vector signed char __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulesb(__a, __b);
+#else
+  return __builtin_altivec_vmulosb(__a, __b);
+#endif
+}
+
+/* vec_vmuloub */
+
+static vector unsigned short __attribute__((__always_inline__))
+vec_vmuloub(vector unsigned char __a, vector unsigned char __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmuleub(__a, __b);
+#else
+  return __builtin_altivec_vmuloub(__a, __b);
+#endif
+}
+
+/* vec_vmulosh */
+
+static vector int __attribute__((__always_inline__))
+vec_vmulosh(vector short __a, vector short __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulesh(__a, __b);
+#else
+  return __builtin_altivec_vmulosh(__a, __b);
+#endif
+}
+
+/* vec_vmulouh */
+
+static vector unsigned int __attribute__((__always_inline__))
+vec_vmulouh(vector unsigned short __a, vector unsigned short __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmuleuh(__a, __b);
+#else
+  return __builtin_altivec_vmulouh(__a, __b);
+#endif
+}
+
+/* vec_nmsub */
+
+static vector float __attribute__((__always_inline__))
+vec_nmsub(vector float __a, vector float __b, vector float __c)
+{
+  return __builtin_altivec_vnmsubfp(__a, __b, __c);
+}
+
+/* vec_vnmsubfp */
+
+static vector float __attribute__((__always_inline__))
+vec_vnmsubfp(vector float __a, vector float __b, vector float __c)
+{
+  return __builtin_altivec_vnmsubfp(__a, __b, __c);
+}
+
+/* vec_nor */
+
+#define __builtin_altivec_vnor vec_nor
+
+static vector signed char __ATTRS_o_ai
+vec_nor(vector signed char __a, vector signed char __b)
+{
+  return ~(__a | __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_nor(vector unsigned char __a, vector unsigned char __b)
+{
+  return ~(__a | __b);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_nor(vector bool char __a, vector bool char __b)
+{
+  return ~(__a | __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_nor(vector short __a, vector short __b)
+{
+  return ~(__a | __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_nor(vector unsigned short __a, vector unsigned short __b)
+{
+  return ~(__a | __b);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_nor(vector bool short __a, vector bool short __b)
+{
+  return ~(__a | __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_nor(vector int __a, vector int __b)
+{
+  return ~(__a | __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_nor(vector unsigned int __a, vector unsigned int __b)
+{
+  return ~(__a | __b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_nor(vector bool int __a, vector bool int __b)
+{
+  return ~(__a | __b);
+}
+
+static vector float __ATTRS_o_ai
+vec_nor(vector float __a, vector float __b)
+{
+  vector unsigned int __res = ~((vector unsigned int)__a | (vector unsigned int)__b);
+  return (vector float)__res;
+}
+
+/* vec_vnor */
+
+static vector signed char __ATTRS_o_ai
+vec_vnor(vector signed char __a, vector signed char __b)
+{
+  return ~(__a | __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vnor(vector unsigned char __a, vector unsigned char __b)
+{
+  return ~(__a | __b);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_vnor(vector bool char __a, vector bool char __b)
+{
+  return ~(__a | __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_vnor(vector short __a, vector short __b)
+{
+  return ~(__a | __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vnor(vector unsigned short __a, vector unsigned short __b)
+{
+  return ~(__a | __b);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_vnor(vector bool short __a, vector bool short __b)
+{
+  return ~(__a | __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_vnor(vector int __a, vector int __b)
+{
+  return ~(__a | __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vnor(vector unsigned int __a, vector unsigned int __b)
+{
+  return ~(__a | __b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_vnor(vector bool int __a, vector bool int __b)
+{
+  return ~(__a | __b);
+}
+
+static vector float __ATTRS_o_ai
+vec_vnor(vector float __a, vector float __b)
+{
+  vector unsigned int __res = ~((vector unsigned int)__a | (vector unsigned int)__b);
+  return (vector float)__res;
+}
+
+/* vec_or */
+
+#define __builtin_altivec_vor vec_or
+
+static vector signed char __ATTRS_o_ai
+vec_or(vector signed char __a, vector signed char __b)
+{
+  return __a | __b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_or(vector bool char __a, vector signed char __b)
+{
+  return (vector signed char)__a | __b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_or(vector signed char __a, vector bool char __b)
+{
+  return __a | (vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_or(vector unsigned char __a, vector unsigned char __b)
+{
+  return __a | __b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_or(vector bool char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)__a | __b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_or(vector unsigned char __a, vector bool char __b)
+{
+  return __a | (vector unsigned char)__b;
+}
+
+static vector bool char __ATTRS_o_ai
+vec_or(vector bool char __a, vector bool char __b)
+{
+  return __a | __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_or(vector short __a, vector short __b)
+{
+  return __a | __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_or(vector bool short __a, vector short __b)
+{
+  return (vector short)__a | __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_or(vector short __a, vector bool short __b)
+{
+  return __a | (vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_or(vector unsigned short __a, vector unsigned short __b)
+{
+  return __a | __b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_or(vector bool short __a, vector unsigned short __b)
+{
+  return (vector unsigned short)__a | __b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_or(vector unsigned short __a, vector bool short __b)
+{
+  return __a | (vector unsigned short)__b;
+}
+
+static vector bool short __ATTRS_o_ai
+vec_or(vector bool short __a, vector bool short __b)
+{
+  return __a | __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_or(vector int __a, vector int __b)
+{
+  return __a | __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_or(vector bool int __a, vector int __b)
+{
+  return (vector int)__a | __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_or(vector int __a, vector bool int __b)
+{
+  return __a | (vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_or(vector unsigned int __a, vector unsigned int __b)
+{
+  return __a | __b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_or(vector bool int __a, vector unsigned int __b)
+{
+  return (vector unsigned int)__a | __b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_or(vector unsigned int __a, vector bool int __b)
+{
+  return __a | (vector unsigned int)__b;
+}
+
+static vector bool int __ATTRS_o_ai
+vec_or(vector bool int __a, vector bool int __b)
+{
+  return __a | __b;
+}
+
+static vector float __ATTRS_o_ai
+vec_or(vector float __a, vector float __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a | (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai
+vec_or(vector bool int __a, vector float __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a | (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai
+vec_or(vector float __a, vector bool int __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a | (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+/* vec_vor */
+
+static vector signed char __ATTRS_o_ai
+vec_vor(vector signed char __a, vector signed char __b)
+{
+  return __a | __b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vor(vector bool char __a, vector signed char __b)
+{
+  return (vector signed char)__a | __b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vor(vector signed char __a, vector bool char __b)
+{
+  return __a | (vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vor(vector unsigned char __a, vector unsigned char __b)
+{
+  return __a | __b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vor(vector bool char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)__a | __b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vor(vector unsigned char __a, vector bool char __b)
+{
+  return __a | (vector unsigned char)__b;
+}
+
+static vector bool char __ATTRS_o_ai
+vec_vor(vector bool char __a, vector bool char __b)
+{
+  return __a | __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_vor(vector short __a, vector short __b)
+{
+  return __a | __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_vor(vector bool short __a, vector short __b)
+{
+  return (vector short)__a | __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_vor(vector short __a, vector bool short __b)
+{
+  return __a | (vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vor(vector unsigned short __a, vector unsigned short __b)
+{
+  return __a | __b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vor(vector bool short __a, vector unsigned short __b)
+{
+  return (vector unsigned short)__a | __b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vor(vector unsigned short __a, vector bool short __b)
+{
+  return __a | (vector unsigned short)__b;
+}
+
+static vector bool short __ATTRS_o_ai
+vec_vor(vector bool short __a, vector bool short __b)
+{
+  return __a | __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_vor(vector int __a, vector int __b)
+{
+  return __a | __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_vor(vector bool int __a, vector int __b)
+{
+  return (vector int)__a | __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_vor(vector int __a, vector bool int __b)
+{
+  return __a | (vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vor(vector unsigned int __a, vector unsigned int __b)
+{
+  return __a | __b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vor(vector bool int __a, vector unsigned int __b)
+{
+  return (vector unsigned int)__a | __b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vor(vector unsigned int __a, vector bool int __b)
+{
+  return __a | (vector unsigned int)__b;
+}
+
+static vector bool int __ATTRS_o_ai
+vec_vor(vector bool int __a, vector bool int __b)
+{
+  return __a | __b;
+}
+
+static vector float __ATTRS_o_ai
+vec_vor(vector float __a, vector float __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a | (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai
+vec_vor(vector bool int __a, vector float __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a | (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai
+vec_vor(vector float __a, vector bool int __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a | (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+/* vec_pack */
+
+/* The various vector pack instructions have a big-endian bias, so for
+   little endian we must handle reversed element numbering.  */
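+/* A minimal sketch (an editor's illustration, not part of the original
+ * header): vec_pack keeps the low-order half of each element, so the
+ * permute indices below pick byte 2*i on little endian (where the low
+ * byte of element i lives) and byte 2*i+1 on big endian.
+ *
+ *   vector unsigned short a = {0x1011, 0x1213, 0x1415, 0x1617,
+ *                              0x1819, 0x1A1B, 0x1C1D, 0x1E1F};
+ *   vector unsigned char r = vec_pack(a, a);
+ *   // r = {0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F,
+ *   //      0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F}
+ */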
+
+static vector signed char __ATTRS_o_ai
+vec_pack(vector signed short __a, vector signed short __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector signed char)vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E,
+     0x10, 0x12, 0x14, 0x16, 0x18, 0x1A, 0x1C, 0x1E));
+#else
+  return (vector signed char)vec_perm(__a, __b, (vector unsigned char)
+    (0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F,
+     0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F));
+#endif
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_pack(vector unsigned short __a, vector unsigned short __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector unsigned char)vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E,
+     0x10, 0x12, 0x14, 0x16, 0x18, 0x1A, 0x1C, 0x1E));
+#else
+  return (vector unsigned char)vec_perm(__a, __b, (vector unsigned char)
+    (0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F,
+     0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F));
+#endif
+}
+
+static vector bool char __ATTRS_o_ai
+vec_pack(vector bool short __a, vector bool short __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool char)vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E,
+     0x10, 0x12, 0x14, 0x16, 0x18, 0x1A, 0x1C, 0x1E));
+#else
+  return (vector bool char)vec_perm(__a, __b, (vector unsigned char)
+    (0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F,
+     0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F));
+#endif
+}
+
+static vector short __ATTRS_o_ai
+vec_pack(vector int __a, vector int __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector short)vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0C, 0x0D,
+     0x10, 0x11, 0x14, 0x15, 0x18, 0x19, 0x1C, 0x1D));
+#else
+  return (vector short)vec_perm(__a, __b, (vector unsigned char)
+    (0x02, 0x03, 0x06, 0x07, 0x0A, 0x0B, 0x0E, 0x0F,
+     0x12, 0x13, 0x16, 0x17, 0x1A, 0x1B, 0x1E, 0x1F));
+#endif
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_pack(vector unsigned int __a, vector unsigned int __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector unsigned short)vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0C, 0x0D,
+     0x10, 0x11, 0x14, 0x15, 0x18, 0x19, 0x1C, 0x1D));
+#else
+  return (vector unsigned short)vec_perm(__a, __b, (vector unsigned char)
+    (0x02, 0x03, 0x06, 0x07, 0x0A, 0x0B, 0x0E, 0x0F,
+     0x12, 0x13, 0x16, 0x17, 0x1A, 0x1B, 0x1E, 0x1F));
+#endif
+}
+
+static vector bool short __ATTRS_o_ai
+vec_pack(vector bool int __a, vector bool int __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool short)vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0C, 0x0D,
+     0x10, 0x11, 0x14, 0x15, 0x18, 0x19, 0x1C, 0x1D));
+#else
+  return (vector bool short)vec_perm(__a, __b, (vector unsigned char)
+    (0x02, 0x03, 0x06, 0x07, 0x0A, 0x0B, 0x0E, 0x0F,
+     0x12, 0x13, 0x16, 0x17, 0x1A, 0x1B, 0x1E, 0x1F));
+#endif
+}
+
+/* vec_vpkuhum */
+
+#define __builtin_altivec_vpkuhum vec_vpkuhum
+
+static vector signed char __ATTRS_o_ai
+vec_vpkuhum(vector signed short __a, vector signed short __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector signed char)vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E,
+     0x10, 0x12, 0x14, 0x16, 0x18, 0x1A, 0x1C, 0x1E));
+#else
+  return (vector signed char)vec_perm(__a, __b, (vector unsigned char)
+    (0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F,
+     0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F));
+#endif
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vpkuhum(vector unsigned short __a, vector unsigned short __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector unsigned char)vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E,
+     0x10, 0x12, 0x14, 0x16, 0x18, 0x1A, 0x1C, 0x1E));
+#else
+  return (vector unsigned char)vec_perm(__a, __b, (vector unsigned char)
+    (0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F,
+     0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F));
+#endif
+}
+
+static vector bool char __ATTRS_o_ai
+vec_vpkuhum(vector bool short __a, vector bool short __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool char)vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E,
+     0x10, 0x12, 0x14, 0x16, 0x18, 0x1A, 0x1C, 0x1E));
+#else
+  return (vector bool char)vec_perm(__a, __b, (vector unsigned char)
+    (0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F,
+     0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F));
+#endif
+}
+
+/* vec_vpkuwum */
+
+#define __builtin_altivec_vpkuwum vec_vpkuwum
+
+static vector short __ATTRS_o_ai
+vec_vpkuwum(vector int __a, vector int __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector short)vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0C, 0x0D,
+     0x10, 0x11, 0x14, 0x15, 0x18, 0x19, 0x1C, 0x1D));
+#else
+  return (vector short)vec_perm(__a, __b, (vector unsigned char)
+    (0x02, 0x03, 0x06, 0x07, 0x0A, 0x0B, 0x0E, 0x0F,
+     0x12, 0x13, 0x16, 0x17, 0x1A, 0x1B, 0x1E, 0x1F));
+#endif
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vpkuwum(vector unsigned int __a, vector unsigned int __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector unsigned short)vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0C, 0x0D,
+     0x10, 0x11, 0x14, 0x15, 0x18, 0x19, 0x1C, 0x1D));
+#else
+  return (vector unsigned short)vec_perm(__a, __b, (vector unsigned char)
+    (0x02, 0x03, 0x06, 0x07, 0x0A, 0x0B, 0x0E, 0x0F,
+     0x12, 0x13, 0x16, 0x17, 0x1A, 0x1B, 0x1E, 0x1F));
+#endif
+}
+
+static vector bool short __ATTRS_o_ai
+vec_vpkuwum(vector bool int __a, vector bool int __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool short)vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0C, 0x0D,
+     0x10, 0x11, 0x14, 0x15, 0x18, 0x19, 0x1C, 0x1D));
+#else
+  return (vector bool short)vec_perm(__a, __b, (vector unsigned char)
+    (0x02, 0x03, 0x06, 0x07, 0x0A, 0x0B, 0x0E, 0x0F,
+     0x12, 0x13, 0x16, 0x17, 0x1A, 0x1B, 0x1E, 0x1F));
+#endif
+}
+
+/* vec_packpx */
+
+static vector pixel __attribute__((__always_inline__))
+vec_packpx(vector unsigned int __a, vector unsigned int __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector pixel)__builtin_altivec_vpkpx(__b, __a);
+#else
+  return (vector pixel)__builtin_altivec_vpkpx(__a, __b);
+#endif
+}
+
+/* vec_vpkpx */
+
+static vector pixel __attribute__((__always_inline__))
+vec_vpkpx(vector unsigned int __a, vector unsigned int __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector pixel)__builtin_altivec_vpkpx(__b, __a);
+#else
+  return (vector pixel)__builtin_altivec_vpkpx(__a, __b);
+#endif
+}
+
+/* vec_packs */
+
+static vector signed char __ATTRS_o_ai
+vec_packs(vector short __a, vector short __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkshss(__b, __a);
+#else
+  return __builtin_altivec_vpkshss(__a, __b);
+#endif
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_packs(vector unsigned short __a, vector unsigned short __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkuhus(__b, __a);
+#else
+  return __builtin_altivec_vpkuhus(__a, __b);
+#endif
+}
+
+static vector signed short __ATTRS_o_ai
+vec_packs(vector int __a, vector int __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkswss(__b, __a);
+#else
+  return __builtin_altivec_vpkswss(__a, __b);
+#endif
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_packs(vector unsigned int __a, vector unsigned int __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkuwus(__b, __a);
+#else
+  return __builtin_altivec_vpkuwus(__a, __b);
+#endif
+}
+
+/* vec_vpkshss */
+
+static vector signed char __attribute__((__always_inline__))
+vec_vpkshss(vector short __a, vector short __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkshss(__b, __a);
+#else
+  return __builtin_altivec_vpkshss(__a, __b);
+#endif
+}
+
+/* vec_vpkuhus */
+
+static vector unsigned char __attribute__((__always_inline__))
+vec_vpkuhus(vector unsigned short __a, vector unsigned short __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkuhus(__b, __a);
+#else
+  return __builtin_altivec_vpkuhus(__a, __b);
+#endif
+}
+
+/* vec_vpkswss */
+
+static vector signed short __attribute__((__always_inline__))
+vec_vpkswss(vector int __a, vector int __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkswss(__b, __a);
+#else
+  return __builtin_altivec_vpkswss(__a, __b);
+#endif
+}
+
+/* vec_vpkuwus */
+
+static vector unsigned short __attribute__((__always_inline__))
+vec_vpkuwus(vector unsigned int __a, vector unsigned int __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkuwus(__b, __a);
+#else
+  return __builtin_altivec_vpkuwus(__a, __b);
+#endif
+}
+
+/* vec_packsu */
+
+static vector unsigned char __ATTRS_o_ai
+vec_packsu(vector short __a, vector short __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkshus(__b, __a);
+#else
+  return __builtin_altivec_vpkshus(__a, __b);
+#endif
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_packsu(vector unsigned short __a, vector unsigned short __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkuhus(__b, __a);
+#else
+  return __builtin_altivec_vpkuhus(__a, __b);
+#endif
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_packsu(vector int __a, vector int __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkswus(__b, __a);
+#else
+  return __builtin_altivec_vpkswus(__a, __b);
+#endif
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_packsu(vector unsigned int __a, vector unsigned int __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkuwus(__b, __a);
+#else
+  return __builtin_altivec_vpkuwus(__a, __b);
+#endif
+}
+
+/* vec_vpkshus */
+
+static vector unsigned char __ATTRS_o_ai
+vec_vpkshus(vector short __a, vector short __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkshus(__b, __a);
+#else
+  return __builtin_altivec_vpkshus(__a, __b);
+#endif
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vpkshus(vector unsigned short __a, vector unsigned short __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkuhus(__b, __a);
+#else
+  return __builtin_altivec_vpkuhus(__a, __b);
+#endif
+}
+
+/* vec_vpkswus */
+
+static vector unsigned short __ATTRS_o_ai
+vec_vpkswus(vector int __a, vector int __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkswus(__b, __a);
+#else
+  return __builtin_altivec_vpkswus(__a, __b);
+#endif
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vpkswus(vector unsigned int __a, vector unsigned int __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkuwus(__b, __a);
+#else
+  return __builtin_altivec_vpkuwus(__a, __b);
+#endif
+}
+
+/* vec_perm */
+
+// The vperm instruction is defined architecturally with a big-endian bias.
+// For little endian, we swap the input operands and invert the permute
+// control vector.  Only the rightmost 5 bits matter, so we could use
+// a vector of all 31s instead of all 255s to perform the inversion.
+// However, when the PCV is not a constant, using 255 has an advantage
+// in that the vec_xor can be recognized as a vec_nor (and for P8 and
+// later, possibly a vec_nand).
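+//
+// A small worked example (an editor's note, not from the original
+// header): only the low 5 bits of each control byte are used, and for
+// any index i in 0..31, i ^ 31 == 31 - i, so XOR-ing with all-ones
+// mirrors each byte index across the 32-byte double register; together
+// with swapping __a and __b this reproduces the big-endian selection.
+// E.g. 0 ^ 31 == 31, 5 ^ 31 == 26, 16 ^ 31 == 15.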
+
+static vector signed char __ATTRS_o_ai
+vec_perm(vector signed char __a, vector signed char __b, vector unsigned char __c)
+{
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255,255,255,255,255,255,255,255,
+                              255,255,255,255,255,255,255,255};
+  __d = vec_xor(__c, __d);
+  return (vector signed char)
+           __builtin_altivec_vperm_4si((vector int)__b, (vector int)__a, __d);
+#else
+  return (vector signed char)
+           __builtin_altivec_vperm_4si((vector int)__a, (vector int)__b, __c);
+#endif
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_perm(vector unsigned char __a,
+         vector unsigned char __b,
+         vector unsigned char __c)
+{
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255,255,255,255,255,255,255,255,
+                              255,255,255,255,255,255,255,255};
+  __d = vec_xor(__c, __d);
+  return (vector unsigned char)
+           __builtin_altivec_vperm_4si((vector int)__b, (vector int)__a, __d);
+#else
+  return (vector unsigned char)
+           __builtin_altivec_vperm_4si((vector int)__a, (vector int)__b, __c);
+#endif
+}
+
+static vector bool char __ATTRS_o_ai
+vec_perm(vector bool char __a, vector bool char __b, vector unsigned char __c)
+{
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255,255,255,255,255,255,255,255,
+                              255,255,255,255,255,255,255,255};
+  __d = vec_xor(__c, __d);
+  return (vector bool char)
+           __builtin_altivec_vperm_4si((vector int)__b, (vector int)__a, __d);
+#else
+  return (vector bool char)
+           __builtin_altivec_vperm_4si((vector int)__a, (vector int)__b, __c);
+#endif
+}
+
+static vector short __ATTRS_o_ai
+vec_perm(vector short __a, vector short __b, vector unsigned char __c)
+{
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255,255,255,255,255,255,255,255,
+                              255,255,255,255,255,255,255,255};
+  __d = vec_xor(__c, __d);
+  return (vector short)
+           __builtin_altivec_vperm_4si((vector int)__b, (vector int)__a, __d);
+#else
+  return (vector short)
+           __builtin_altivec_vperm_4si((vector int)__a, (vector int)__b, __c);
+#endif
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_perm(vector unsigned short __a,
+         vector unsigned short __b,
+         vector unsigned char __c)
+{
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255,255,255,255,255,255,255,255,
+                              255,255,255,255,255,255,255,255};
+  __d = vec_xor(__c, __d);
+  return (vector unsigned short)
+           __builtin_altivec_vperm_4si((vector int)__b, (vector int)__a, __d);
+#else
+  return (vector unsigned short)
+           __builtin_altivec_vperm_4si((vector int)__a, (vector int)__b, __c);
+#endif
+}
+
+static vector bool short __ATTRS_o_ai
+vec_perm(vector bool short __a, vector bool short __b, vector unsigned char __c)
+{
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255,255,255,255,255,255,255,255,
+                              255,255,255,255,255,255,255,255};
+  __d = vec_xor(__c, __d);
+  return (vector bool short)
+           __builtin_altivec_vperm_4si((vector int)__b, (vector int)__a, __d);
+#else
+  return (vector bool short)
+           __builtin_altivec_vperm_4si((vector int)__a, (vector int)__b, __c);
+#endif
+}
+
+static vector pixel __ATTRS_o_ai
+vec_perm(vector pixel __a, vector pixel __b, vector unsigned char __c)
+{
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255,255,255,255,255,255,255,255,
+                              255,255,255,255,255,255,255,255};
+  __d = vec_xor(__c, __d);
+  return (vector pixel)
+           __builtin_altivec_vperm_4si((vector int)__b, (vector int)__a, __d);
+#else
+  return (vector pixel)
+           __builtin_altivec_vperm_4si((vector int)__a, (vector int)__b, __c);
+#endif
+}
+
+static vector int __ATTRS_o_ai
+vec_perm(vector int __a, vector int __b, vector unsigned char __c)
+{
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255,255,255,255,255,255,255,255,
+                              255,255,255,255,255,255,255,255};
+  __d = vec_xor(__c, __d);
+  return (vector int)__builtin_altivec_vperm_4si(__b, __a, __d);
+#else
+  return (vector int)__builtin_altivec_vperm_4si(__a, __b, __c);
+#endif
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_perm(vector unsigned int __a, vector unsigned int __b, vector unsigned char __c)
+{
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255,255,255,255,255,255,255,255,
+                              255,255,255,255,255,255,255,255};
+  __d = vec_xor(__c, __d);
+  return (vector unsigned int)
+           __builtin_altivec_vperm_4si((vector int)__b, (vector int)__a, __d);
+#else
+  return (vector unsigned int)
+           __builtin_altivec_vperm_4si((vector int)__a, (vector int)__b, __c);
+#endif
+}
+
+static vector bool int __ATTRS_o_ai
+vec_perm(vector bool int __a, vector bool int __b, vector unsigned char __c)
+{
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255,255,255,255,255,255,255,255,
+                              255,255,255,255,255,255,255,255};
+  __d = vec_xor(__c, __d);
+  return (vector bool int)
+           __builtin_altivec_vperm_4si((vector int)__b, (vector int)__a, __d);
+#else
+  return (vector bool int)
+           __builtin_altivec_vperm_4si((vector int)__a, (vector int)__b, __c);
+#endif
+}
+
+static vector float __ATTRS_o_ai
+vec_perm(vector float __a, vector float __b, vector unsigned char __c)
+{
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255,255,255,255,255,255,255,255,
+                              255,255,255,255,255,255,255,255};
+  __d = vec_xor(__c, __d);
+  return (vector float)
+           __builtin_altivec_vperm_4si((vector int)__b, (vector int)__a, __d);
+#else
+  return (vector float)
+           __builtin_altivec_vperm_4si((vector int)__a, (vector int)__b, __c);
+#endif
+}
+
+/* vec_vperm */
+
+static vector signed char __ATTRS_o_ai
+vec_vperm(vector signed char __a, vector signed char __b, vector unsigned char __c)
+{
+  return vec_perm(__a, __b, __c);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vperm(vector unsigned char __a,
+          vector unsigned char __b,
+          vector unsigned char __c)
+{
+  return vec_perm(__a, __b, __c);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_vperm(vector bool char __a, vector bool char __b, vector unsigned char __c)
+{
+  return vec_perm(__a, __b, __c);
+}
+
+static vector short __ATTRS_o_ai
+vec_vperm(vector short __a, vector short __b, vector unsigned char __c)
+{
+  return vec_perm(__a, __b, __c);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vperm(vector unsigned short __a,
+          vector unsigned short __b,
+          vector unsigned char __c)
+{
+  return vec_perm(__a, __b, __c);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_vperm(vector bool short __a, vector bool short __b, vector unsigned char __c)
+{
+  return vec_perm(__a, __b, __c);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_vperm(vector pixel __a, vector pixel __b, vector unsigned char __c)
+{
+  return vec_perm(__a, __b, __c);
+}
+
+static vector int __ATTRS_o_ai
+vec_vperm(vector int __a, vector int __b, vector unsigned char __c)
+{
+  return vec_perm(__a, __b, __c);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vperm(vector unsigned int __a, vector unsigned int __b, vector unsigned char __c)
+{
+  return vec_perm(__a, __b, __c);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_vperm(vector bool int __a, vector bool int __b, vector unsigned char __c)
+{
+  return vec_perm(__a, __b, __c);
+}
+
+static vector float __ATTRS_o_ai
+vec_vperm(vector float __a, vector float __b, vector unsigned char __c)
+{
+  return vec_perm(__a, __b, __c);
+}
+
+/* vec_re */
+
+static vector float __attribute__((__always_inline__))
+vec_re(vector float __a)
+{
+  return __builtin_altivec_vrefp(__a);
+}
+
+/* vec_vrefp */
+
+static vector float __attribute__((__always_inline__))
+vec_vrefp(vector float __a)
+{
+  return __builtin_altivec_vrefp(__a);
+}
+
+/* vec_rl */
+
+static vector signed char __ATTRS_o_ai
+vec_rl(vector signed char __a, vector unsigned char __b)
+{
+  return (vector signed char)__builtin_altivec_vrlb((vector char)__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_rl(vector unsigned char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)__builtin_altivec_vrlb((vector char)__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_rl(vector short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vrlh(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_rl(vector unsigned short __a, vector unsigned short __b)
+{
+  return (vector unsigned short)__builtin_altivec_vrlh((vector short)__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_rl(vector int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vrlw(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_rl(vector unsigned int __a, vector unsigned int __b)
+{
+  return (vector unsigned int)__builtin_altivec_vrlw((vector int)__a, __b);
+}
+
+/* vec_vrlb */
+
+static vector signed char __ATTRS_o_ai
+vec_vrlb(vector signed char __a, vector unsigned char __b)
+{
+  return (vector signed char)__builtin_altivec_vrlb((vector char)__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vrlb(vector unsigned char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)__builtin_altivec_vrlb((vector char)__a, __b);
+}
+
+/* vec_vrlh */
+
+static vector short __ATTRS_o_ai
+vec_vrlh(vector short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vrlh(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vrlh(vector unsigned short __a, vector unsigned short __b)
+{
+  return (vector unsigned short)__builtin_altivec_vrlh((vector short)__a, __b);
+}
+
+/* vec_vrlw */
+
+static vector int __ATTRS_o_ai
+vec_vrlw(vector int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vrlw(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vrlw(vector unsigned int __a, vector unsigned int __b)
+{
+  return (vector unsigned int)__builtin_altivec_vrlw((vector int)__a, __b);
+}
+
+/* vec_round */
+
+static vector float __attribute__((__always_inline__))
+vec_round(vector float __a)
+{
+  return __builtin_altivec_vrfin(__a);
+}
+
+/* vec_vrfin */
+
+static vector float __attribute__((__always_inline__))
+vec_vrfin(vector float __a)
+{
+  return __builtin_altivec_vrfin(__a);
+}
+
+/* vec_rsqrte */
+
+static __vector float __attribute__((__always_inline__))
+vec_rsqrte(vector float __a)
+{
+  return __builtin_altivec_vrsqrtefp(__a);
+}
+
+/* vec_vrsqrtefp */
+
+static __vector float __attribute__((__always_inline__))
+vec_vrsqrtefp(vector float __a)
+{
+  return __builtin_altivec_vrsqrtefp(__a);
+}
+
+/* vec_sel */
+
+#define __builtin_altivec_vsel_4si vec_sel
+
+static vector signed char __ATTRS_o_ai
+vec_sel(vector signed char __a, vector signed char __b, vector unsigned char __c)
+{
+  return (__a & ~(vector signed char)__c) | (__b & (vector signed char)__c);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_sel(vector signed char __a, vector signed char __b, vector bool char __c)
+{
+  return (__a & ~(vector signed char)__c) | (__b & (vector signed char)__c);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_sel(vector unsigned char __a, vector unsigned char __b, vector unsigned char __c)
+{
+  return (__a & ~__c) | (__b & __c);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_sel(vector unsigned char __a, vector unsigned char __b, vector bool char __c)
+{
+  return (__a & ~(vector unsigned char)__c) | (__b & (vector unsigned char)__c);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_sel(vector bool char __a, vector bool char __b, vector unsigned char __c)
+{
+  return (__a & ~(vector bool char)__c) | (__b & (vector bool char)__c);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_sel(vector bool char __a, vector bool char __b, vector bool char __c)
+{
+  return (__a & ~__c) | (__b & __c);
+}
+
+static vector short __ATTRS_o_ai
+vec_sel(vector short __a, vector short __b, vector unsigned short __c)
+{
+  return (__a & ~(vector short)__c) | (__b & (vector short)__c);
+}
+
+static vector short __ATTRS_o_ai
+vec_sel(vector short __a, vector short __b, vector bool short __c)
+{
+  return (__a & ~(vector short)__c) | (__b & (vector short)__c);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_sel(vector unsigned short __a,
+        vector unsigned short __b,
+        vector unsigned short __c)
+{
+  return (__a & ~__c) | (__b & __c);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_sel(vector unsigned short __a, vector unsigned short __b, vector bool short __c)
+{
+  return (__a & ~(vector unsigned short)__c) | (__b & (vector unsigned short)__c);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_sel(vector bool short __a, vector bool short __b, vector unsigned short __c)
+{
+  return (__a & ~(vector bool short)__c) | (__b & (vector bool short)__c);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_sel(vector bool short __a, vector bool short __b, vector bool short __c)
+{
+  return (__a & ~__c) | (__b & __c);
+}
+
+static vector int __ATTRS_o_ai
+vec_sel(vector int __a, vector int __b, vector unsigned int __c)
+{
+  return (__a & ~(vector int)__c) | (__b & (vector int)__c);
+}
+
+static vector int __ATTRS_o_ai
+vec_sel(vector int __a, vector int __b, vector bool int __c)
+{
+  return (__a & ~(vector int)__c) | (__b & (vector int)__c);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_sel(vector unsigned int __a, vector unsigned int __b, vector unsigned int __c)
+{
+  return (__a & ~__c) | (__b & __c);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_sel(vector unsigned int __a, vector unsigned int __b, vector bool int __c)
+{
+  return (__a & ~(vector unsigned int)__c) | (__b & (vector unsigned int)__c);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_sel(vector bool int __a, vector bool int __b, vector unsigned int __c)
+{
+  return (__a & ~(vector bool int)__c) | (__b & (vector bool int)__c);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_sel(vector bool int __a, vector bool int __b, vector bool int __c)
+{
+  return (__a & ~__c) | (__b & __c);
+}
+
+static vector float __ATTRS_o_ai
+vec_sel(vector float __a, vector float __b, vector unsigned int __c)
+{
+  vector int __res = ((vector int)__a & ~(vector int)__c)
+                   | ((vector int)__b & (vector int)__c);
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai
+vec_sel(vector float __a, vector float __b, vector bool int __c)
+{
+  vector int __res = ((vector int)__a & ~(vector int)__c)
+                   | ((vector int)__b & (vector int)__c);
+  return (vector float)__res;
+}
+
+/* vec_vsel */
+
+static vector signed char __ATTRS_o_ai
+vec_vsel(vector signed char __a, vector signed char __b, vector unsigned char __c)
+{
+  return (__a & ~(vector signed char)__c) | (__b & (vector signed char)__c);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vsel(vector signed char __a, vector signed char __b, vector bool char __c)
+{
+  return (__a & ~(vector signed char)__c) | (__b & (vector signed char)__c);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vsel(vector unsigned char __a, vector unsigned char __b, vector unsigned char __c)
+{
+  return (__a & ~__c) | (__b & __c);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vsel(vector unsigned char __a, vector unsigned char __b, vector bool char __c)
+{
+  return (__a & ~(vector unsigned char)__c) | (__b & (vector unsigned char)__c);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_vsel(vector bool char __a, vector bool char __b, vector unsigned char __c)
+{
+  return (__a & ~(vector bool char)__c) | (__b & (vector bool char)__c);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_vsel(vector bool char __a, vector bool char __b, vector bool char __c)
+{
+  return (__a & ~__c) | (__b & __c);
+}
+
+static vector short __ATTRS_o_ai
+vec_vsel(vector short __a, vector short __b, vector unsigned short __c)
+{
+  return (__a & ~(vector short)__c) | (__b & (vector short)__c);
+}
+
+static vector short __ATTRS_o_ai
+vec_vsel(vector short __a, vector short __b, vector bool short __c)
+{
+  return (__a & ~(vector short)__c) | (__b & (vector short)__c);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vsel(vector unsigned short __a,
+         vector unsigned short __b,
+         vector unsigned short __c)
+{
+  return (__a & ~__c) | (__b & __c);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vsel(vector unsigned short __a, vector unsigned short __b, vector bool short __c)
+{
+  return (__a & ~(vector unsigned short)__c) | (__b & (vector unsigned short)__c);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_vsel(vector bool short __a, vector bool short __b, vector unsigned short __c)
+{
+  return (__a & ~(vector bool short)__c) | (__b & (vector bool short)__c);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_vsel(vector bool short __a, vector bool short __b, vector bool short __c)
+{
+  return (__a & ~__c) | (__b & __c);
+}
+
+static vector int __ATTRS_o_ai
+vec_vsel(vector int __a, vector int __b, vector unsigned int __c)
+{
+  return (__a & ~(vector int)__c) | (__b & (vector int)__c);
+}
+
+static vector int __ATTRS_o_ai
+vec_vsel(vector int __a, vector int __b, vector bool int __c)
+{
+  return (__a & ~(vector int)__c) | (__b & (vector int)__c);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vsel(vector unsigned int __a, vector unsigned int __b, vector unsigned int __c)
+{
+  return (__a & ~__c) | (__b & __c);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vsel(vector unsigned int __a, vector unsigned int __b, vector bool int __c)
+{
+  return (__a & ~(vector unsigned int)__c) | (__b & (vector unsigned int)__c);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_vsel(vector bool int __a, vector bool int __b, vector unsigned int __c)
+{
+  return (__a & ~(vector bool int)__c) | (__b & (vector bool int)__c);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_vsel(vector bool int __a, vector bool int __b, vector bool int __c)
+{
+  return (__a & ~__c) | (__b & __c);
+}
+
+static vector float __ATTRS_o_ai
+vec_vsel(vector float __a, vector float __b, vector unsigned int __c)
+{
+  vector int __res = ((vector int)__a & ~(vector int)__c)
+                   | ((vector int)__b & (vector int)__c);
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai
+vec_vsel(vector float __a, vector float __b, vector bool int __c)
+{
+  vector int __res = ((vector int)__a & ~(vector int)__c)
+                   | ((vector int)__b & (vector int)__c);
+  return (vector float)__res;
+}
+
+/* vec_sl */
+
+static vector signed char __ATTRS_o_ai
+vec_sl(vector signed char __a, vector unsigned char __b)
+{
+  return __a << (vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_sl(vector unsigned char __a, vector unsigned char __b)
+{
+  return __a << __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_sl(vector short __a, vector unsigned short __b)
+{
+  return __a << (vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_sl(vector unsigned short __a, vector unsigned short __b)
+{
+  return __a << __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_sl(vector int __a, vector unsigned int __b)
+{
+  return __a << (vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_sl(vector unsigned int __a, vector unsigned int __b)
+{
+  return __a << __b;
+}
+
+/* vec_vslb */
+
+#define __builtin_altivec_vslb vec_vslb
+
+static vector signed char __ATTRS_o_ai
+vec_vslb(vector signed char __a, vector unsigned char __b)
+{
+  return vec_sl(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vslb(vector unsigned char __a, vector unsigned char __b)
+{
+  return vec_sl(__a, __b);
+}
+
+/* vec_vslh */
+
+#define __builtin_altivec_vslh vec_vslh
+
+static vector short __ATTRS_o_ai
+vec_vslh(vector short __a, vector unsigned short __b)
+{
+  return vec_sl(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vslh(vector unsigned short __a, vector unsigned short __b)
+{
+  return vec_sl(__a, __b);
+}
+
+/* vec_vslw */
+
+#define __builtin_altivec_vslw vec_vslw
+
+static vector int __ATTRS_o_ai
+vec_vslw(vector int __a, vector unsigned int __b)
+{
+  return vec_sl(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vslw(vector unsigned int __a, vector unsigned int __b)
+{
+  return vec_sl(__a, __b);
+}
+
+/* vec_sld */
+
+#define __builtin_altivec_vsldoi_4si vec_sld
+
+static vector signed char __ATTRS_o_ai
+vec_sld(vector signed char __a, vector signed char __b, unsigned char __c)
+{
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c-1, __c-2, __c-3, __c-4, __c-5, __c-6, __c-7,
+     __c-8, __c-9, __c-10, __c-11, __c-12, __c-13, __c-14, __c-15));
+#else
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c+1, __c+2,  __c+3,  __c+4,  __c+5,  __c+6,  __c+7,
+     __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15));
+#endif
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_sld(vector unsigned char __a, vector unsigned char __b, unsigned char __c)
+{
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c-1, __c-2, __c-3, __c-4, __c-5, __c-6, __c-7,
+     __c-8, __c-9, __c-10, __c-11, __c-12, __c-13, __c-14, __c-15));
+#else
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c+1, __c+2,  __c+3,  __c+4,  __c+5,  __c+6,  __c+7,
+     __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15));
+#endif
+}
+
+static vector short __ATTRS_o_ai
+vec_sld(vector short __a, vector short __b, unsigned char __c)
+{
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c-1, __c-2, __c-3, __c-4, __c-5, __c-6, __c-7,
+     __c-8, __c-9, __c-10, __c-11, __c-12, __c-13, __c-14, __c-15));
+#else
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c+1, __c+2,  __c+3,  __c+4,  __c+5,  __c+6,  __c+7,
+     __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15));
+#endif
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_sld(vector unsigned short __a, vector unsigned short __b, unsigned char __c)
+{
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c-1, __c-2, __c-3, __c-4, __c-5, __c-6, __c-7,
+     __c-8, __c-9, __c-10, __c-11, __c-12, __c-13, __c-14, __c-15));
+#else
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c+1, __c+2,  __c+3,  __c+4,  __c+5,  __c+6,  __c+7,
+     __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15));
+#endif
+}
+
+static vector pixel __ATTRS_o_ai
+vec_sld(vector pixel __a, vector pixel __b, unsigned char __c)
+{
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c-1, __c-2, __c-3, __c-4, __c-5, __c-6, __c-7,
+     __c-8, __c-9, __c-10, __c-11, __c-12, __c-13, __c-14, __c-15));
+#else
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c+1, __c+2,  __c+3,  __c+4,  __c+5,  __c+6,  __c+7,
+     __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15));
+#endif
+}
+
+static vector int __ATTRS_o_ai
+vec_sld(vector int __a, vector int __b, unsigned char __c)
+{
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c-1, __c-2, __c-3, __c-4, __c-5, __c-6, __c-7,
+     __c-8, __c-9, __c-10, __c-11, __c-12, __c-13, __c-14, __c-15));
+#else
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c+1, __c+2,  __c+3,  __c+4,  __c+5,  __c+6,  __c+7,
+     __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15));
+#endif
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_sld(vector unsigned int __a, vector unsigned int __b, unsigned char __c)
+{
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c-1, __c-2, __c-3, __c-4, __c-5, __c-6, __c-7,
+     __c-8, __c-9, __c-10, __c-11, __c-12, __c-13, __c-14, __c-15));
+#else
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c+1, __c+2,  __c+3,  __c+4,  __c+5,  __c+6,  __c+7,
+     __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15));
+#endif
+}
+
+static vector float __ATTRS_o_ai
+vec_sld(vector float __a, vector float __b, unsigned char __c)
+{
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c-1, __c-2, __c-3, __c-4, __c-5, __c-6, __c-7,
+     __c-8, __c-9, __c-10, __c-11, __c-12, __c-13, __c-14, __c-15));
+#else
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c+1, __c+2,  __c+3,  __c+4,  __c+5,  __c+6,  __c+7,
+     __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15));
+#endif
+}
+
+/* vec_vsldoi */
+
+static vector signed char __ATTRS_o_ai
+vec_vsldoi(vector signed char __a, vector signed char __b, unsigned char __c)
+{
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c-1, __c-2, __c-3, __c-4, __c-5, __c-6, __c-7,
+     __c-8, __c-9, __c-10, __c-11, __c-12, __c-13, __c-14, __c-15));
+#else
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c+1, __c+2,  __c+3,  __c+4,  __c+5,  __c+6,  __c+7,
+     __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15));
+#endif
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vsldoi(vector unsigned char __a, vector unsigned char __b, unsigned char __c)
+{
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c-1, __c-2, __c-3, __c-4, __c-5, __c-6, __c-7,
+     __c-8, __c-9, __c-10, __c-11, __c-12, __c-13, __c-14, __c-15));
+#else
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c+1, __c+2,  __c+3,  __c+4,  __c+5,  __c+6,  __c+7,
+     __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15));
+#endif
+}
+
+static vector short __ATTRS_o_ai
+vec_vsldoi(vector short __a, vector short __b, unsigned char __c)
+{
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c-1, __c-2, __c-3, __c-4, __c-5, __c-6, __c-7,
+     __c-8, __c-9, __c-10, __c-11, __c-12, __c-13, __c-14, __c-15));
+#else
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c+1, __c+2,  __c+3,  __c+4,  __c+5,  __c+6,  __c+7,
+     __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15));
+#endif
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vsldoi(vector unsigned short __a, vector unsigned short __b, unsigned char __c)
+{
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c-1, __c-2, __c-3, __c-4, __c-5, __c-6, __c-7,
+     __c-8, __c-9, __c-10, __c-11, __c-12, __c-13, __c-14, __c-15));
+#else
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c+1, __c+2,  __c+3,  __c+4,  __c+5,  __c+6,  __c+7,
+     __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15));
+#endif
+}
+
+static vector pixel __ATTRS_o_ai
+vec_vsldoi(vector pixel __a, vector pixel __b, unsigned char __c)
+{
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c-1, __c-2, __c-3, __c-4, __c-5, __c-6, __c-7,
+     __c-8, __c-9, __c-10, __c-11, __c-12, __c-13, __c-14, __c-15));
+#else
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c+1, __c+2,  __c+3,  __c+4,  __c+5,  __c+6,  __c+7,
+     __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15));
+#endif
+}
+
+static vector int __ATTRS_o_ai
+vec_vsldoi(vector int __a, vector int __b, unsigned char __c)
+{
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c-1, __c-2, __c-3, __c-4, __c-5, __c-6, __c-7,
+     __c-8, __c-9, __c-10, __c-11, __c-12, __c-13, __c-14, __c-15));
+#else
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c+1, __c+2,  __c+3,  __c+4,  __c+5,  __c+6,  __c+7,
+     __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15));
+#endif
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vsldoi(vector unsigned int __a, vector unsigned int __b, unsigned char __c)
+{
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c-1, __c-2, __c-3, __c-4, __c-5, __c-6, __c-7,
+     __c-8, __c-9, __c-10, __c-11, __c-12, __c-13, __c-14, __c-15));
+#else
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c+1, __c+2,  __c+3,  __c+4,  __c+5,  __c+6,  __c+7,
+     __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15));
+#endif
+}
+
+static vector float __ATTRS_o_ai
+vec_vsldoi(vector float __a, vector float __b, unsigned char __c)
+{
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c-1, __c-2, __c-3, __c-4, __c-5, __c-6, __c-7,
+     __c-8, __c-9, __c-10, __c-11, __c-12, __c-13, __c-14, __c-15));
+#else
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c+1, __c+2,  __c+3,  __c+4,  __c+5,  __c+6,  __c+7,
+     __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15));
+#endif
+}
+
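+/* Usage sketch (illustrative editor note, not part of the intrinsic
+ * definitions above): vec_sld concatenates __a and __b and returns the
+ * sixteen bytes starting at byte offset __c of that concatenation
+ * (big-endian element order), e.g.
+ *
+ *   vector signed char __x = vec_splat_s8(1);
+ *   vector signed char __y = vec_splat_s8(2);
+ *   vector signed char __z = vec_sld(__x, __y, 4);  // twelve 1s, then four 2s
+ */
+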
+/* vec_sll */
+
+static vector signed char __ATTRS_o_ai
+vec_sll(vector signed char __a, vector unsigned char __b)
+{
+  return (vector signed char)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_sll(vector signed char __a, vector unsigned short __b)
+{
+  return (vector signed char)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_sll(vector signed char __a, vector unsigned int __b)
+{
+  return (vector signed char)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_sll(vector unsigned char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_sll(vector unsigned char __a, vector unsigned short __b)
+{
+  return (vector unsigned char)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_sll(vector unsigned char __a, vector unsigned int __b)
+{
+  return (vector unsigned char)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_sll(vector bool char __a, vector unsigned char __b)
+{
+  return (vector bool char)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_sll(vector bool char __a, vector unsigned short __b)
+{
+  return (vector bool char)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_sll(vector bool char __a, vector unsigned int __b)
+{
+  return (vector bool char)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_sll(vector short __a, vector unsigned char __b)
+{
+  return (vector short)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_sll(vector short __a, vector unsigned short __b)
+{
+  return (vector short)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_sll(vector short __a, vector unsigned int __b)
+{
+  return (vector short)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_sll(vector unsigned short __a, vector unsigned char __b)
+{
+  return (vector unsigned short)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_sll(vector unsigned short __a, vector unsigned short __b)
+{
+  return (vector unsigned short)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_sll(vector unsigned short __a, vector unsigned int __b)
+{
+  return (vector unsigned short)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_sll(vector bool short __a, vector unsigned char __b)
+{
+  return (vector bool short)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_sll(vector bool short __a, vector unsigned short __b)
+{
+  return (vector bool short)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_sll(vector bool short __a, vector unsigned int __b)
+{
+  return (vector bool short)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_sll(vector pixel __a, vector unsigned char __b)
+{
+  return (vector pixel)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_sll(vector pixel __a, vector unsigned short __b)
+{
+  return (vector pixel)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_sll(vector pixel __a, vector unsigned int __b)
+{
+  return (vector pixel)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_sll(vector int __a, vector unsigned char __b)
+{
+  return (vector int)__builtin_altivec_vsl(__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_sll(vector int __a, vector unsigned short __b)
+{
+  return (vector int)__builtin_altivec_vsl(__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_sll(vector int __a, vector unsigned int __b)
+{
+  return (vector int)__builtin_altivec_vsl(__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_sll(vector unsigned int __a, vector unsigned char __b)
+{
+  return (vector unsigned int)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_sll(vector unsigned int __a, vector unsigned short __b)
+{
+  return (vector unsigned int)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_sll(vector unsigned int __a, vector unsigned int __b)
+{
+  return (vector unsigned int)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_sll(vector bool int __a, vector unsigned char __b)
+{
+  return (vector bool int)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_sll(vector bool int __a, vector unsigned short __b)
+{
+  return (vector bool int)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_sll(vector bool int __a, vector unsigned int __b)
+{
+  return (vector bool int)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
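+/* Usage sketch (illustrative): vec_sll shifts the full 128-bit contents of
+ * __a left by the bit count held in the low-order three bits of each byte of
+ * __b (vsl semantics); the architecture expects the same count in every byte,
+ * which a splat provides, e.g.
+ *
+ *   vector unsigned int __v = vec_splat_u32(4);
+ *   vector unsigned int __r = vec_sll(__v, vec_splat_u8(3));  // 128-bit value << 3
+ */
+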
+/* vec_vsl */
+
+static vector signed char __ATTRS_o_ai
+vec_vsl(vector signed char __a, vector unsigned char __b)
+{
+  return (vector signed char)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vsl(vector signed char __a, vector unsigned short __b)
+{
+  return (vector signed char)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vsl(vector signed char __a, vector unsigned int __b)
+{
+  return (vector signed char)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vsl(vector unsigned char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vsl(vector unsigned char __a, vector unsigned short __b)
+{
+  return (vector unsigned char)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vsl(vector unsigned char __a, vector unsigned int __b)
+{
+  return (vector unsigned char)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_vsl(vector bool char __a, vector unsigned char __b)
+{
+  return (vector bool char)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_vsl(vector bool char __a, vector unsigned short __b)
+{
+  return (vector bool char)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_vsl(vector bool char __a, vector unsigned int __b)
+{
+  return (vector bool char)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_vsl(vector short __a, vector unsigned char __b)
+{
+  return (vector short)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_vsl(vector short __a, vector unsigned short __b)
+{
+  return (vector short)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_vsl(vector short __a, vector unsigned int __b)
+{
+  return (vector short)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vsl(vector unsigned short __a, vector unsigned char __b)
+{
+  return (vector unsigned short)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vsl(vector unsigned short __a, vector unsigned short __b)
+{
+  return (vector unsigned short)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vsl(vector unsigned short __a, vector unsigned int __b)
+{
+  return (vector unsigned short)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_vsl(vector bool short __a, vector unsigned char __b)
+{
+  return (vector bool short)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_vsl(vector bool short __a, vector unsigned short __b)
+{
+  return (vector bool short)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_vsl(vector bool short __a, vector unsigned int __b)
+{
+  return (vector bool short)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_vsl(vector pixel __a, vector unsigned char __b)
+{
+  return (vector pixel)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_vsl(vector pixel __a, vector unsigned short __b)
+{
+  return (vector pixel)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_vsl(vector pixel __a, vector unsigned int __b)
+{
+  return (vector pixel)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_vsl(vector int __a, vector unsigned char __b)
+{
+  return (vector int)__builtin_altivec_vsl(__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_vsl(vector int __a, vector unsigned short __b)
+{
+  return (vector int)__builtin_altivec_vsl(__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_vsl(vector int __a, vector unsigned int __b)
+{
+  return (vector int)__builtin_altivec_vsl(__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vsl(vector unsigned int __a, vector unsigned char __b)
+{
+  return (vector unsigned int)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vsl(vector unsigned int __a, vector unsigned short __b)
+{
+  return (vector unsigned int)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vsl(vector unsigned int __a, vector unsigned int __b)
+{
+  return (vector unsigned int)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_vsl(vector bool int __a, vector unsigned char __b)
+{
+  return (vector bool int)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_vsl(vector bool int __a, vector unsigned short __b)
+{
+  return (vector bool int)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_vsl(vector bool int __a, vector unsigned int __b)
+{
+  return (vector bool int)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+/* vec_slo */
+
+static vector signed char __ATTRS_o_ai
+vec_slo(vector signed char __a, vector signed char __b)
+{
+  return (vector signed char)
+           __builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_slo(vector signed char __a, vector unsigned char __b)
+{
+  return (vector signed char)
+           __builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_slo(vector unsigned char __a, vector signed char __b)
+{
+  return (vector unsigned char)
+           __builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_slo(vector unsigned char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)
+           __builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_slo(vector short __a, vector signed char __b)
+{
+  return (vector short)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_slo(vector short __a, vector unsigned char __b)
+{
+  return (vector short)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_slo(vector unsigned short __a, vector signed char __b)
+{
+  return (vector unsigned short)
+           __builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_slo(vector unsigned short __a, vector unsigned char __b)
+{
+  return (vector unsigned short)
+           __builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_slo(vector pixel __a, vector signed char __b)
+{
+  return (vector pixel)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_slo(vector pixel __a, vector unsigned char __b)
+{
+  return (vector pixel)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_slo(vector int __a, vector signed char __b)
+{
+  return (vector int)__builtin_altivec_vslo(__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_slo(vector int __a, vector unsigned char __b)
+{
+  return (vector int)__builtin_altivec_vslo(__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_slo(vector unsigned int __a, vector signed char __b)
+{
+  return (vector unsigned int)
+           __builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_slo(vector unsigned int __a, vector unsigned char __b)
+{
+  return (vector unsigned int)
+           __builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector float __ATTRS_o_ai
+vec_slo(vector float __a, vector signed char __b)
+{
+  return (vector float)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector float __ATTRS_o_ai
+vec_slo(vector float __a, vector unsigned char __b)
+{
+  return (vector float)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+/* vec_vslo */
+
+static vector signed char __ATTRS_o_ai
+vec_vslo(vector signed char __a, vector signed char __b)
+{
+  return (vector signed char)
+           __builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vslo(vector signed char __a, vector unsigned char __b)
+{
+  return (vector signed char)
+           __builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vslo(vector unsigned char __a, vector signed char __b)
+{
+  return (vector unsigned char)
+           __builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vslo(vector unsigned char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)
+           __builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_vslo(vector short __a, vector signed char __b)
+{
+  return (vector short)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_vslo(vector short __a, vector unsigned char __b)
+{
+  return (vector short)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vslo(vector unsigned short __a, vector signed char __b)
+{
+  return (vector unsigned short)
+           __builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vslo(vector unsigned short __a, vector unsigned char __b)
+{
+  return (vector unsigned short)
+           __builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_vslo(vector pixel __a, vector signed char __b)
+{
+  return (vector pixel)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_vslo(vector pixel __a, vector unsigned char __b)
+{
+  return (vector pixel)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_vslo(vector int __a, vector signed char __b)
+{
+  return (vector int)__builtin_altivec_vslo(__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_vslo(vector int __a, vector unsigned char __b)
+{
+  return (vector int)__builtin_altivec_vslo(__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vslo(vector unsigned int __a, vector signed char __b)
+{
+  return (vector unsigned int)
+           __builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vslo(vector unsigned int __a, vector unsigned char __b)
+{
+  return (vector unsigned int)
+           __builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector float __ATTRS_o_ai
+vec_vslo(vector float __a, vector signed char __b)
+{
+  return (vector float)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector float __ATTRS_o_ai
+vec_vslo(vector float __a, vector unsigned char __b)
+{
+  return (vector float)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+/* vec_splat */
+
+static vector signed char __ATTRS_o_ai
+vec_splat(vector signed char __a, unsigned char __b)
+{
+  return vec_perm(__a, __a, (vector unsigned char)(__b));
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_splat(vector unsigned char __a, unsigned char __b)
+{
+  return vec_perm(__a, __a, (vector unsigned char)(__b));
+}
+
+static vector bool char __ATTRS_o_ai
+vec_splat(vector bool char __a, unsigned char __b)
+{
+  return vec_perm(__a, __a, (vector unsigned char)(__b));
+}
+
+static vector short __ATTRS_o_ai
+vec_splat(vector short __a, unsigned char __b)
+{ 
+  __b *= 2;
+  unsigned char b1=__b+1;
+  return vec_perm(__a, __a, (vector unsigned char)
+    (__b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1));
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_splat(vector unsigned short __a, unsigned char __b)
+{ 
+  __b *= 2;
+  unsigned char b1=__b+1;
+  return vec_perm(__a, __a, (vector unsigned char)
+    (__b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1));
+}
+
+static vector bool short __ATTRS_o_ai
+vec_splat(vector bool short __a, unsigned char __b)
+{ 
+  __b *= 2;
+  unsigned char b1=__b+1;
+  return vec_perm(__a, __a, (vector unsigned char)
+    (__b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1));
+}
+
+static vector pixel __ATTRS_o_ai
+vec_splat(vector pixel __a, unsigned char __b)
+{ 
+  __b *= 2;
+  unsigned char b1=__b+1;
+  return vec_perm(__a, __a, (vector unsigned char)
+    (__b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1));
+}
+
+static vector int __ATTRS_o_ai
+vec_splat(vector int __a, unsigned char __b)
+{ 
+  __b *= 4;
+  unsigned char b1=__b+1, b2=__b+2, b3=__b+3;
+  return vec_perm(__a, __a, (vector unsigned char)
+    (__b, b1, b2, b3, __b, b1, b2, b3, __b, b1, b2, b3, __b, b1, b2, b3));
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_splat(vector unsigned int __a, unsigned char __b)
+{ 
+  __b *= 4;
+  unsigned char b1=__b+1, b2=__b+2, b3=__b+3;
+  return vec_perm(__a, __a, (vector unsigned char)
+    (__b, b1, b2, b3, __b, b1, b2, b3, __b, b1, b2, b3, __b, b1, b2, b3));
+}
+
+static vector bool int __ATTRS_o_ai
+vec_splat(vector bool int __a, unsigned char __b)
+{ 
+  __b *= 4;
+  unsigned char b1=__b+1, b2=__b+2, b3=__b+3;
+  return vec_perm(__a, __a, (vector unsigned char)
+    (__b, b1, b2, b3, __b, b1, b2, b3, __b, b1, b2, b3, __b, b1, b2, b3));
+}
+
+static vector float __ATTRS_o_ai
+vec_splat(vector float __a, unsigned char __b)
+{ 
+  __b *= 4;
+  unsigned char b1=__b+1, b2=__b+2, b3=__b+3;
+  return vec_perm(__a, __a, (vector unsigned char)
+    (__b, b1, b2, b3, __b, b1, b2, b3, __b, b1, b2, b3, __b, b1, b2, b3));
+}
+
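+/* Usage sketch (illustrative): vec_splat replicates element __b of __a into
+ * every element of the result, e.g.
+ *
+ *   vector int __v = (vector int)(10, 20, 30, 40);
+ *   vector int __s = vec_splat(__v, 2);  // (30, 30, 30, 30) with big-endian
+ *                                        // element numbering
+ */
+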
+/* vec_vspltb */
+
+#define __builtin_altivec_vspltb vec_vspltb
+
+static vector signed char __ATTRS_o_ai
+vec_vspltb(vector signed char __a, unsigned char __b)
+{
+  return vec_perm(__a, __a, (vector unsigned char)(__b));
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vspltb(vector unsigned char __a, unsigned char __b)
+{
+  return vec_perm(__a, __a, (vector unsigned char)(__b));
+}
+
+static vector bool char __ATTRS_o_ai
+vec_vspltb(vector bool char __a, unsigned char __b)
+{
+  return vec_perm(__a, __a, (vector unsigned char)(__b));
+}
+
+/* vec_vsplth */
+
+#define __builtin_altivec_vsplth vec_vsplth
+
+static vector short __ATTRS_o_ai
+vec_vsplth(vector short __a, unsigned char __b)
+{
+  __b *= 2;
+  unsigned char b1=__b+1;
+  return vec_perm(__a, __a, (vector unsigned char)
+    (__b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1));
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vsplth(vector unsigned short __a, unsigned char __b)
+{
+  __b *= 2;
+  unsigned char b1=__b+1;
+  return vec_perm(__a, __a, (vector unsigned char)
+    (__b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1));
+}
+
+static vector bool short __ATTRS_o_ai
+vec_vsplth(vector bool short __a, unsigned char __b)
+{
+  __b *= 2;
+  unsigned char b1=__b+1;
+  return vec_perm(__a, __a, (vector unsigned char)
+    (__b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1));
+}
+
+static vector pixel __ATTRS_o_ai
+vec_vsplth(vector pixel __a, unsigned char __b)
+{
+  __b *= 2;
+  unsigned char b1=__b+1;
+  return vec_perm(__a, __a, (vector unsigned char)
+    (__b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1));
+}
+
+/* vec_vspltw */
+
+#define __builtin_altivec_vspltw vec_vspltw
+
+static vector int __ATTRS_o_ai
+vec_vspltw(vector int __a, unsigned char __b)
+{
+  __b *= 4;
+  unsigned char b1=__b+1, b2=__b+2, b3=__b+3;
+  return vec_perm(__a, __a, (vector unsigned char)
+    (__b, b1, b2, b3, __b, b1, b2, b3, __b, b1, b2, b3, __b, b1, b2, b3));
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vspltw(vector unsigned int __a, unsigned char __b)
+{
+  __b *= 4;
+  unsigned char b1=__b+1, b2=__b+2, b3=__b+3;
+  return vec_perm(__a, __a, (vector unsigned char)
+    (__b, b1, b2, b3, __b, b1, b2, b3, __b, b1, b2, b3, __b, b1, b2, b3));
+}
+
+static vector bool int __ATTRS_o_ai
+vec_vspltw(vector bool int __a, unsigned char __b)
+{
+  __b *= 4;
+  unsigned char b1=__b+1, b2=__b+2, b3=__b+3;
+  return vec_perm(__a, __a, (vector unsigned char)
+    (__b, b1, b2, b3, __b, b1, b2, b3, __b, b1, b2, b3, __b, b1, b2, b3));
+}
+
+static vector float __ATTRS_o_ai
+vec_vspltw(vector float __a, unsigned char __b)
+{
+  __b *= 4;
+  unsigned char b1=__b+1, b2=__b+2, b3=__b+3;
+  return vec_perm(__a, __a, (vector unsigned char)
+    (__b, b1, b2, b3, __b, b1, b2, b3, __b, b1, b2, b3, __b, b1, b2, b3));
+}
+
+/* vec_splat_s8 */
+
+#define __builtin_altivec_vspltisb vec_splat_s8
+
+// FIXME: parameter should be treated as 5-bit signed literal
+static vector signed char __ATTRS_o_ai
+vec_splat_s8(signed char __a)
+{
+  return (vector signed char)(__a);
+}
+
+/* vec_vspltisb */
+
+// FIXME: parameter should be treated as 5-bit signed literal
+static vector signed char __ATTRS_o_ai
+vec_vspltisb(signed char __a)
+{
+  return (vector signed char)(__a);
+}
+
+/* vec_splat_s16 */
+
+#define __builtin_altivec_vspltish vec_splat_s16
+
+// FIXME: parameter should be treated as 5-bit signed literal
+static vector short __ATTRS_o_ai
+vec_splat_s16(signed char __a)
+{
+  return (vector short)(__a);
+}
+
+/* vec_vspltish */
+
+// FIXME: parameter should be treated as 5-bit signed literal
+static vector short __ATTRS_o_ai
+vec_vspltish(signed char __a)
+{
+  return (vector short)(__a);
+}
+
+/* vec_splat_s32 */
+
+#define __builtin_altivec_vspltisw vec_splat_s32
+
+// FIXME: parameter should be treated as 5-bit signed literal
+static vector int __ATTRS_o_ai
+vec_splat_s32(signed char __a)
+{
+  return (vector int)(__a);
+}
+
+/* vec_vspltisw */
+
+// FIXME: parameter should be treated as 5-bit signed literal
+static vector int __ATTRS_o_ai
+vec_vspltisw(signed char __a)
+{
+  return (vector int)(__a);
+}
+
+/* vec_splat_u8 */
+
+// FIXME: parameter should be treated as 5-bit signed literal
+static vector unsigned char __ATTRS_o_ai
+vec_splat_u8(unsigned char __a)
+{
+  return (vector unsigned char)(__a);
+}
+
+/* vec_splat_u16 */
+
+// FIXME: parameter should be treated as 5-bit signed literal
+static vector unsigned short __ATTRS_o_ai
+vec_splat_u16(signed char __a)
+{
+  return (vector unsigned short)(__a);
+}
+
+/* vec_splat_u32 */
+
+// FIXME: parameter should be treated as 5-bit signed literal
+static vector unsigned int __ATTRS_o_ai
+vec_splat_u32(signed char __a)
+{
+  return (vector unsigned int)(__a);
+}
+
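+/* Note (illustrative): as the FIXMEs above indicate, the argument to these
+ * splat-immediate helpers is meant to be a 5-bit signed literal, i.e. a
+ * compile-time constant in the range -16..15, e.g.
+ *
+ *   vector signed char __ones = vec_splat_s8(-1);   // every byte 0xFF
+ *   vector unsigned int __ten = vec_splat_u32(10);  // every word 10
+ */
+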
+/* vec_sr */
+
+static vector signed char __ATTRS_o_ai
+vec_sr(vector signed char __a, vector unsigned char __b)
+{
+  return __a >> (vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_sr(vector unsigned char __a, vector unsigned char __b)
+{
+  return __a >> __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_sr(vector short __a, vector unsigned short __b)
+{
+  return __a >> (vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_sr(vector unsigned short __a, vector unsigned short __b)
+{
+  return __a >> __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_sr(vector int __a, vector unsigned int __b)
+{
+  return __a >> (vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_sr(vector unsigned int __a, vector unsigned int __b)
+{
+  return __a >> __b;
+}
+
+/* vec_vsrb */
+
+#define __builtin_altivec_vsrb vec_vsrb
+
+static vector signed char __ATTRS_o_ai
+vec_vsrb(vector signed char __a, vector unsigned char __b)
+{
+  return __a >> (vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vsrb(vector unsigned char __a, vector unsigned char __b)
+{
+  return __a >> __b;
+}
+
+/* vec_vsrh */
+
+#define __builtin_altivec_vsrh vec_vsrh
+
+static vector short __ATTRS_o_ai
+vec_vsrh(vector short __a, vector unsigned short __b)
+{
+  return __a >> (vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vsrh(vector unsigned short __a, vector unsigned short __b)
+{
+  return __a >> __b;
+}
+
+/* vec_vsrw */
+
+#define __builtin_altivec_vsrw vec_vsrw
+
+static vector int __ATTRS_o_ai
+vec_vsrw(vector int __a, vector unsigned int __b)
+{
+  return __a >> (vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vsrw(vector unsigned int __a, vector unsigned int __b)
+{
+  return __a >> __b;
+}
+
+/* vec_sra */
+
+static vector signed char __ATTRS_o_ai
+vec_sra(vector signed char __a, vector unsigned char __b)
+{
+  return (vector signed char)__builtin_altivec_vsrab((vector char)__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_sra(vector unsigned char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)__builtin_altivec_vsrab((vector char)__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_sra(vector short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vsrah(__a, (vector unsigned short)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_sra(vector unsigned short __a, vector unsigned short __b)
+{
+  return (vector unsigned short)__builtin_altivec_vsrah((vector short)__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_sra(vector int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vsraw(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_sra(vector unsigned int __a, vector unsigned int __b)
+{
+  return (vector unsigned int)__builtin_altivec_vsraw((vector int)__a, __b);
+}
+
+/* vec_vsrab */
+
+static vector signed char __ATTRS_o_ai
+vec_vsrab(vector signed char __a, vector unsigned char __b)
+{
+  return (vector signed char)__builtin_altivec_vsrab((vector char)__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vsrab(vector unsigned char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)__builtin_altivec_vsrab((vector char)__a, __b);
+}
+
+/* vec_vsrah */
+
+static vector short __ATTRS_o_ai
+vec_vsrah(vector short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vsrah(__a, (vector unsigned short)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vsrah(vector unsigned short __a, vector unsigned short __b)
+{
+  return (vector unsigned short)__builtin_altivec_vsrah((vector short)__a, __b);
+}
+
+/* vec_vsraw */
+
+static vector int __ATTRS_o_ai
+vec_vsraw(vector int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vsraw(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vsraw(vector unsigned int __a, vector unsigned int __b)
+{
+  return (vector unsigned int)__builtin_altivec_vsraw((vector int)__a, __b);
+}
+
+/* vec_srl */
+
+static vector signed char __ATTRS_o_ai
+vec_srl(vector signed char __a, vector unsigned char __b)
+{
+  return (vector signed char)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_srl(vector signed char __a, vector unsigned short __b)
+{
+  return (vector signed char)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_srl(vector signed char __a, vector unsigned int __b)
+{
+  return (vector signed char)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_srl(vector unsigned char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_srl(vector unsigned char __a, vector unsigned short __b)
+{
+  return (vector unsigned char)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_srl(vector unsigned char __a, vector unsigned int __b)
+{
+  return (vector unsigned char)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_srl(vector bool char __a, vector unsigned char __b)
+{
+  return (vector bool char)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_srl(vector bool char __a, vector unsigned short __b)
+{
+  return (vector bool char)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_srl(vector bool char __a, vector unsigned int __b)
+{
+  return (vector bool char)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_srl(vector short __a, vector unsigned char __b)
+{
+  return (vector short)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_srl(vector short __a, vector unsigned short __b)
+{
+  return (vector short)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_srl(vector short __a, vector unsigned int __b)
+{
+  return (vector short)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_srl(vector unsigned short __a, vector unsigned char __b)
+{
+  return (vector unsigned short)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_srl(vector unsigned short __a, vector unsigned short __b)
+{
+  return (vector unsigned short)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_srl(vector unsigned short __a, vector unsigned int __b)
+{
+  return (vector unsigned short)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_srl(vector bool short __a, vector unsigned char __b)
+{
+  return (vector bool short)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_srl(vector bool short __a, vector unsigned short __b)
+{
+  return (vector bool short)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_srl(vector bool short __a, vector unsigned int __b)
+{
+  return (vector bool short)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_srl(vector pixel __a, vector unsigned char __b)
+{
+  return (vector pixel)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_srl(vector pixel __a, vector unsigned short __b)
+{
+  return (vector pixel)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_srl(vector pixel __a, vector unsigned int __b)
+{
+  return (vector pixel)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_srl(vector int __a, vector unsigned char __b)
+{
+  return (vector int)__builtin_altivec_vsr(__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_srl(vector int __a, vector unsigned short __b)
+{
+  return (vector int)__builtin_altivec_vsr(__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_srl(vector int __a, vector unsigned int __b)
+{
+  return (vector int)__builtin_altivec_vsr(__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_srl(vector unsigned int __a, vector unsigned char __b)
+{
+  return (vector unsigned int)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_srl(vector unsigned int __a, vector unsigned short __b)
+{
+  return (vector unsigned int)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_srl(vector unsigned int __a, vector unsigned int __b)
+{
+  return (vector unsigned int)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_srl(vector bool int __a, vector unsigned char __b)
+{
+  return (vector bool int)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_srl(vector bool int __a, vector unsigned short __b)
+{
+  return (vector bool int)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_srl(vector bool int __a, vector unsigned int __b)
+{
+  return (vector bool int)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+/* vec_vsr */
+
+static vector signed char __ATTRS_o_ai
+vec_vsr(vector signed char __a, vector unsigned char __b)
+{
+  return (vector signed char)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vsr(vector signed char __a, vector unsigned short __b)
+{
+  return (vector signed char)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vsr(vector signed char __a, vector unsigned int __b)
+{
+  return (vector signed char)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vsr(vector unsigned char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vsr(vector unsigned char __a, vector unsigned short __b)
+{
+  return (vector unsigned char)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vsr(vector unsigned char __a, vector unsigned int __b)
+{
+  return (vector unsigned char)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_vsr(vector bool char __a, vector unsigned char __b)
+{
+  return (vector bool char)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_vsr(vector bool char __a, vector unsigned short __b)
+{
+  return (vector bool char)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_vsr(vector bool char __a, vector unsigned int __b)
+{
+  return (vector bool char)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_vsr(vector short __a, vector unsigned char __b)
+{
+  return (vector short)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_vsr(vector short __a, vector unsigned short __b)
+{
+  return (vector short)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_vsr(vector short __a, vector unsigned int __b)
+{
+  return (vector short)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vsr(vector unsigned short __a, vector unsigned char __b)
+{
+  return (vector unsigned short)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vsr(vector unsigned short __a, vector unsigned short __b)
+{
+  return (vector unsigned short)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vsr(vector unsigned short __a, vector unsigned int __b)
+{
+  return (vector unsigned short)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_vsr(vector bool short __a, vector unsigned char __b)
+{
+  return (vector bool short)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_vsr(vector bool short __a, vector unsigned short __b)
+{
+  return (vector bool short)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_vsr(vector bool short __a, vector unsigned int __b)
+{
+  return (vector bool short)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_vsr(vector pixel __a, vector unsigned char __b)
+{
+  return (vector pixel)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_vsr(vector pixel __a, vector unsigned short __b)
+{
+  return (vector pixel)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_vsr(vector pixel __a, vector unsigned int __b)
+{
+  return (vector pixel)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_vsr(vector int __a, vector unsigned char __b)
+{
+  return (vector int)__builtin_altivec_vsr(__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_vsr(vector int __a, vector unsigned short __b)
+{
+  return (vector int)__builtin_altivec_vsr(__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_vsr(vector int __a, vector unsigned int __b)
+{
+  return (vector int)__builtin_altivec_vsr(__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vsr(vector unsigned int __a, vector unsigned char __b)
+{
+  return (vector unsigned int)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vsr(vector unsigned int __a, vector unsigned short __b)
+{
+  return (vector unsigned int)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vsr(vector unsigned int __a, vector unsigned int __b)
+{
+  return (vector unsigned int)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_vsr(vector bool int __a, vector unsigned char __b)
+{
+  return (vector bool int)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_vsr(vector bool int __a, vector unsigned short __b)
+{
+  return (vector bool int)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_vsr(vector bool int __a, vector unsigned int __b)
+{
+  return (vector bool int)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+/* vec_sro */
+
+static vector signed char __ATTRS_o_ai
+vec_sro(vector signed char __a, vector signed char __b)
+{
+  return (vector signed char)
+           __builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_sro(vector signed char __a, vector unsigned char __b)
+{
+  return (vector signed char)
+           __builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_sro(vector unsigned char __a, vector signed char __b)
+{
+  return (vector unsigned char)
+           __builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_sro(vector unsigned char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)
+           __builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_sro(vector short __a, vector signed char __b)
+{
+  return (vector short)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_sro(vector short __a, vector unsigned char __b)
+{
+  return (vector short)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_sro(vector unsigned short __a, vector signed char __b)
+{
+  return (vector unsigned short)
+           __builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_sro(vector unsigned short __a, vector unsigned char __b)
+{
+  return (vector unsigned short)
+           __builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_sro(vector pixel __a, vector signed char __b)
+{
+  return (vector pixel)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_sro(vector pixel __a, vector unsigned char __b)
+{
+  return (vector pixel)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_sro(vector int __a, vector signed char __b)
+{
+  return (vector int)__builtin_altivec_vsro(__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_sro(vector int __a, vector unsigned char __b)
+{
+  return (vector int)__builtin_altivec_vsro(__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_sro(vector unsigned int __a, vector signed char __b)
+{
+  return (vector unsigned int)
+           __builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_sro(vector unsigned int __a, vector unsigned char __b)
+{
+  return (vector unsigned int)
+           __builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector float __ATTRS_o_ai
+vec_sro(vector float __a, vector signed char __b)
+{
+  return (vector float)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector float __ATTRS_o_ai
+vec_sro(vector float __a, vector unsigned char __b)
+{
+  return (vector float)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+/* vec_vsro */
+
+static vector signed char __ATTRS_o_ai
+vec_vsro(vector signed char __a, vector signed char __b)
+{
+  return (vector signed char)
+           __builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vsro(vector signed char __a, vector unsigned char __b)
+{
+  return (vector signed char)
+           __builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vsro(vector unsigned char __a, vector signed char __b)
+{
+  return (vector unsigned char)
+           __builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vsro(vector unsigned char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)
+           __builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_vsro(vector short __a, vector signed char __b)
+{
+  return (vector short)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_vsro(vector short __a, vector unsigned char __b)
+{
+  return (vector short)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vsro(vector unsigned short __a, vector signed char __b)
+{
+  return (vector unsigned short)
+           __builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vsro(vector unsigned short __a, vector unsigned char __b)
+{
+  return (vector unsigned short)
+           __builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_vsro(vector pixel __a, vector signed char __b)
+{
+  return (vector pixel)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_vsro(vector pixel __a, vector unsigned char __b)
+{
+  return (vector pixel)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_vsro(vector int __a, vector signed char __b)
+{
+  return (vector int)__builtin_altivec_vsro(__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_vsro(vector int __a, vector unsigned char __b)
+{
+  return (vector int)__builtin_altivec_vsro(__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vsro(vector unsigned int __a, vector signed char __b)
+{
+  return (vector unsigned int)
+           __builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vsro(vector unsigned int __a, vector unsigned char __b)
+{
+  return (vector unsigned int)
+           __builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector float __ATTRS_o_ai
+vec_vsro(vector float __a, vector signed char __b)
+{
+  return (vector float)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector float __ATTRS_o_ai
+vec_vsro(vector float __a, vector unsigned char __b)
+{
+  return (vector float)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+/* vec_st */
+
+static void __ATTRS_o_ai
+vec_st(vector signed char __a, int __b, vector signed char *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector signed char __a, int __b, signed char *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector unsigned char __a, int __b, vector unsigned char *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector unsigned char __a, int __b, unsigned char *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector bool char __a, int __b, signed char *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector bool char __a, int __b, unsigned char *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector bool char __a, int __b, vector bool char *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector short __a, int __b, vector short *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector short __a, int __b, short *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector unsigned short __a, int __b, vector unsigned short *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector unsigned short __a, int __b, unsigned short *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector bool short __a, int __b, short *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector bool short __a, int __b, unsigned short *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector bool short __a, int __b, vector bool short *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector pixel __a, int __b, short *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector pixel __a, int __b, unsigned short *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector pixel __a, int __b, vector pixel *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector int __a, int __b, vector int *__c)
+{
+  __builtin_altivec_stvx(__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector int __a, int __b, int *__c)
+{
+  __builtin_altivec_stvx(__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector unsigned int __a, int __b, vector unsigned int *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector unsigned int __a, int __b, unsigned int *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector bool int __a, int __b, int *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector bool int __a, int __b, unsigned int *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector bool int __a, int __b, vector bool int *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector float __a, int __b, vector float *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector float __a, int __b, float *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
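+/* Usage sketch (illustrative): vec_st and vec_stvx below store the full
+ * 16-byte vector to the effective address __b + __c with the low four address
+ * bits ignored (stvx semantics), so the destination should be 16-byte
+ * aligned, e.g.
+ *
+ *   int __buf[4] __attribute__((aligned(16)));
+ *   vec_st(vec_splat_s32(7), 0, __buf);  // __buf becomes {7, 7, 7, 7}
+ */
+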
+/* vec_stvx */
+
+static void __ATTRS_o_ai
+vec_stvx(vector signed char __a, int __b, vector signed char *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector signed char __a, int __b, signed char *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector unsigned char __a, int __b, vector unsigned char *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector unsigned char __a, int __b, unsigned char *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector bool char __a, int __b, signed char *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector bool char __a, int __b, unsigned char *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector bool char __a, int __b, vector bool char *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector short __a, int __b, vector short *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector short __a, int __b, short *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector unsigned short __a, int __b, vector unsigned short *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector unsigned short __a, int __b, unsigned short *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector bool short __a, int __b, short *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector bool short __a, int __b, unsigned short *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector bool short __a, int __b, vector bool short *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector pixel __a, int __b, short *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector pixel __a, int __b, unsigned short *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector pixel __a, int __b, vector pixel *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector int __a, int __b, vector int *__c)
+{
+  __builtin_altivec_stvx(__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector int __a, int __b, int *__c)
+{
+  __builtin_altivec_stvx(__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector unsigned int __a, int __b, vector unsigned int *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector unsigned int __a, int __b, unsigned int *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector bool int __a, int __b, int *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector bool int __a, int __b, unsigned int *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector bool int __a, int __b, vector bool int *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector float __a, int __b, vector float *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector float __a, int __b, float *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+/* vec_ste */
+
+static void __ATTRS_o_ai
+vec_ste(vector signed char __a, int __b, signed char *__c)
+{
+  __builtin_altivec_stvebx((vector char)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_ste(vector unsigned char __a, int __b, unsigned char *__c)
+{
+  __builtin_altivec_stvebx((vector char)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_ste(vector bool char __a, int __b, signed char *__c)
+{
+  __builtin_altivec_stvebx((vector char)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_ste(vector bool char __a, int __b, unsigned char *__c)
+{
+  __builtin_altivec_stvebx((vector char)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_ste(vector short __a, int __b, short *__c)
+{
+  __builtin_altivec_stvehx(__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_ste(vector unsigned short __a, int __b, unsigned short *__c)
+{
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_ste(vector bool short __a, int __b, short *__c)
+{
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_ste(vector bool short __a, int __b, unsigned short *__c)
+{
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_ste(vector pixel __a, int __b, short *__c)
+{
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_ste(vector pixel __a, int __b, unsigned short *__c)
+{
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_ste(vector int __a, int __b, int *__c)
+{
+  __builtin_altivec_stvewx(__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_ste(vector unsigned int __a, int __b, unsigned int *__c)
+{
+  __builtin_altivec_stvewx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_ste(vector bool int __a, int __b, int *__c)
+{
+  __builtin_altivec_stvewx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_ste(vector bool int __a, int __b, unsigned int *__c)
+{
+  __builtin_altivec_stvewx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_ste(vector float __a, int __b, float *__c)
+{
+  __builtin_altivec_stvewx((vector int)__a, __b, __c);
+}
+
+/* vec_stvebx */
+
+static void __ATTRS_o_ai
+vec_stvebx(vector signed char __a, int __b, signed char *__c)
+{
+  __builtin_altivec_stvebx((vector char)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvebx(vector unsigned char __a, int __b, unsigned char *__c)
+{
+  __builtin_altivec_stvebx((vector char)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvebx(vector bool char __a, int __b, signed char *__c)
+{
+  __builtin_altivec_stvebx((vector char)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvebx(vector bool char __a, int __b, unsigned char *__c)
+{
+  __builtin_altivec_stvebx((vector char)__a, __b, __c);
+}
+
+/* vec_stvehx */
+
+static void __ATTRS_o_ai
+vec_stvehx(vector short __a, int __b, short *__c)
+{
+  __builtin_altivec_stvehx(__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvehx(vector unsigned short __a, int __b, unsigned short *__c)
+{
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvehx(vector bool short __a, int __b, short *__c)
+{
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvehx(vector bool short __a, int __b, unsigned short *__c)
+{
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvehx(vector pixel __a, int __b, short *__c)
+{
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvehx(vector pixel __a, int __b, unsigned short *__c)
+{
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+/* vec_stvewx */
+
+static void __ATTRS_o_ai
+vec_stvewx(vector int __a, int __b, int *__c)
+{
+  __builtin_altivec_stvewx(__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvewx(vector unsigned int __a, int __b, unsigned int *__c)
+{
+  __builtin_altivec_stvewx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvewx(vector bool int __a, int __b, int *__c)
+{
+  __builtin_altivec_stvewx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvewx(vector bool int __a, int __b, unsigned int *__c)
+{
+  __builtin_altivec_stvewx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvewx(vector float __a, int __b, float *__c)
+{
+  __builtin_altivec_stvewx((vector int)__a, __b, __c);
+}
+
+/* vec_stl */
+
+static void __ATTRS_o_ai
+vec_stl(vector signed char __a, int __b, vector signed char *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector signed char __a, int __b, signed char *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector unsigned char __a, int __b, vector unsigned char *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector unsigned char __a, int __b, unsigned char *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector bool char __a, int __b, signed char *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector bool char __a, int __b, unsigned char *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector bool char __a, int __b, vector bool char *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector short __a, int __b, vector short *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector short __a, int __b, short *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector unsigned short __a, int __b, vector unsigned short *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector unsigned short __a, int __b, unsigned short *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector bool short __a, int __b, short *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector bool short __a, int __b, unsigned short *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector bool short __a, int __b, vector bool short *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector pixel __a, int __b, short *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector pixel __a, int __b, unsigned short *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector pixel __a, int __b, vector pixel *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector int __a, int __b, vector int *__c)
+{
+  __builtin_altivec_stvxl(__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector int __a, int __b, int *__c)
+{
+  __builtin_altivec_stvxl(__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector unsigned int __a, int __b, vector unsigned int *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector unsigned int __a, int __b, unsigned int *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector bool int __a, int __b, int *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector bool int __a, int __b, unsigned int *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector bool int __a, int __b, vector bool int *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector float __a, int __b, vector float *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector float __a, int __b, float *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+/* vec_stvxl */
+
+static void __ATTRS_o_ai
+vec_stvxl(vector signed char __a, int __b, vector signed char *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector signed char __a, int __b, signed char *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector unsigned char __a, int __b, vector unsigned char *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector unsigned char __a, int __b, unsigned char *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector bool char __a, int __b, signed char *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector bool char __a, int __b, unsigned char *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector bool char __a, int __b, vector bool char *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector short __a, int __b, vector short *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector short __a, int __b, short *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector unsigned short __a, int __b, vector unsigned short *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector unsigned short __a, int __b, unsigned short *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector bool short __a, int __b, short *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector bool short __a, int __b, unsigned short *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector bool short __a, int __b, vector bool short *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector pixel __a, int __b, short *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector pixel __a, int __b, unsigned short *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector pixel __a, int __b, vector pixel *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector int __a, int __b, vector int *__c)
+{
+  __builtin_altivec_stvxl(__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector int __a, int __b, int *__c)
+{
+  __builtin_altivec_stvxl(__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector unsigned int __a, int __b, vector unsigned int *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector unsigned int __a, int __b, unsigned int *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector bool int __a, int __b, int *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector bool int __a, int __b, unsigned int *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector bool int __a, int __b, vector bool int *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector float __a, int __b, vector float *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector float __a, int __b, float *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+/* vec_sub */
+
+static vector signed char __ATTRS_o_ai
+vec_sub(vector signed char __a, vector signed char __b)
+{
+  return __a - __b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_sub(vector bool char __a, vector signed char __b)
+{
+  return (vector signed char)__a - __b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_sub(vector signed char __a, vector bool char __b)
+{
+  return __a - (vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_sub(vector unsigned char __a, vector unsigned char __b)
+{
+  return __a - __b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_sub(vector bool char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)__a - __b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_sub(vector unsigned char __a, vector bool char __b)
+{
+  return __a - (vector unsigned char)__b;
+}
+
+static vector short __ATTRS_o_ai
+vec_sub(vector short __a, vector short __b)
+{
+  return __a - __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_sub(vector bool short __a, vector short __b)
+{
+  return (vector short)__a - __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_sub(vector short __a, vector bool short __b)
+{
+  return __a - (vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_sub(vector unsigned short __a, vector unsigned short __b)
+{
+  return __a - __b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_sub(vector bool short __a, vector unsigned short __b)
+{
+  return (vector unsigned short)__a - __b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_sub(vector unsigned short __a, vector bool short __b)
+{
+  return __a - (vector unsigned short)__b;
+}
+
+static vector int __ATTRS_o_ai
+vec_sub(vector int __a, vector int __b)
+{
+  return __a - __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_sub(vector bool int __a, vector int __b)
+{
+  return (vector int)__a - __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_sub(vector int __a, vector bool int __b)
+{
+  return __a - (vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_sub(vector unsigned int __a, vector unsigned int __b)
+{
+  return __a - __b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_sub(vector bool int __a, vector unsigned int __b)
+{
+  return (vector unsigned int)__a - __b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_sub(vector unsigned int __a, vector bool int __b)
+{
+  return __a - (vector unsigned int)__b;
+}
+
+static vector float __ATTRS_o_ai
+vec_sub(vector float __a, vector float __b)
+{
+  return __a - __b;
+}
+
+/* vec_vsububm */
+
+#define __builtin_altivec_vsububm vec_vsububm
+
+static vector signed char __ATTRS_o_ai
+vec_vsububm(vector signed char __a, vector signed char __b)
+{
+  return __a - __b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vsububm(vector bool char __a, vector signed char __b)
+{
+  return (vector signed char)__a - __b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vsububm(vector signed char __a, vector bool char __b)
+{
+  return __a - (vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vsububm(vector unsigned char __a, vector unsigned char __b)
+{
+  return __a - __b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vsububm(vector bool char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)__a - __b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vsububm(vector unsigned char __a, vector bool char __b)
+{
+  return __a - (vector unsigned char)__b;
+}
+
+/* vec_vsubuhm */
+
+#define __builtin_altivec_vsubuhm vec_vsubuhm
+
+static vector short __ATTRS_o_ai
+vec_vsubuhm(vector short __a, vector short __b)
+{
+  return __a - __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_vsubuhm(vector bool short __a, vector short __b)
+{
+  return (vector short)__a - __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_vsubuhm(vector short __a, vector bool short __b)
+{
+  return __a - (vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vsubuhm(vector unsigned short __a, vector unsigned short __b)
+{
+  return __a - __b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vsubuhm(vector bool short __a, vector unsigned short __b)
+{
+  return (vector unsigned short)__a - __b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vsubuhm(vector unsigned short __a, vector bool short __b)
+{
+  return __a - (vector unsigned short)__b;
+}
+
+/* vec_vsubuwm */
+
+#define __builtin_altivec_vsubuwm vec_vsubuwm
+
+static vector int __ATTRS_o_ai
+vec_vsubuwm(vector int __a, vector int __b)
+{
+  return __a - __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_vsubuwm(vector bool int __a, vector int __b)
+{
+  return (vector int)__a - __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_vsubuwm(vector int __a, vector bool int __b)
+{
+  return __a - (vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vsubuwm(vector unsigned int __a, vector unsigned int __b)
+{
+  return __a - __b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vsubuwm(vector bool int __a, vector unsigned int __b)
+{
+  return (vector unsigned int)__a - __b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vsubuwm(vector unsigned int __a, vector bool int __b)
+{
+  return __a - (vector unsigned int)__b;
+}
+
+/* vec_vsubfp */
+
+#define __builtin_altivec_vsubfp vec_vsubfp
+
+static vector float __attribute__((__always_inline__))
+vec_vsubfp(vector float __a, vector float __b)
+{
+  return __a - __b;
+}
+
+/* vec_subc */
+
+static vector unsigned int __attribute__((__always_inline__))
+vec_subc(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vsubcuw(__a, __b);
+}
+
+/* vec_vsubcuw */
+
+static vector unsigned int __attribute__((__always_inline__))
+vec_vsubcuw(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vsubcuw(__a, __b);
+}
+
+/* vec_subs */
+
+static vector signed char __ATTRS_o_ai
+vec_subs(vector signed char __a, vector signed char __b)
+{
+  return __builtin_altivec_vsubsbs(__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_subs(vector bool char __a, vector signed char __b)
+{
+  return __builtin_altivec_vsubsbs((vector signed char)__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_subs(vector signed char __a, vector bool char __b)
+{
+  return __builtin_altivec_vsubsbs(__a, (vector signed char)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_subs(vector unsigned char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vsububs(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_subs(vector bool char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vsububs((vector unsigned char)__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_subs(vector unsigned char __a, vector bool char __b)
+{
+  return __builtin_altivec_vsububs(__a, (vector unsigned char)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_subs(vector short __a, vector short __b)
+{
+  return __builtin_altivec_vsubshs(__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_subs(vector bool short __a, vector short __b)
+{
+  return __builtin_altivec_vsubshs((vector short)__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_subs(vector short __a, vector bool short __b)
+{
+  return __builtin_altivec_vsubshs(__a, (vector short)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_subs(vector unsigned short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vsubuhs(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_subs(vector bool short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vsubuhs((vector unsigned short)__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_subs(vector unsigned short __a, vector bool short __b)
+{
+  return __builtin_altivec_vsubuhs(__a, (vector unsigned short)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_subs(vector int __a, vector int __b)
+{
+  return __builtin_altivec_vsubsws(__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_subs(vector bool int __a, vector int __b)
+{
+  return __builtin_altivec_vsubsws((vector int)__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_subs(vector int __a, vector bool int __b)
+{
+  return __builtin_altivec_vsubsws(__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_subs(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vsubuws(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_subs(vector bool int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vsubuws((vector unsigned int)__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_subs(vector unsigned int __a, vector bool int __b)
+{
+  return __builtin_altivec_vsubuws(__a, (vector unsigned int)__b);
+}
+
+/* vec_vsubsbs */
+
+static vector signed char __ATTRS_o_ai
+vec_vsubsbs(vector signed char __a, vector signed char __b)
+{
+  return __builtin_altivec_vsubsbs(__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vsubsbs(vector bool char __a, vector signed char __b)
+{
+  return __builtin_altivec_vsubsbs((vector signed char)__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vsubsbs(vector signed char __a, vector bool char __b)
+{
+  return __builtin_altivec_vsubsbs(__a, (vector signed char)__b);
+}
+
+/* vec_vsububs */
+
+static vector unsigned char __ATTRS_o_ai
+vec_vsububs(vector unsigned char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vsububs(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vsububs(vector bool char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vsububs((vector unsigned char)__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vsububs(vector unsigned char __a, vector bool char __b)
+{
+  return __builtin_altivec_vsububs(__a, (vector unsigned char)__b);
+}
+
+/* vec_vsubshs */
+
+static vector short __ATTRS_o_ai
+vec_vsubshs(vector short __a, vector short __b)
+{
+  return __builtin_altivec_vsubshs(__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_vsubshs(vector bool short __a, vector short __b)
+{
+  return __builtin_altivec_vsubshs((vector short)__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_vsubshs(vector short __a, vector bool short __b)
+{
+  return __builtin_altivec_vsubshs(__a, (vector short)__b);
+}
+
+/* vec_vsubuhs */
+
+static vector unsigned short __ATTRS_o_ai
+vec_vsubuhs(vector unsigned short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vsubuhs(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vsubuhs(vector bool short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vsubuhs((vector unsigned short)__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vsubuhs(vector unsigned short __a, vector bool short __b)
+{
+  return __builtin_altivec_vsubuhs(__a, (vector unsigned short)__b);
+}
+
+/* vec_vsubsws */
+
+static vector int __ATTRS_o_ai
+vec_vsubsws(vector int __a, vector int __b)
+{
+  return __builtin_altivec_vsubsws(__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_vsubsws(vector bool int __a, vector int __b)
+{
+  return __builtin_altivec_vsubsws((vector int)__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_vsubsws(vector int __a, vector bool int __b)
+{
+  return __builtin_altivec_vsubsws(__a, (vector int)__b);
+}
+
+/* vec_vsubuws */
+
+static vector unsigned int __ATTRS_o_ai
+vec_vsubuws(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vsubuws(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vsubuws(vector bool int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vsubuws((vector unsigned int)__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vsubuws(vector unsigned int __a, vector bool int __b)
+{
+  return __builtin_altivec_vsubuws(__a, (vector unsigned int)__b);
+}
+
+/* vec_sum4s */
+
+static vector int __ATTRS_o_ai
+vec_sum4s(vector signed char __a, vector int __b)
+{
+  return __builtin_altivec_vsum4sbs(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_sum4s(vector unsigned char __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vsum4ubs(__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_sum4s(vector signed short __a, vector int __b)
+{
+  return __builtin_altivec_vsum4shs(__a, __b);
+}
+
+/* vec_vsum4sbs */
+
+static vector int __attribute__((__always_inline__))
+vec_vsum4sbs(vector signed char __a, vector int __b)
+{
+  return __builtin_altivec_vsum4sbs(__a, __b);
+}
+
+/* vec_vsum4ubs */
+
+static vector unsigned int __attribute__((__always_inline__))
+vec_vsum4ubs(vector unsigned char __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vsum4ubs(__a, __b);
+}
+
+/* vec_vsum4shs */
+
+static vector int __attribute__((__always_inline__))
+vec_vsum4shs(vector signed short __a, vector int __b)
+{
+  return __builtin_altivec_vsum4shs(__a, __b);
+}
+
+/* vec_sum2s */
+
+/* The vsum2sws instruction has a big-endian bias, so that the second
+   input vector and the result always reference big-endian elements
+   1 and 3 (little-endian elements 2 and 0).  For ease of porting, the
+   programmer wants elements 1 and 3 in both cases, so for little
+   endian we must perform some permutes.  */
+
+static vector signed int __attribute__((__always_inline__))
+vec_sum2s(vector int __a, vector int __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  vector int __c = (vector signed int)
+    vec_perm(__b, __b, (vector unsigned char)
+	     (4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11));
+  __c = __builtin_altivec_vsum2sws(__a, __c);
+  return (vector signed int)
+    vec_perm(__c, __c, (vector unsigned char)
+	     (4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11));
+#else
+  return __builtin_altivec_vsum2sws(__a, __b);
+#endif
+}
+
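+/* Illustrative sketch (editor's note, not part of the upstream header).
+   Assuming the documented vsum2sws semantics and hypothetical inputs,
+   the permutes above keep the user-visible element positions the same
+   on either endianness:
+
+     vector signed int a = (vector signed int)(1, 2, 3, 4);
+     vector signed int b = (vector signed int)(10, 20, 30, 40);
+     vector signed int r = vec_sum2s(a, b);
+
+   Here r[1] == 1 + 2 + 20 == 23 and r[3] == 3 + 4 + 40 == 47 on both big
+   and little endian; elements 0 and 2 of r are zero, and each sum
+   saturates if it overflows a signed word.  */
+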
+/* vec_vsum2sws */
+
+static vector signed int __attribute__((__always_inline__))
+vec_vsum2sws(vector int __a, vector int __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  vector int __c = (vector signed int)
+    vec_perm(__b, __b, (vector unsigned char)
+	     (4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11));
+  __c = __builtin_altivec_vsum2sws(__a, __c);
+  return (vector signed int)
+    vec_perm(__c, __c, (vector unsigned char)
+	     (4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11));
+#else
+  return __builtin_altivec_vsum2sws(__a, __b);
+#endif
+}
+
+/* vec_sums */
+
+/* The vsumsws instruction has a big-endian bias, so that the second
+   input vector and the result always reference big-endian element 3
+   (little-endian element 0).  For ease of porting, the programmer
+   wants element 3 in both cases, so for little endian we must perform
+   some permutes.  */
+
+static vector signed int __attribute__((__always_inline__))
+vec_sums(vector signed int __a, vector signed int __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  __b = (vector signed int)
+    vec_perm(__b, __b, (vector unsigned char)
+	     (12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11));
+  __b = __builtin_altivec_vsumsws(__a, __b);
+  return (vector signed int)
+    vec_perm(__b, __b, (vector unsigned char)
+	     (4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3));
+#else
+  return __builtin_altivec_vsumsws(__a, __b);
+#endif
+}
+
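+/* Illustrative sketch (editor's note, not part of the upstream header).
+   Assuming the documented vsumsws semantics and hypothetical inputs,
+   the full sum lands in element 3 on either endianness:
+
+     vector signed int a = (vector signed int)(1, 2, 3, 4);
+     vector signed int b = (vector signed int)(0, 0, 0, 40);
+     vector signed int r = vec_sums(a, b);
+
+   Here r[3] == 1 + 2 + 3 + 4 + 40 == 50 on both big and little endian;
+   elements 0 through 2 of r are zero, and the sum saturates if it
+   overflows a signed word.  */
+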
+/* vec_vsumsws */
+
+static vector signed int __attribute__((__always_inline__))
+vec_vsumsws(vector signed int __a, vector signed int __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  __b = (vector signed int)
+    vec_perm(__b, __b, (vector unsigned char)
+	     (12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11));
+  __b = __builtin_altivec_vsumsws(__a, __b);
+  return (vector signed int)
+    vec_perm(__b, __b, (vector unsigned char)
+	     (4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3));
+#else
+  return __builtin_altivec_vsumsws(__a, __b);
+#endif
+}
+
+/* vec_trunc */
+
+static vector float __attribute__((__always_inline__))
+vec_trunc(vector float __a)
+{
+  return __builtin_altivec_vrfiz(__a);
+}
+
+/* vec_vrfiz */
+
+static vector float __attribute__((__always_inline__))
+vec_vrfiz(vector float __a)
+{
+  return __builtin_altivec_vrfiz(__a);
+}
+
+/* vec_unpackh */
+
+/* The vector unpack instructions all have a big-endian bias, so for
+   little endian we must reverse the meanings of "high" and "low."  */
+
+static vector short __ATTRS_o_ai
+vec_unpackh(vector signed char __a)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupklsb((vector char)__a);
+#else
+  return __builtin_altivec_vupkhsb((vector char)__a);
+#endif
+}
+
+static vector bool short __ATTRS_o_ai
+vec_unpackh(vector bool char __a)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool short)__builtin_altivec_vupklsb((vector char)__a);
+#else
+  return (vector bool short)__builtin_altivec_vupkhsb((vector char)__a);
+#endif
+}
+
+static vector int __ATTRS_o_ai
+vec_unpackh(vector short __a)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupklsh(__a);
+#else
+  return __builtin_altivec_vupkhsh(__a);
+#endif
+}
+
+static vector bool int __ATTRS_o_ai
+vec_unpackh(vector bool short __a)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool int)__builtin_altivec_vupklsh((vector short)__a);
+#else
+  return (vector bool int)__builtin_altivec_vupkhsh((vector short)__a);
+#endif
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_unpackh(vector pixel __a)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector unsigned int)__builtin_altivec_vupklpx((vector short)__a);
+#else
+  return (vector unsigned int)__builtin_altivec_vupkhpx((vector short)__a);
+#endif
+}
+
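+/* Illustrative sketch (editor's note, not part of the upstream header).
+   Assuming a hypothetical input, the high/low swap above keeps element
+   numbering stable across endianness:
+
+     vector signed char c =
+       (vector signed char)(0, -1, 2, -3, 4, -5, 6, -7,
+                            8, -9, 10, -11, 12, -13, 14, -15);
+     vector short h = vec_unpackh(c);
+     vector short l = vec_unpackl(c);
+
+   Here h sign-extends elements 0 through 7, giving (0, -1, 2, -3, 4, -5,
+   6, -7), and l sign-extends elements 8 through 15, on both big and
+   little endian.  */
+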
+/* vec_vupkhsb */
+
+static vector short __ATTRS_o_ai
+vec_vupkhsb(vector signed char __a)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupklsb((vector char)__a);
+#else
+  return __builtin_altivec_vupkhsb((vector char)__a);
+#endif
+}
+
+static vector bool short __ATTRS_o_ai
+vec_vupkhsb(vector bool char __a)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool short)__builtin_altivec_vupklsb((vector char)__a);
+#else
+  return (vector bool short)__builtin_altivec_vupkhsb((vector char)__a);
+#endif
+}
+
+/* vec_vupkhsh */
+
+static vector int __ATTRS_o_ai
+vec_vupkhsh(vector short __a)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupklsh(__a);
+#else
+  return __builtin_altivec_vupkhsh(__a);
+#endif
+}
+
+static vector bool int __ATTRS_o_ai
+vec_vupkhsh(vector bool short __a)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool int)__builtin_altivec_vupklsh((vector short)__a);
+#else
+  return (vector bool int)__builtin_altivec_vupkhsh((vector short)__a);
+#endif
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vupkhsh(vector pixel __a)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector unsigned int)__builtin_altivec_vupklpx((vector short)__a);
+#else
+  return (vector unsigned int)__builtin_altivec_vupkhpx((vector short)__a);
+#endif
+}
+
+/* vec_unpackl */
+
+static vector short __ATTRS_o_ai
+vec_unpackl(vector signed char __a)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupkhsb((vector char)__a);
+#else
+  return __builtin_altivec_vupklsb((vector char)__a);
+#endif
+}
+
+static vector bool short __ATTRS_o_ai
+vec_unpackl(vector bool char __a)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool short)__builtin_altivec_vupkhsb((vector char)__a);
+#else
+  return (vector bool short)__builtin_altivec_vupklsb((vector char)__a);
+#endif
+}
+
+static vector int __ATTRS_o_ai
+vec_unpackl(vector short __a)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupkhsh(__a);
+#else
+  return __builtin_altivec_vupklsh(__a);
+#endif
+}
+
+static vector bool int __ATTRS_o_ai
+vec_unpackl(vector bool short __a)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool int)__builtin_altivec_vupkhsh((vector short)__a);
+#else
+  return (vector bool int)__builtin_altivec_vupklsh((vector short)__a);
+#endif
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_unpackl(vector pixel __a)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector unsigned int)__builtin_altivec_vupkhpx((vector short)__a);
+#else
+  return (vector unsigned int)__builtin_altivec_vupklpx((vector short)__a);
+#endif
+}
+
+/* vec_vupklsb */
+
+static vector short __ATTRS_o_ai
+vec_vupklsb(vector signed char __a)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupkhsb((vector char)__a);
+#else
+  return __builtin_altivec_vupklsb((vector char)__a);
+#endif
+}
+
+static vector bool short __ATTRS_o_ai
+vec_vupklsb(vector bool char __a)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool short)__builtin_altivec_vupkhsb((vector char)__a);
+#else
+  return (vector bool short)__builtin_altivec_vupklsb((vector char)__a);
+#endif
+}
+
+/* vec_vupklsh */
+
+static vector int __ATTRS_o_ai
+vec_vupklsh(vector short __a)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupkhsh(__a);
+#else
+  return __builtin_altivec_vupklsh(__a);
+#endif
+}
+
+static vector bool int __ATTRS_o_ai
+vec_vupklsh(vector bool short __a)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool int)__builtin_altivec_vupkhsh((vector short)__a);
+#else
+  return (vector bool int)__builtin_altivec_vupklsh((vector short)__a);
+#endif
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vupklsh(vector pixel __a)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector unsigned int)__builtin_altivec_vupkhpx((vector short)__a);
+#else
+  return (vector unsigned int)__builtin_altivec_vupklpx((vector short)__a);
+#endif
+}
+
+/* vec_xor */
+
+#define __builtin_altivec_vxor vec_xor
+
+static vector signed char __ATTRS_o_ai
+vec_xor(vector signed char __a, vector signed char __b)
+{
+  return __a ^ __b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_xor(vector bool char __a, vector signed char __b)
+{
+  return (vector signed char)__a ^ __b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_xor(vector signed char __a, vector bool char __b)
+{
+  return __a ^ (vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_xor(vector unsigned char __a, vector unsigned char __b)
+{
+  return __a ^ __b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_xor(vector bool char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)__a ^ __b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_xor(vector unsigned char __a, vector bool char __b)
+{
+  return __a ^ (vector unsigned char)__b;
+}
+
+static vector bool char __ATTRS_o_ai
+vec_xor(vector bool char __a, vector bool char __b)
+{
+  return __a ^ __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_xor(vector short __a, vector short __b)
+{
+  return __a ^ __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_xor(vector bool short __a, vector short __b)
+{
+  return (vector short)__a ^ __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_xor(vector short __a, vector bool short __b)
+{
+  return __a ^ (vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_xor(vector unsigned short __a, vector unsigned short __b)
+{
+  return __a ^ __b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_xor(vector bool short __a, vector unsigned short __b)
+{
+  return (vector unsigned short)__a ^ __b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_xor(vector unsigned short __a, vector bool short __b)
+{
+  return __a ^ (vector unsigned short)__b;
+}
+
+static vector bool short __ATTRS_o_ai
+vec_xor(vector bool short __a, vector bool short __b)
+{
+  return __a ^ __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_xor(vector int __a, vector int __b)
+{
+  return __a ^ __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_xor(vector bool int __a, vector int __b)
+{
+  return (vector int)__a ^ __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_xor(vector int __a, vector bool int __b)
+{
+  return __a ^ (vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_xor(vector unsigned int __a, vector unsigned int __b)
+{
+  return __a ^ __b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_xor(vector bool int __a, vector unsigned int __b)
+{
+  return (vector unsigned int)__a ^ __b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_xor(vector unsigned int __a, vector bool int __b)
+{
+  return __a ^ (vector unsigned int)__b;
+}
+
+static vector bool int __ATTRS_o_ai
+vec_xor(vector bool int __a, vector bool int __b)
+{
+  return __a ^ __b;
+}
+
+static vector float __ATTRS_o_ai
+vec_xor(vector float __a, vector float __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a ^ (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai
+vec_xor(vector bool int __a, vector float __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a ^ (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai
+vec_xor(vector float __a, vector bool int __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a ^ (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+/* vec_vxor */
+
+static vector signed char __ATTRS_o_ai
+vec_vxor(vector signed char __a, vector signed char __b)
+{
+  return __a ^ __b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vxor(vector bool char __a, vector signed char __b)
+{
+  return (vector signed char)__a ^ __b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vxor(vector signed char __a, vector bool char __b)
+{
+  return __a ^ (vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vxor(vector unsigned char __a, vector unsigned char __b)
+{
+  return __a ^ __b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vxor(vector bool char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)__a ^ __b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vxor(vector unsigned char __a, vector bool char __b)
+{
+  return __a ^ (vector unsigned char)__b;
+}
+
+static vector bool char __ATTRS_o_ai
+vec_vxor(vector bool char __a, vector bool char __b)
+{
+  return __a ^ __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_vxor(vector short __a, vector short __b)
+{
+  return __a ^ __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_vxor(vector bool short __a, vector short __b)
+{
+  return (vector short)__a ^ __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_vxor(vector short __a, vector bool short __b)
+{
+  return __a ^ (vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vxor(vector unsigned short __a, vector unsigned short __b)
+{
+  return __a ^ __b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vxor(vector bool short __a, vector unsigned short __b)
+{
+  return (vector unsigned short)__a ^ __b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vxor(vector unsigned short __a, vector bool short __b)
+{
+  return __a ^ (vector unsigned short)__b;
+}
+
+static vector bool short __ATTRS_o_ai
+vec_vxor(vector bool short __a, vector bool short __b)
+{
+  return __a ^ __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_vxor(vector int __a, vector int __b)
+{
+  return __a ^ __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_vxor(vector bool int __a, vector int __b)
+{
+  return (vector int)__a ^ __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_vxor(vector int __a, vector bool int __b)
+{
+  return __a ^ (vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vxor(vector unsigned int __a, vector unsigned int __b)
+{
+  return __a ^ __b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vxor(vector bool int __a, vector unsigned int __b)
+{
+  return (vector unsigned int)__a ^ __b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vxor(vector unsigned int __a, vector bool int __b)
+{
+  return __a ^ (vector unsigned int)__b;
+}
+
+static vector bool int __ATTRS_o_ai
+vec_vxor(vector bool int __a, vector bool int __b)
+{
+  return __a ^ __b;
+}
+
+static vector float __ATTRS_o_ai
+vec_vxor(vector float __a, vector float __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a ^ (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai
+vec_vxor(vector bool int __a, vector float __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a ^ (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai
+vec_vxor(vector float __a, vector bool int __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a ^ (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+/* ------------------------ extensions for CBEA ----------------------------- */
+
+/* vec_extract */
+
+static signed char __ATTRS_o_ai
+vec_extract(vector signed char __a, int __b)
+{
+  return __a[__b];
+}
+
+static unsigned char __ATTRS_o_ai
+vec_extract(vector unsigned char __a, int __b)
+{
+  return __a[__b];
+}
+
+static short __ATTRS_o_ai
+vec_extract(vector short __a, int __b)
+{
+  return __a[__b];
+}
+
+static unsigned short __ATTRS_o_ai
+vec_extract(vector unsigned short __a, int __b)
+{
+  return __a[__b];
+}
+
+static int __ATTRS_o_ai
+vec_extract(vector int __a, int __b)
+{
+  return __a[__b];
+}
+
+static unsigned int __ATTRS_o_ai
+vec_extract(vector unsigned int __a, int __b)
+{
+  return __a[__b];
+}
+
+static float __ATTRS_o_ai
+vec_extract(vector float __a, int __b)
+{
+  return __a[__b];
+}
+
+/* vec_insert */
+
+static vector signed char __ATTRS_o_ai
+vec_insert(signed char __a, vector signed char __b, int __c)
+{
+  __b[__c] = __a;
+  return __b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_insert(unsigned char __a, vector unsigned char __b, int __c)
+{
+  __b[__c] = __a;
+  return __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_insert(short __a, vector short __b, int __c)
+{
+  __b[__c] = __a;
+  return __b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_insert(unsigned short __a, vector unsigned short __b, int __c)
+{
+  __b[__c] = __a;
+  return __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_insert(int __a, vector int __b, int __c)
+{
+  __b[__c] = __a;
+  return __b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_insert(unsigned int __a, vector unsigned int __b, int __c)
+{
+  __b[__c] = __a;
+  return __b;
+}
+
+static vector float __ATTRS_o_ai
+vec_insert(float __a, vector float __b, int __c)
+{
+  __b[__c] = __a;
+  return __b;
+}
+
+/* vec_lvlx */
+
+static vector signed char __ATTRS_o_ai
+vec_lvlx(int __a, const signed char *__b)
+{
+  return vec_perm(vec_ld(__a, __b),
+                  (vector signed char)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static vector signed char __ATTRS_o_ai
+vec_lvlx(int __a, const vector signed char *__b)
+{
+  return vec_perm(vec_ld(__a, __b),
+                  (vector signed char)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvlx(int __a, const unsigned char *__b)
+{
+  return vec_perm(vec_ld(__a, __b),
+                  (vector unsigned char)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvlx(int __a, const vector unsigned char *__b)
+{
+  return vec_perm(vec_ld(__a, __b),
+                  (vector unsigned char)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector bool char __ATTRS_o_ai
+vec_lvlx(int __a, const vector bool char *__b)
+{
+  return vec_perm(vec_ld(__a, __b),
+                  (vector bool char)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector short __ATTRS_o_ai
+vec_lvlx(int __a, const short *__b)
+{
+  return vec_perm(vec_ld(__a, __b),
+                  (vector short)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static vector short __ATTRS_o_ai
+vec_lvlx(int __a, const vector short *__b)
+{
+  return vec_perm(vec_ld(__a, __b),
+                  (vector short)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_lvlx(int __a, const unsigned short *__b)
+{
+  return vec_perm(vec_ld(__a, __b),
+                  (vector unsigned short)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_lvlx(int __a, const vector unsigned short *__b)
+{
+  return vec_perm(vec_ld(__a, __b),
+                  (vector unsigned short)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector bool short __ATTRS_o_ai
+vec_lvlx(int __a, const vector bool short *__b)
+{
+  return vec_perm(vec_ld(__a, __b),
+                  (vector bool short)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector pixel __ATTRS_o_ai
+vec_lvlx(int __a, const vector pixel *__b)
+{
+  return vec_perm(vec_ld(__a, __b),
+                  (vector pixel)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector int __ATTRS_o_ai
+vec_lvlx(int __a, const int *__b)
+{
+  return vec_perm(vec_ld(__a, __b),
+                  (vector int)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static vector int __ATTRS_o_ai
+vec_lvlx(int __a, const vector int *__b)
+{
+  return vec_perm(vec_ld(__a, __b),
+                  (vector int)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_lvlx(int __a, const unsigned int *__b)
+{
+  return vec_perm(vec_ld(__a, __b),
+                  (vector unsigned int)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_lvlx(int __a, const vector unsigned int *__b)
+{
+  return vec_perm(vec_ld(__a, __b),
+                  (vector unsigned int)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector bool int __ATTRS_o_ai
+vec_lvlx(int __a, const vector bool int *__b)
+{
+  return vec_perm(vec_ld(__a, __b),
+                  (vector bool int)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector float __ATTRS_o_ai
+vec_lvlx(int __a, const float *__b)
+{
+  return vec_perm(vec_ld(__a, __b),
+                  (vector float)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static vector float __ATTRS_o_ai
+vec_lvlx(int __a, const vector float *__b)
+{
+  return vec_perm(vec_ld(__a, __b),
+                  (vector float)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+/* vec_lvlxl */
+
+static vector signed char __ATTRS_o_ai
+vec_lvlxl(int __a, const signed char *__b)
+{
+  return vec_perm(vec_ldl(__a, __b),
+                  (vector signed char)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static vector signed char __ATTRS_o_ai
+vec_lvlxl(int __a, const vector signed char *__b)
+{
+  return vec_perm(vec_ldl(__a, __b),
+                  (vector signed char)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvlxl(int __a, const unsigned char *__b)
+{
+  return vec_perm(vec_ldl(__a, __b),
+                  (vector unsigned char)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvlxl(int __a, const vector unsigned char *__b)
+{
+  return vec_perm(vec_ldl(__a, __b),
+                  (vector unsigned char)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector bool char __ATTRS_o_ai
+vec_lvlxl(int __a, const vector bool char *__b)
+{
+  return vec_perm(vec_ldl(__a, __b),
+                  (vector bool char)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector short __ATTRS_o_ai
+vec_lvlxl(int __a, const short *__b)
+{
+  return vec_perm(vec_ldl(__a, __b),
+                  (vector short)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static vector short __ATTRS_o_ai
+vec_lvlxl(int __a, const vector short *__b)
+{
+  return vec_perm(vec_ldl(__a, __b),
+                  (vector short)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_lvlxl(int __a, const unsigned short *__b)
+{
+  return vec_perm(vec_ldl(__a, __b),
+                  (vector unsigned short)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_lvlxl(int __a, const vector unsigned short *__b)
+{
+  return vec_perm(vec_ldl(__a, __b),
+                  (vector unsigned short)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector bool short __ATTRS_o_ai
+vec_lvlxl(int __a, const vector bool short *__b)
+{
+  return vec_perm(vec_ldl(__a, __b),
+                  (vector bool short)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector pixel __ATTRS_o_ai
+vec_lvlxl(int __a, const vector pixel *__b)
+{
+  return vec_perm(vec_ldl(__a, __b),
+                  (vector pixel)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector int __ATTRS_o_ai
+vec_lvlxl(int __a, const int *__b)
+{
+  return vec_perm(vec_ldl(__a, __b),
+                  (vector int)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static vector int __ATTRS_o_ai
+vec_lvlxl(int __a, const vector int *__b)
+{
+  return vec_perm(vec_ldl(__a, __b),
+                  (vector int)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_lvlxl(int __a, const unsigned int *__b)
+{
+  return vec_perm(vec_ldl(__a, __b),
+                  (vector unsigned int)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_lvlxl(int __a, const vector unsigned int *__b)
+{
+  return vec_perm(vec_ldl(__a, __b),
+                  (vector unsigned int)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector bool int __ATTRS_o_ai
+vec_lvlxl(int __a, const vector bool int *__b)
+{
+  return vec_perm(vec_ldl(__a, __b),
+                  (vector bool int)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector float __ATTRS_o_ai
+vec_lvlxl(int __a, const float *__b)
+{
+  return vec_perm(vec_ldl(__a, __b),
+                  (vector float)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static vector float __ATTRS_o_ai
+vec_lvlxl(int __a, const vector float *__b)
+{
+  return vec_perm(vec_ldl(__a, __b),
+                  (vector float)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+/* vec_lvrx */
+
+static vector signed char __ATTRS_o_ai
+vec_lvrx(int __a, const signed char *__b)
+{
+  return vec_perm((vector signed char)(0),
+                  vec_ld(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static vector signed char __ATTRS_o_ai
+vec_lvrx(int __a, const vector signed char *__b)
+{
+  return vec_perm((vector signed char)(0),
+                  vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvrx(int __a, const unsigned char *__b)
+{
+  return vec_perm((vector unsigned char)(0),
+                  vec_ld(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvrx(int __a, const vector unsigned char *__b)
+{
+  return vec_perm((vector unsigned char)(0),
+                  vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector bool char __ATTRS_o_ai
+vec_lvrx(int __a, const vector bool char *__b)
+{
+  return vec_perm((vector bool char)(0),
+                  vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector short __ATTRS_o_ai
+vec_lvrx(int __a, const short *__b)
+{
+  return vec_perm((vector short)(0),
+                  vec_ld(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static vector short __ATTRS_o_ai
+vec_lvrx(int __a, const vector short *__b)
+{
+  return vec_perm((vector short)(0),
+                  vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_lvrx(int __a, const unsigned short *__b)
+{
+  return vec_perm((vector unsigned short)(0),
+                  vec_ld(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_lvrx(int __a, const vector unsigned short *__b)
+{
+  return vec_perm((vector unsigned short)(0),
+                  vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector bool short __ATTRS_o_ai
+vec_lvrx(int __a, const vector bool short *__b)
+{
+  return vec_perm((vector bool short)(0),
+                  vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector pixel __ATTRS_o_ai
+vec_lvrx(int __a, const vector pixel *__b)
+{
+  return vec_perm((vector pixel)(0),
+                  vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector int __ATTRS_o_ai
+vec_lvrx(int __a, const int *__b)
+{
+  return vec_perm((vector int)(0),
+                  vec_ld(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static vector int __ATTRS_o_ai
+vec_lvrx(int __a, const vector int *__b)
+{
+  return vec_perm((vector int)(0),
+                  vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_lvrx(int __a, const unsigned int *__b)
+{
+  return vec_perm((vector unsigned int)(0),
+                  vec_ld(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_lvrx(int __a, const vector unsigned int *__b)
+{
+  return vec_perm((vector unsigned int)(0),
+                  vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector bool int __ATTRS_o_ai
+vec_lvrx(int __a, const vector bool int *__b)
+{
+  return vec_perm((vector bool int)(0),
+                  vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector float __ATTRS_o_ai
+vec_lvrx(int __a, const float *__b)
+{
+  return vec_perm((vector float)(0),
+                  vec_ld(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static vector float __ATTRS_o_ai
+vec_lvrx(int __a, const vector float *__b)
+{
+  return vec_perm((vector float)(0),
+                  vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+/* vec_lvrxl */
+
+static vector signed char __ATTRS_o_ai
+vec_lvrxl(int __a, const signed char *__b)
+{
+  return vec_perm((vector signed char)(0),
+                  vec_ldl(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static vector signed char __ATTRS_o_ai
+vec_lvrxl(int __a, const vector signed char *__b)
+{
+  return vec_perm((vector signed char)(0),
+                  vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvrxl(int __a, const unsigned char *__b)
+{
+  return vec_perm((vector unsigned char)(0),
+                  vec_ldl(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvrxl(int __a, const vector unsigned char *__b)
+{
+  return vec_perm((vector unsigned char)(0),
+                  vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector bool char __ATTRS_o_ai
+vec_lvrxl(int __a, const vector bool char *__b)
+{
+  return vec_perm((vector bool char)(0),
+                  vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector short __ATTRS_o_ai
+vec_lvrxl(int __a, const short *__b)
+{
+  return vec_perm((vector short)(0),
+                  vec_ldl(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static vector short __ATTRS_o_ai
+vec_lvrxl(int __a, const vector short *__b)
+{
+  return vec_perm((vector short)(0),
+                  vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_lvrxl(int __a, const unsigned short *__b)
+{
+  return vec_perm((vector unsigned short)(0),
+                  vec_ldl(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_lvrxl(int __a, const vector unsigned short *__b)
+{
+  return vec_perm((vector unsigned short)(0),
+                  vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector bool short __ATTRS_o_ai
+vec_lvrxl(int __a, const vector bool short *__b)
+{
+  return vec_perm((vector bool short)(0),
+                  vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector pixel __ATTRS_o_ai
+vec_lvrxl(int __a, const vector pixel *__b)
+{
+  return vec_perm((vector pixel)(0),
+                  vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector int __ATTRS_o_ai
+vec_lvrxl(int __a, const int *__b)
+{
+  return vec_perm((vector int)(0),
+                  vec_ldl(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static vector int __ATTRS_o_ai
+vec_lvrxl(int __a, const vector int *__b)
+{
+  return vec_perm((vector int)(0),
+                  vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_lvrxl(int __a, const unsigned int *__b)
+{
+  return vec_perm((vector unsigned int)(0),
+                  vec_ldl(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_lvrxl(int __a, const vector unsigned int *__b)
+{
+  return vec_perm((vector unsigned int)(0),
+                  vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector bool int __ATTRS_o_ai
+vec_lvrxl(int __a, const vector bool int *__b)
+{
+  return vec_perm((vector bool int)(0),
+                  vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector float __ATTRS_o_ai
+vec_lvrxl(int __a, const float *__b)
+{
+  return vec_perm((vector float)(0),
+                  vec_ldl(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static vector float __ATTRS_o_ai
+vec_lvrxl(int __a, const vector float *__b)
+{
+  return vec_perm((vector float)(0),
+                  vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+/* vec_stvlx */
+
+static void __ATTRS_o_ai
+vec_stvlx(vector signed char __a, int __b, signed char *__c)
+{
+  return vec_st(vec_perm(vec_lvrx(__b, __c),
+                         __a,
+                         vec_lvsr(__b, __c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlx(vector signed char __a, int __b, vector signed char *__c)
+{
+  return vec_st(vec_perm(vec_lvrx(__b, __c),
+                         __a,
+                         vec_lvsr(__b, (unsigned char *)__c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlx(vector unsigned char __a, int __b, unsigned char *__c)
+{
+  return vec_st(vec_perm(vec_lvrx(__b, __c),
+                         __a,
+                         vec_lvsr(__b, __c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlx(vector unsigned char __a, int __b, vector unsigned char *__c)
+{
+  return vec_st(vec_perm(vec_lvrx(__b, __c),
+                         __a,
+                         vec_lvsr(__b, (unsigned char *)__c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlx(vector bool char __a, int __b, vector bool char *__c)
+{
+  return vec_st(vec_perm(vec_lvrx(__b, __c),
+                         __a,
+                         vec_lvsr(__b, (unsigned char *)__c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlx(vector short __a, int __b, short *__c)
+{
+  return vec_st(vec_perm(vec_lvrx(__b, __c),
+                         __a,
+                         vec_lvsr(__b, __c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlx(vector short __a, int __b, vector short *__c)
+{
+  return vec_st(vec_perm(vec_lvrx(__b, __c),
+                         __a,
+                         vec_lvsr(__b, (unsigned char *)__c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlx(vector unsigned short __a, int __b, unsigned short *__c)
+{
+  return vec_st(vec_perm(vec_lvrx(__b, __c),
+                         __a,
+                         vec_lvsr(__b, __c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlx(vector unsigned short __a, int __b, vector unsigned short *__c)
+{
+  return vec_st(vec_perm(vec_lvrx(__b, __c),
+                         __a,
+                         vec_lvsr(__b, (unsigned char *)__c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlx(vector bool short __a, int __b, vector bool short *__c)
+{
+  return vec_st(vec_perm(vec_lvrx(__b, __c),
+                         __a,
+                         vec_lvsr(__b, (unsigned char *)__c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlx(vector pixel __a, int __b, vector pixel *__c)
+{
+  return vec_st(vec_perm(vec_lvrx(__b, __c),
+                         __a,
+                         vec_lvsr(__b, (unsigned char *)__c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlx(vector int __a, int __b, int *__c)
+{
+  return vec_st(vec_perm(vec_lvrx(__b, __c),
+                         __a,
+                         vec_lvsr(__b, __c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlx(vector int __a, int __b, vector int *__c)
+{
+  return vec_st(vec_perm(vec_lvrx(__b, __c),
+                         __a,
+                         vec_lvsr(__b, (unsigned char *)__c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlx(vector unsigned int __a, int __b, unsigned int *__c)
+{
+  return vec_st(vec_perm(vec_lvrx(__b, __c),
+                         __a,
+                         vec_lvsr(__b, __c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlx(vector unsigned int __a, int __b, vector unsigned int *__c)
+{
+  return vec_st(vec_perm(vec_lvrx(__b, __c),
+                         __a,
+                         vec_lvsr(__b, (unsigned char *)__c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlx(vector bool int __a, int __b, vector bool int *__c)
+{
+  return vec_st(vec_perm(vec_lvrx(__b, __c),
+                         __a,
+                         vec_lvsr(__b, (unsigned char *)__c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlx(vector float __a, int __b, vector float *__c)
+{
+  return vec_st(vec_perm(vec_lvrx(__b, __c),
+                         __a,
+                         vec_lvsr(__b, (unsigned char *)__c)),
+                __b, __c);
+}
+
+/* vec_stvlxl */
+
+static void __ATTRS_o_ai
+vec_stvlxl(vector signed char __a, int __b, signed char *__c)
+{
+  return vec_stl(vec_perm(vec_lvrx(__b, __c),
+                          __a,
+                          vec_lvsr(__b, __c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlxl(vector signed char __a, int __b, vector signed char *__c)
+{
+  return vec_stl(vec_perm(vec_lvrx(__b, __c),
+                          __a,
+                          vec_lvsr(__b, (unsigned char *)__c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlxl(vector unsigned char __a, int __b, unsigned char *__c)
+{
+  return vec_stl(vec_perm(vec_lvrx(__b, __c),
+                          __a,
+                          vec_lvsr(__b, __c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlxl(vector unsigned char __a, int __b, vector unsigned char *__c)
+{
+  return vec_stl(vec_perm(vec_lvrx(__b, __c),
+                          __a,
+                          vec_lvsr(__b, (unsigned char *)__c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlxl(vector bool char __a, int __b, vector bool char *__c)
+{
+  return vec_stl(vec_perm(vec_lvrx(__b, __c),
+                          __a,
+                          vec_lvsr(__b, (unsigned char *)__c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlxl(vector short __a, int __b, short *__c)
+{
+  return vec_stl(vec_perm(vec_lvrx(__b, __c),
+                          __a,
+                          vec_lvsr(__b, __c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlxl(vector short __a, int __b, vector short *__c)
+{
+  return vec_stl(vec_perm(vec_lvrx(__b, __c),
+                          __a,
+                          vec_lvsr(__b, (unsigned char *)__c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlxl(vector unsigned short __a, int __b, unsigned short *__c)
+{
+  return vec_stl(vec_perm(vec_lvrx(__b, __c),
+                          __a,
+                          vec_lvsr(__b, __c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlxl(vector unsigned short __a, int __b, vector unsigned short *__c)
+{
+  return vec_stl(vec_perm(vec_lvrx(__b, __c),
+                          __a,
+                          vec_lvsr(__b, (unsigned char *)__c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlxl(vector bool short __a, int __b, vector bool short *__c)
+{
+  return vec_stl(vec_perm(vec_lvrx(__b, __c),
+                          __a,
+                          vec_lvsr(__b, (unsigned char *)__c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlxl(vector pixel __a, int __b, vector pixel *__c)
+{
+  return vec_stl(vec_perm(vec_lvrx(__b, __c),
+                          __a,
+                          vec_lvsr(__b, (unsigned char *)__c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlxl(vector int __a, int __b, int *__c)
+{
+  return vec_stl(vec_perm(vec_lvrx(__b, __c),
+                          __a,
+                          vec_lvsr(__b, __c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlxl(vector int __a, int __b, vector int *__c)
+{
+  return vec_stl(vec_perm(vec_lvrx(__b, __c),
+                          __a,
+                          vec_lvsr(__b, (unsigned char *)__c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlxl(vector unsigned int __a, int __b, unsigned int *__c)
+{
+  return vec_stl(vec_perm(vec_lvrx(__b, __c),
+                          __a,
+                          vec_lvsr(__b, __c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlxl(vector unsigned int __a, int __b, vector unsigned int *__c)
+{
+  return vec_stl(vec_perm(vec_lvrx(__b, __c),
+                          __a,
+                          vec_lvsr(__b, (unsigned char *)__c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlxl(vector bool int __a, int __b, vector bool int *__c)
+{
+  return vec_stl(vec_perm(vec_lvrx(__b, __c),
+                          __a,
+                          vec_lvsr(__b, (unsigned char *)__c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlxl(vector float __a, int __b, vector float *__c)
+{
+  return vec_stl(vec_perm(vec_lvrx(__b, __c),
+                          __a,
+                          vec_lvsr(__b, (unsigned char *)__c)),
+                 __b, __c);
+}
+
+/* vec_stvrx */
+
+static void __ATTRS_o_ai
+vec_stvrx(vector signed char __a, int __b, signed char *__c)
+{
+  return vec_st(vec_perm(__a,
+                         vec_lvlx(__b, __c),
+                         vec_lvsr(__b, __c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrx(vector signed char __a, int __b, vector signed char *__c)
+{
+  return vec_st(vec_perm(__a,
+                         vec_lvlx(__b, __c),
+                         vec_lvsr(__b, (unsigned char *)__c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrx(vector unsigned char __a, int __b, unsigned char *__c)
+{
+  return vec_st(vec_perm(__a,
+                         vec_lvlx(__b, __c),
+                         vec_lvsr(__b, __c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrx(vector unsigned char __a, int __b, vector unsigned char *__c)
+{
+  return vec_st(vec_perm(__a,
+                         vec_lvlx(__b, __c),
+                         vec_lvsr(__b, (unsigned char *)__c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrx(vector bool char __a, int __b, vector bool char *__c)
+{
+  return vec_st(vec_perm(__a,
+                         vec_lvlx(__b, __c),
+                         vec_lvsr(__b, (unsigned char *)__c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrx(vector short __a, int __b, short *__c)
+{
+  return vec_st(vec_perm(__a,
+                         vec_lvlx(__b, __c),
+                         vec_lvsr(__b, __c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrx(vector short __a, int __b, vector short *__c)
+{
+  return vec_st(vec_perm(__a,
+                         vec_lvlx(__b, __c),
+                         vec_lvsr(__b, (unsigned char *)__c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrx(vector unsigned short __a, int __b, unsigned short *__c)
+{
+  return vec_st(vec_perm(__a,
+                         vec_lvlx(__b, __c),
+                         vec_lvsr(__b, __c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrx(vector unsigned short __a, int __b, vector unsigned short *__c)
+{
+  return vec_st(vec_perm(__a,
+                         vec_lvlx(__b, __c),
+                         vec_lvsr(__b, (unsigned char *)__c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrx(vector bool short __a, int __b, vector bool short *__c)
+{
+  return vec_st(vec_perm(__a,
+                         vec_lvlx(__b, __c),
+                         vec_lvsr(__b, (unsigned char *)__c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrx(vector pixel __a, int __b, vector pixel *__c)
+{
+  return vec_st(vec_perm(__a,
+                         vec_lvlx(__b, __c),
+                         vec_lvsr(__b, (unsigned char *)__c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrx(vector int __a, int __b, int *__c)
+{
+  return vec_st(vec_perm(__a,
+                         vec_lvlx(__b, __c),
+                         vec_lvsr(__b, __c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrx(vector int __a, int __b, vector int *__c)
+{
+  return vec_st(vec_perm(__a,
+                         vec_lvlx(__b, __c),
+                         vec_lvsr(__b, (unsigned char *)__c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrx(vector unsigned int __a, int __b, unsigned int *__c)
+{
+  return vec_st(vec_perm(__a,
+                         vec_lvlx(__b, __c),
+                         vec_lvsr(__b, __c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrx(vector unsigned int __a, int __b, vector unsigned int *__c)
+{
+  return vec_st(vec_perm(__a,
+                         vec_lvlx(__b, __c),
+                         vec_lvsr(__b, (unsigned char *)__c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrx(vector bool int __a, int __b, vector bool int *__c)
+{
+  return vec_st(vec_perm(__a,
+                         vec_lvlx(__b, __c),
+                         vec_lvsr(__b, (unsigned char *)__c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrx(vector float __a, int __b, vector float *__c)
+{
+  return vec_st(vec_perm(__a,
+                         vec_lvlx(__b, __c),
+                         vec_lvsr(__b, (unsigned char *)__c)),
+                __b, __c);
+}
+
+/* vec_stvrxl */
+
+static void __ATTRS_o_ai
+vec_stvrxl(vector signed char __a, int __b, signed char *__c)
+{
+  return vec_stl(vec_perm(__a,
+                          vec_lvlx(__b, __c),
+                          vec_lvsr(__b, __c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrxl(vector signed char __a, int __b, vector signed char *__c)
+{
+  return vec_stl(vec_perm(__a,
+                          vec_lvlx(__b, __c),
+                          vec_lvsr(__b, (unsigned char *)__c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrxl(vector unsigned char __a, int __b, unsigned char *__c)
+{
+  return vec_stl(vec_perm(__a,
+                          vec_lvlx(__b, __c),
+                          vec_lvsr(__b, __c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrxl(vector unsigned char __a, int __b, vector unsigned char *__c)
+{
+  return vec_stl(vec_perm(__a,
+                          vec_lvlx(__b, __c),
+                          vec_lvsr(__b, (unsigned char *)__c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrxl(vector bool char __a, int __b, vector bool char *__c)
+{
+  return vec_stl(vec_perm(__a,
+                          vec_lvlx(__b, __c),
+                          vec_lvsr(__b, (unsigned char *)__c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrxl(vector short __a, int __b, short *__c)
+{
+  return vec_stl(vec_perm(__a,
+                          vec_lvlx(__b, __c),
+                          vec_lvsr(__b, __c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrxl(vector short __a, int __b, vector short *__c)
+{
+  return vec_stl(vec_perm(__a,
+                          vec_lvlx(__b, __c),
+                          vec_lvsr(__b, (unsigned char *)__c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrxl(vector unsigned short __a, int __b, unsigned short *__c)
+{
+  return vec_stl(vec_perm(__a,
+                          vec_lvlx(__b, __c),
+                          vec_lvsr(__b, __c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrxl(vector unsigned short __a, int __b, vector unsigned short *__c)
+{
+  return vec_stl(vec_perm(__a,
+                          vec_lvlx(__b, __c),
+                          vec_lvsr(__b, (unsigned char *)__c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrxl(vector bool short __a, int __b, vector bool short *__c)
+{
+  return vec_stl(vec_perm(__a,
+                          vec_lvlx(__b, __c),
+                          vec_lvsr(__b, (unsigned char *)__c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrxl(vector pixel __a, int __b, vector pixel *__c)
+{
+  return vec_stl(vec_perm(__a,
+                          vec_lvlx(__b, __c),
+                          vec_lvsr(__b, (unsigned char *)__c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrxl(vector int __a, int __b, int *__c)
+{
+  return vec_stl(vec_perm(__a,
+                          vec_lvlx(__b, __c),
+                          vec_lvsr(__b, __c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrxl(vector int __a, int __b, vector int *__c)
+{
+  return vec_stl(vec_perm(__a,
+                          vec_lvlx(__b, __c),
+                          vec_lvsr(__b, (unsigned char *)__c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrxl(vector unsigned int __a, int __b, unsigned int *__c)
+{
+  return vec_stl(vec_perm(__a,
+                          vec_lvlx(__b, __c),
+                          vec_lvsr(__b, __c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrxl(vector unsigned int __a, int __b, vector unsigned int *__c)
+{
+  return vec_stl(vec_perm(__a,
+                          vec_lvlx(__b, __c),
+                          vec_lvsr(__b, (unsigned char *)__c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrxl(vector bool int __a, int __b, vector bool int *__c)
+{
+  return vec_stl(vec_perm(__a,
+                          vec_lvlx(__b, __c),
+                          vec_lvsr(__b, (unsigned char *)__c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrxl(vector float __a, int __b, vector float *__c)
+{
+  return vec_stl(vec_perm(__a,
+                          vec_lvlx(__b, __c),
+                          vec_lvsr(__b, (unsigned char *)__c)),
+                 __b, __c);
+}
+
+/* vec_promote */
+
+static vector signed char __ATTRS_o_ai
+vec_promote(signed char __a, int __b)
+{
+  vector signed char __res = (vector signed char)(0);
+  __res[__b] = __a;
+  return __res;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_promote(unsigned char __a, int __b)
+{
+  vector unsigned char __res = (vector unsigned char)(0);
+  __res[__b] = __a;
+  return __res;
+}
+
+static vector short __ATTRS_o_ai
+vec_promote(short __a, int __b)
+{
+  vector short __res = (vector short)(0);
+  __res[__b] = __a;
+  return __res;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_promote(unsigned short __a, int __b)
+{
+  vector unsigned short __res = (vector unsigned short)(0);
+  __res[__b] = __a;
+  return __res;
+}
+
+static vector int __ATTRS_o_ai
+vec_promote(int __a, int __b)
+{
+  vector int __res = (vector int)(0);
+  __res[__b] = __a;
+  return __res;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_promote(unsigned int __a, int __b)
+{
+  vector unsigned int __res = (vector unsigned int)(0);
+  __res[__b] = __a;
+  return __res;
+}
+
+static vector float __ATTRS_o_ai
+vec_promote(float __a, int __b)
+{
+  vector float __res = (vector float)(0);
+  __res[__b] = __a;
+  return __res;
+}
+
+/* vec_splats */
+
+static vector signed char __ATTRS_o_ai
+vec_splats(signed char __a)
+{
+  return (vector signed char)(__a);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_splats(unsigned char __a)
+{
+  return (vector unsigned char)(__a);
+}
+
+static vector short __ATTRS_o_ai
+vec_splats(short __a)
+{
+  return (vector short)(__a);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_splats(unsigned short __a)
+{
+  return (vector unsigned short)(__a);
+}
+
+static vector int __ATTRS_o_ai
+vec_splats(int __a)
+{
+  return (vector int)(__a);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_splats(unsigned int __a)
+{
+  return (vector unsigned int)(__a);
+}
+
+static vector float __ATTRS_o_ai
+vec_splats(float __a)
+{
+  return (vector float)(__a);
+}
+
+/* ----------------------------- predicates --------------------------------- */
+
+/* vec_all_eq */
+
+static int __ATTRS_o_ai
+vec_all_eq(vector signed char __a, vector signed char __b)
+{
+  return __builtin_altivec_vcmpequb_p(__CR6_LT, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector signed char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpequb_p(__CR6_LT, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector unsigned char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vcmpequb_p(__CR6_LT, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector unsigned char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpequb_p(__CR6_LT, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector bool char __a, vector signed char __b)
+{
+  return __builtin_altivec_vcmpequb_p(__CR6_LT, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector bool char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vcmpequb_p(__CR6_LT, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector bool char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpequb_p(__CR6_LT, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector short __a, vector short __b)
+{
+  return __builtin_altivec_vcmpequh_p(__CR6_LT, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpequh_p(__CR6_LT, __a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector unsigned short __a, vector unsigned short __b)
+{
+  return
+    __builtin_altivec_vcmpequh_p(__CR6_LT, (vector short)__a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector unsigned short __a, vector bool short __b)
+{
+  return
+    __builtin_altivec_vcmpequh_p(__CR6_LT, (vector short)__a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector bool short __a, vector short __b)
+{
+  return
+    __builtin_altivec_vcmpequh_p(__CR6_LT, (vector short)__a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector bool short __a, vector unsigned short __b)
+{
+  return
+    __builtin_altivec_vcmpequh_p(__CR6_LT, (vector short)__a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector bool short __a, vector bool short __b)
+{
+  return
+    __builtin_altivec_vcmpequh_p(__CR6_LT, (vector short)__a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector pixel __a, vector pixel __b)
+{
+  return
+    __builtin_altivec_vcmpequh_p(__CR6_LT, (vector short)__a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector int __a, vector int __b)
+{
+  return __builtin_altivec_vcmpequw_p(__CR6_LT, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpequw_p(__CR6_LT, __a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vcmpequw_p(__CR6_LT, (vector int)__a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector unsigned int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpequw_p(__CR6_LT, (vector int)__a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector bool int __a, vector int __b)
+{
+  return __builtin_altivec_vcmpequw_p(__CR6_LT, (vector int)__a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector bool int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vcmpequw_p(__CR6_LT, (vector int)__a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector bool int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpequw_p(__CR6_LT, (vector int)__a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpeqfp_p(__CR6_LT, __a, __b);
+}
+
+/* vec_all_ge */
+
+static int __ATTRS_o_ai
+vec_all_ge(vector signed char __a, vector signed char __b)
+{
+  return __builtin_altivec_vcmpgtsb_p(__CR6_EQ, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector signed char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpgtsb_p(__CR6_EQ, (vector signed char)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector unsigned char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector unsigned char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ, (vector unsigned char)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector bool char __a, vector signed char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ,
+                                      (vector unsigned char)__b,
+                                      (vector unsigned char)__a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector bool char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ, __b, (vector unsigned char)__a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector bool char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ,
+                                      (vector unsigned char)__b,
+                                      (vector unsigned char)__a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector short __a, vector short __b)
+{
+  return __builtin_altivec_vcmpgtsh_p(__CR6_EQ, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpgtsh_p(__CR6_EQ, (vector short)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector unsigned short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector unsigned short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ, (vector unsigned short)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector bool short __a, vector short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ,
+                                      (vector unsigned short)__b,
+                                      (vector unsigned short)__a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector bool short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ, __b, (vector unsigned short)__a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector bool short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ,
+                                      (vector unsigned short)__b,
+                                      (vector unsigned short)__a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector int __a, vector int __b)
+{
+  return __builtin_altivec_vcmpgtsw_p(__CR6_EQ, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtsw_p(__CR6_EQ, (vector int)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector unsigned int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ, (vector unsigned int)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector bool int __a, vector int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ,
+                                      (vector unsigned int)__b,
+                                      (vector unsigned int)__a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector bool int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ, __b, (vector unsigned int)__a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector bool int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ,
+                                      (vector unsigned int)__b,
+                                      (vector unsigned int)__a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpgefp_p(__CR6_LT, __a, __b);
+}
+
+/* vec_all_gt */
+
+static int __ATTRS_o_ai
+vec_all_gt(vector signed char __a, vector signed char __b)
+{
+  return __builtin_altivec_vcmpgtsb_p(__CR6_LT, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector signed char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpgtsb_p(__CR6_LT, __a, (vector signed char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector unsigned char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector unsigned char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT, __a, (vector unsigned char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector bool char __a, vector signed char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT,
+                                      (vector unsigned char)__a,
+                                      (vector unsigned char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector bool char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT, (vector unsigned char)__a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector bool char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT,
+                                      (vector unsigned char)__a,
+                                      (vector unsigned char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector short __a, vector short __b)
+{
+  return __builtin_altivec_vcmpgtsh_p(__CR6_LT, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpgtsh_p(__CR6_LT, __a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector unsigned short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector unsigned short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT, __a, (vector unsigned short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector bool short __a, vector short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT,
+                                      (vector unsigned short)__a,
+                                      (vector unsigned short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector bool short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT, (vector unsigned short)__a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector bool short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT,
+                                      (vector unsigned short)__a,
+                                      (vector unsigned short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector int __a, vector int __b)
+{
+  return __builtin_altivec_vcmpgtsw_p(__CR6_LT, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtsw_p(__CR6_LT, __a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector unsigned int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT, __a, (vector unsigned int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector bool int __a, vector int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT,
+                                      (vector unsigned int)__a,
+                                      (vector unsigned int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector bool int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT, (vector unsigned int)__a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector bool int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT,
+                                      (vector unsigned int)__a,
+                                      (vector unsigned int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpgtfp_p(__CR6_LT, __a, __b);
+}
+
+/* vec_all_in */
+
+static int __attribute__((__always_inline__))
+vec_all_in(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpbfp_p(__CR6_EQ, __a, __b);
+}
+
+/* vec_all_le */
+
+static int __ATTRS_o_ai
+vec_all_le(vector signed char __a, vector signed char __b)
+{
+  return __builtin_altivec_vcmpgtsb_p(__CR6_EQ, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector signed char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpgtsb_p(__CR6_EQ, __a, (vector signed char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector unsigned char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector unsigned char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ, __a, (vector unsigned char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector bool char __a, vector signed char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ,
+                                      (vector unsigned char)__a,
+                                      (vector unsigned char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector bool char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ, (vector unsigned char)__a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector bool char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ,
+                                      (vector unsigned char)__a,
+                                      (vector unsigned char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector short __a, vector short __b)
+{
+  return __builtin_altivec_vcmpgtsh_p(__CR6_EQ, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpgtsh_p(__CR6_EQ, __a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector unsigned short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector unsigned short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ, __a, (vector unsigned short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector bool short __a, vector short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ,
+                                      (vector unsigned short)__a,
+                                      (vector unsigned short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector bool short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ, (vector unsigned short)__a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector bool short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ,
+                                      (vector unsigned short)__a,
+                                      (vector unsigned short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector int __a, vector int __b)
+{
+  return __builtin_altivec_vcmpgtsw_p(__CR6_EQ, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtsw_p(__CR6_EQ, __a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector unsigned int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ, __a, (vector unsigned int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector bool int __a, vector int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ,
+                                      (vector unsigned int)__a,
+                                      (vector unsigned int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector bool int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ, (vector unsigned int)__a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector bool int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ,
+                                      (vector unsigned int)__a,
+                                      (vector unsigned int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpgefp_p(__CR6_LT, __b, __a);
+}
+
+/* vec_all_lt */
+
+static int __ATTRS_o_ai
+vec_all_lt(vector signed char __a, vector signed char __b)
+{
+  return __builtin_altivec_vcmpgtsb_p(__CR6_LT, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector signed char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpgtsb_p(__CR6_LT, (vector signed char)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector unsigned char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector unsigned char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT, (vector unsigned char)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector bool char __a, vector signed char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT,
+                                      (vector unsigned char)__b,
+                                      (vector unsigned char)__a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector bool char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT, __b, (vector unsigned char)__a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector bool char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT,
+                                      (vector unsigned char)__b,
+                                      (vector unsigned char)__a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector short __a, vector short __b)
+{
+  return __builtin_altivec_vcmpgtsh_p(__CR6_LT, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpgtsh_p(__CR6_LT, (vector short)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector unsigned short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector unsigned short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT, (vector unsigned short)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector bool short __a, vector short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT,
+                                      (vector unsigned short)__b,
+                                      (vector unsigned short)__a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector bool short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT, __b, (vector unsigned short)__a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector bool short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT,
+                                      (vector unsigned short)__b,
+                                      (vector unsigned short)__a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector int __a, vector int __b)
+{
+  return __builtin_altivec_vcmpgtsw_p(__CR6_LT, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtsw_p(__CR6_LT, (vector int)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector unsigned int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT, (vector unsigned int)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector bool int __a, vector int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT,
+                                      (vector unsigned int)__b,
+                                      (vector unsigned int)__a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector bool int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT, __b, (vector unsigned int)__a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector bool int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT,
+                                      (vector unsigned int)__b,
+                                      (vector unsigned int)__a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpgtfp_p(__CR6_LT, __b, __a);
+}
+
+/* vec_all_nan */
+
+static int __attribute__((__always_inline__))
+vec_all_nan(vector float __a)
+{
+  return __builtin_altivec_vcmpeqfp_p(__CR6_EQ, __a, __a);
+}
+
+/* vec_all_ne */
+
+static int __ATTRS_o_ai
+vec_all_ne(vector signed char __a, vector signed char __b)
+{
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector signed char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector unsigned char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector unsigned char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector bool char __a, vector signed char __b)
+{
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector bool char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector bool char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector short __a, vector short __b)
+{
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ, __a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector unsigned short __a, vector unsigned short __b)
+{
+  return
+    __builtin_altivec_vcmpequh_p(__CR6_EQ, (vector short)__a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector unsigned short __a, vector bool short __b)
+{
+  return
+    __builtin_altivec_vcmpequh_p(__CR6_EQ, (vector short)__a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector bool short __a, vector short __b)
+{
+  return
+    __builtin_altivec_vcmpequh_p(__CR6_EQ, (vector short)__a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector bool short __a, vector unsigned short __b)
+{
+  return
+    __builtin_altivec_vcmpequh_p(__CR6_EQ, (vector short)__a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector bool short __a, vector bool short __b)
+{
+  return
+    __builtin_altivec_vcmpequh_p(__CR6_EQ, (vector short)__a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector pixel __a, vector pixel __b)
+{
+  return
+    __builtin_altivec_vcmpequh_p(__CR6_EQ, (vector short)__a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector int __a, vector int __b)
+{
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ, __a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ, (vector int)__a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector unsigned int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ, (vector int)__a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector bool int __a, vector int __b)
+{
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ, (vector int)__a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector bool int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ, (vector int)__a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector bool int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ, (vector int)__a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpeqfp_p(__CR6_EQ, __a, __b);
+}
+
+/* vec_all_nge */
+
+static int __attribute__((__always_inline__))
+vec_all_nge(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpgefp_p(__CR6_EQ, __a, __b);
+}
+
+/* vec_all_ngt */
+
+static int __attribute__((__always_inline__))
+vec_all_ngt(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpgtfp_p(__CR6_EQ, __a, __b);
+}
+
+/* vec_all_nle */
+
+static int __attribute__((__always_inline__))
+vec_all_nle(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpgefp_p(__CR6_EQ, __b, __a);
+}
+
+/* vec_all_nlt */
+
+static int __attribute__((__always_inline__))
+vec_all_nlt(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpgtfp_p(__CR6_EQ, __b, __a);
+}
+
+/* vec_all_numeric */
+
+static int __attribute__((__always_inline__))
+vec_all_numeric(vector float __a)
+{
+  return __builtin_altivec_vcmpeqfp_p(__CR6_LT, __a, __a);
+}
+
+/* vec_any_eq */
+
+static int __ATTRS_o_ai
+vec_any_eq(vector signed char __a, vector signed char __b)
+{
+  return
+    __builtin_altivec_vcmpequb_p(__CR6_EQ_REV, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector signed char __a, vector bool char __b)
+{
+  return
+    __builtin_altivec_vcmpequb_p(__CR6_EQ_REV, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector unsigned char __a, vector unsigned char __b)
+{
+  return
+    __builtin_altivec_vcmpequb_p(__CR6_EQ_REV, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector unsigned char __a, vector bool char __b)
+{
+  return
+    __builtin_altivec_vcmpequb_p(__CR6_EQ_REV, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector bool char __a, vector signed char __b)
+{
+  return
+    __builtin_altivec_vcmpequb_p(__CR6_EQ_REV, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector bool char __a, vector unsigned char __b)
+{
+  return
+    __builtin_altivec_vcmpequb_p(__CR6_EQ_REV, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector bool char __a, vector bool char __b)
+{
+  return
+    __builtin_altivec_vcmpequb_p(__CR6_EQ_REV, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector short __a, vector short __b)
+{
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ_REV, __a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector unsigned short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ_REV, 
+                                      (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector unsigned short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ_REV, 
+                                      (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector bool short __a, vector short __b)
+{
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ_REV,
+                                      (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector bool short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ_REV,
+                                      (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector bool short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ_REV,
+                                      (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector pixel __a, vector pixel __b)
+{
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ_REV, 
+                                      (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector int __a, vector int __b)
+{
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ_REV, __a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector unsigned int __a, vector unsigned int __b)
+{
+  return
+    __builtin_altivec_vcmpequw_p(__CR6_EQ_REV, (vector int)__a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector unsigned int __a, vector bool int __b)
+{
+  return
+    __builtin_altivec_vcmpequw_p(__CR6_EQ_REV, (vector int)__a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector bool int __a, vector int __b)
+{
+  return
+    __builtin_altivec_vcmpequw_p(__CR6_EQ_REV, (vector int)__a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector bool int __a, vector unsigned int __b)
+{
+  return
+    __builtin_altivec_vcmpequw_p(__CR6_EQ_REV, (vector int)__a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector bool int __a, vector bool int __b)
+{
+  return
+    __builtin_altivec_vcmpequw_p(__CR6_EQ_REV, (vector int)__a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpeqfp_p(__CR6_EQ_REV, __a, __b);
+}
+
+/* vec_any_ge */
+
+static int __ATTRS_o_ai
+vec_any_ge(vector signed char __a, vector signed char __b)
+{
+  return __builtin_altivec_vcmpgtsb_p(__CR6_LT_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector signed char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpgtsb_p(__CR6_LT_REV, (vector signed char)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector unsigned char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector unsigned char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV, (vector unsigned char)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector bool char __a, vector signed char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV,
+                                      (vector unsigned char)__b,
+                                      (vector unsigned char)__a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector bool char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV, __b, (vector unsigned char)__a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector bool char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV,
+                                      (vector unsigned char)__b,
+                                      (vector unsigned char)__a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector short __a, vector short __b)
+{
+  return __builtin_altivec_vcmpgtsh_p(__CR6_LT_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpgtsh_p(__CR6_LT_REV, (vector short)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector unsigned short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector unsigned short __a, vector bool short __b)
+{
+  return
+    __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV, (vector unsigned short)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector bool short __a, vector short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV,
+                                      (vector unsigned short)__b,
+                                      (vector unsigned short)__a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector bool short __a, vector unsigned short __b)
+{
+  return 
+    __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV, __b, (vector unsigned short)__a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector bool short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV,
+                                      (vector unsigned short)__b,
+                                      (vector unsigned short)__a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector int __a, vector int __b)
+{
+  return __builtin_altivec_vcmpgtsw_p(__CR6_LT_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtsw_p(__CR6_LT_REV, (vector int)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector unsigned int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV, (vector unsigned int)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector bool int __a, vector int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV,
+                                      (vector unsigned int)__b,
+                                      (vector unsigned int)__a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector bool int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV, __b, (vector unsigned int)__a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector bool int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV,
+                                      (vector unsigned int)__b,
+                                      (vector unsigned int)__a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpgefp_p(__CR6_EQ_REV, __a, __b);
+}
+
+/* vec_any_gt */
+
+static int __ATTRS_o_ai
+vec_any_gt(vector signed char __a, vector signed char __b)
+{
+  return __builtin_altivec_vcmpgtsb_p(__CR6_EQ_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector signed char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpgtsb_p(__CR6_EQ_REV, __a, (vector signed char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector unsigned char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector unsigned char __a, vector bool char __b)
+{
+  return 
+    __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV, __a, (vector unsigned char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector bool char __a, vector signed char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV,
+                                      (vector unsigned char)__a,
+                                      (vector unsigned char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector bool char __a, vector unsigned char __b)
+{
+  return 
+    __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV, (vector unsigned char)__a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector bool char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV,
+                                      (vector unsigned char)__a,
+                                      (vector unsigned char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector short __a, vector short __b)
+{
+  return __builtin_altivec_vcmpgtsh_p(__CR6_EQ_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpgtsh_p(__CR6_EQ_REV, __a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector unsigned short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector unsigned short __a, vector bool short __b)
+{
+  return 
+    __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV, __a, (vector unsigned short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector bool short __a, vector short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV,
+                                      (vector unsigned short)__a,
+                                      (vector unsigned short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector bool short __a, vector unsigned short __b)
+{
+  return
+    __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV, (vector unsigned short)__a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector bool short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV,
+                                      (vector unsigned short)__a,
+                                      (vector unsigned short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector int __a, vector int __b)
+{
+  return __builtin_altivec_vcmpgtsw_p(__CR6_EQ_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtsw_p(__CR6_EQ_REV, __a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector unsigned int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV, __a, (vector unsigned int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector bool int __a, vector int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV,
+                                      (vector unsigned int)__a,
+                                      (vector unsigned int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector bool int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV, (vector unsigned int)__a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector bool int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV,
+                                      (vector unsigned int)__a,
+                                      (vector unsigned int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpgtfp_p(__CR6_EQ_REV, __a, __b);
+}
+
+/* vec_any_le */
+
+static int __ATTRS_o_ai
+vec_any_le(vector signed char __a, vector signed char __b)
+{
+  return __builtin_altivec_vcmpgtsb_p(__CR6_LT_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector signed char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpgtsb_p(__CR6_LT_REV, __a, (vector signed char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector unsigned char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector unsigned char __a, vector bool char __b)
+{
+  return 
+    __builtin_altivec_vcmpgtub_p(__CR6_LT_REV, __a, (vector unsigned char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector bool char __a, vector signed char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV,
+                                      (vector unsigned char)__a,
+                                      (vector unsigned char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector bool char __a, vector unsigned char __b)
+{
+  return 
+    __builtin_altivec_vcmpgtub_p(__CR6_LT_REV, (vector unsigned char)__a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector bool char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV,
+                                      (vector unsigned char)__a,
+                                      (vector unsigned char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector short __a, vector short __b)
+{
+  return __builtin_altivec_vcmpgtsh_p(__CR6_LT_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpgtsh_p(__CR6_LT_REV, __a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector unsigned short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector unsigned short __a, vector bool short __b)
+{
+  return 
+    __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV, __a, (vector unsigned short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector bool short __a, vector short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV,
+                                      (vector unsigned short)__a,
+                                      (vector unsigned short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector bool short __a, vector unsigned short __b)
+{
+  return 
+    __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV, (vector unsigned short)__a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector bool short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV,
+                                      (vector unsigned short)__a,
+                                      (vector unsigned short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector int __a, vector int __b)
+{
+  return __builtin_altivec_vcmpgtsw_p(__CR6_LT_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtsw_p(__CR6_LT_REV, __a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector unsigned int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV, __a, (vector unsigned int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector bool int __a, vector int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV,
+                                      (vector unsigned int)__a,
+                                      (vector unsigned int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector bool int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV, (vector unsigned int)__a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector bool int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV,
+                                      (vector unsigned int)__a,
+                                      (vector unsigned int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpgefp_p(__CR6_EQ_REV, __b, __a);
+}
+
+/* vec_any_lt */
+
+static int __ATTRS_o_ai
+vec_any_lt(vector signed char __a, vector signed char __b)
+{
+  return __builtin_altivec_vcmpgtsb_p(__CR6_EQ_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector signed char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpgtsb_p(__CR6_EQ_REV, (vector signed char)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector unsigned char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector unsigned char __a, vector bool char __b)
+{
+  return 
+    __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV, (vector unsigned char)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector bool char __a, vector signed char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV,
+                                      (vector unsigned char)__b,
+                                      (vector unsigned char)__a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector bool char __a, vector unsigned char __b)
+{
+  return 
+    __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV, __b, (vector unsigned char)__a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector bool char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV,
+                                      (vector unsigned char)__b,
+                                      (vector unsigned char)__a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector short __a, vector short __b)
+{
+  return __builtin_altivec_vcmpgtsh_p(__CR6_EQ_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpgtsh_p(__CR6_EQ_REV, (vector short)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector unsigned short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector unsigned short __a, vector bool short __b)
+{
+  return 
+    __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV, (vector unsigned short)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector bool short __a, vector short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV,
+                                      (vector unsigned short)__b,
+                                      (vector unsigned short)__a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector bool short __a, vector unsigned short __b)
+{
+  return 
+    __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV, __b, (vector unsigned short)__a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector bool short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV,
+                                      (vector unsigned short)__b,
+                                      (vector unsigned short)__a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector int __a, vector int __b)
+{
+  return __builtin_altivec_vcmpgtsw_p(__CR6_EQ_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtsw_p(__CR6_EQ_REV, (vector int)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector unsigned int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV, (vector unsigned int)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector bool int __a, vector int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV,
+                                      (vector unsigned int)__b,
+                                      (vector unsigned int)__a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector bool int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV, __b, (vector unsigned int)__a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector bool int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV,
+                                      (vector unsigned int)__b,
+                                      (vector unsigned int)__a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpgtfp_p(__CR6_EQ_REV, __b, __a);
+}
+
+/* vec_any_nan */
+
+static int __attribute__((__always_inline__))
+vec_any_nan(vector float __a)
+{
+  return __builtin_altivec_vcmpeqfp_p(__CR6_LT_REV, __a, __a);
+}
+
+/* vec_any_ne */
+
+static int __ATTRS_o_ai
+vec_any_ne(vector signed char __a, vector signed char __b)
+{
+  return
+    __builtin_altivec_vcmpequb_p(__CR6_LT_REV, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector signed char __a, vector bool char __b)
+{
+  return
+    __builtin_altivec_vcmpequb_p(__CR6_LT_REV, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector unsigned char __a, vector unsigned char __b)
+{
+  return
+    __builtin_altivec_vcmpequb_p(__CR6_LT_REV, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector unsigned char __a, vector bool char __b)
+{
+  return
+    __builtin_altivec_vcmpequb_p(__CR6_LT_REV, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector bool char __a, vector signed char __b)
+{
+  return
+    __builtin_altivec_vcmpequb_p(__CR6_LT_REV, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector bool char __a, vector unsigned char __b)
+{
+  return
+    __builtin_altivec_vcmpequb_p(__CR6_LT_REV, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector bool char __a, vector bool char __b)
+{
+  return
+    __builtin_altivec_vcmpequb_p(__CR6_LT_REV, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector short __a, vector short __b)
+{
+  return __builtin_altivec_vcmpequh_p(__CR6_LT_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpequh_p(__CR6_LT_REV, __a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector unsigned short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vcmpequh_p(__CR6_LT_REV, 
+                                      (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector unsigned short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpequh_p(__CR6_LT_REV,
+                                      (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector bool short __a, vector short __b)
+{
+  return __builtin_altivec_vcmpequh_p(__CR6_LT_REV,
+                                      (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector bool short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vcmpequh_p(__CR6_LT_REV,
+                                      (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector bool short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpequh_p(__CR6_LT_REV,
+                                      (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector pixel __a, vector pixel __b)
+{
+  return __builtin_altivec_vcmpequh_p(__CR6_LT_REV,
+                                      (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector int __a, vector int __b)
+{
+  return __builtin_altivec_vcmpequw_p(__CR6_LT_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpequw_p(__CR6_LT_REV, __a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector unsigned int __a, vector unsigned int __b)
+{
+  return
+    __builtin_altivec_vcmpequw_p(__CR6_LT_REV, (vector int)__a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector unsigned int __a, vector bool int __b)
+{
+  return
+    __builtin_altivec_vcmpequw_p(__CR6_LT_REV, (vector int)__a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector bool int __a, vector int __b)
+{
+  return
+    __builtin_altivec_vcmpequw_p(__CR6_LT_REV, (vector int)__a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector bool int __a, vector unsigned int __b)
+{
+  return
+    __builtin_altivec_vcmpequw_p(__CR6_LT_REV, (vector int)__a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector bool int __a, vector bool int __b)
+{
+  return
+    __builtin_altivec_vcmpequw_p(__CR6_LT_REV, (vector int)__a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpeqfp_p(__CR6_LT_REV, __a, __b);
+}
+
+/* vec_any_nge */
+
+static int __attribute__((__always_inline__))
+vec_any_nge(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpgefp_p(__CR6_LT_REV, __a, __b);
+}
+
+/* vec_any_ngt */
+
+static int __attribute__((__always_inline__))
+vec_any_ngt(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpgtfp_p(__CR6_LT_REV, __a, __b);
+}
+
+/* vec_any_nle */
+
+static int __attribute__((__always_inline__))
+vec_any_nle(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpgefp_p(__CR6_LT_REV, __b, __a);
+}
+
+/* vec_any_nlt */
+
+static int __attribute__((__always_inline__))
+vec_any_nlt(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpgtfp_p(__CR6_LT_REV, __b, __a);
+}
+
+/* vec_any_numeric */
+
+static int __attribute__((__always_inline__))
+vec_any_numeric(vector float __a)
+{
+  return __builtin_altivec_vcmpeqfp_p(__CR6_EQ_REV, __a, __a);
+}
+
+/* vec_any_out */
+
+static int __attribute__((__always_inline__))
+vec_any_out(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpbfp_p(__CR6_EQ_REV, __a, __b);
+}
+
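+/*
+ * Illustrative usage sketch (not part of the upstream header): the vec_any_*
+ * predicates return a plain int, so they are typically used to steer scalar
+ * control flow around vector code, e.g.
+ *
+ *   vector float v = ...;
+ *   if (vec_any_nan(v)) {
+ *     // at least one lane is NaN; take a scalar fallback path
+ *   }
+ */
+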
+#undef __ATTRS_o_ai
+
+#endif /* __ALTIVEC_H */
diff --git a/22.0.1/clang-include/ammintrin.h b/22.0.1/clang-include/ammintrin.h
new file mode 100644
index 0000000..d87b9cd
--- /dev/null
+++ b/22.0.1/clang-include/ammintrin.h
@@ -0,0 +1,68 @@
+/*===---- ammintrin.h - SSE4a intrinsics -----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __AMMINTRIN_H
+#define __AMMINTRIN_H
+
+#ifndef __SSE4A__
+#error "SSE4A instruction set not enabled"
+#else
+
+#include <pmmintrin.h>
+
+#define _mm_extracti_si64(x, len, idx) \
+  ((__m128i)__builtin_ia32_extrqi((__v2di)(__m128i)(x), \
+                                  (char)(len), (char)(idx)))
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_extract_si64(__m128i __x, __m128i __y)
+{
+  return (__m128i)__builtin_ia32_extrq((__v2di)__x, (__v16qi)__y);
+}
+
+#define _mm_inserti_si64(x, y, len, idx) \
+  ((__m128i)__builtin_ia32_insertqi((__v2di)(__m128i)(x), \
+                                    (__v2di)(__m128i)(y), \
+                                    (char)(len), (char)(idx)))
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_insert_si64(__m128i __x, __m128i __y)
+{
+  return (__m128i)__builtin_ia32_insertq((__v2di)__x, (__v2di)__y);
+}
+
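+/*
+ * Illustrative usage sketch (not part of the upstream header): the immediate
+ * forms above take the bit-field length and starting bit index as the last
+ * two operands, e.g. extracting bits [15:8] of the low quadword:
+ *
+ *   __m128i src = _mm_set_epi64x(0, 0x1122334455667788LL);
+ *   __m128i lo  = _mm_extracti_si64(src, 8, 8);   // len = 8, idx = 8
+ */
+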
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_stream_sd(double *__p, __m128d __a)
+{
+  __builtin_ia32_movntsd(__p, (__v2df)__a);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_stream_ss(float *__p, __m128 __a)
+{
+  __builtin_ia32_movntss(__p, (__v4sf)__a);
+}
+
+#endif /* __SSE4A__ */
+
+#endif /* __AMMINTRIN_H */
diff --git a/22.0.1/clang-include/arm_acle.h b/22.0.1/clang-include/arm_acle.h
new file mode 100644
index 0000000..d706745
--- /dev/null
+++ b/22.0.1/clang-include/arm_acle.h
@@ -0,0 +1,151 @@
+/*===---- arm_acle.h - ARM Non-Neon intrinsics -----------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __ARM_ACLE_H
+#define __ARM_ACLE_H
+
+#ifndef __ARM_ACLE
+#error "ACLE intrinsics support not enabled."
+#endif
+
+#include <stdint.h>
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/* Miscellaneous data-processing intrinsics */
+
+static __inline__ uint32_t __attribute__((always_inline, nodebug))
+  __clz(uint32_t t) {
+  return __builtin_clz(t);
+}
+
+static __inline__ unsigned long __attribute__((always_inline, nodebug))
+  __clzl(unsigned long t) {
+  return __builtin_clzl(t);
+}
+
+static __inline__ uint64_t __attribute__((always_inline, nodebug))
+  __clzll(uint64_t t) {
+#if __SIZEOF_LONG_LONG__ == 8
+  return __builtin_clzll(t);
+#else
+  return __builtin_clzl(t);
+#endif
+}
+
+static __inline__ uint32_t __attribute__((always_inline, nodebug))
+  __rev(uint32_t t) {
+  return __builtin_bswap32(t);
+}
+
+static __inline__ unsigned long __attribute__((always_inline, nodebug))
+  __revl(unsigned long t) {
+#if __SIZEOF_LONG__ == 4
+  return __builtin_bswap32(t);
+#else
+  return __builtin_bswap64(t);
+#endif
+}
+
+static __inline__ uint64_t __attribute__((always_inline, nodebug))
+  __revll(uint64_t t) {
+  return __builtin_bswap64(t);
+}
+
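+/*
+ * Illustrative usage sketch (not part of the upstream header): these wrappers
+ * map directly onto compiler builtins, e.g.
+ *
+ *   uint32_t lz = __clz(0x00ffffffu);   // 8 leading zero bits
+ *   uint32_t sw = __rev(0x11223344u);   // byte-reversed: 0x44332211
+ */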
+
+/*
+ * Saturating intrinsics
+ *
+ * FIXME: Change guard to their corresponding __ARM_FEATURE flag when Q flag
+ * intrinsics are implemented and the flag is enabled.
+ */
+#if __ARM_32BIT_STATE
+#define __ssat(x, y) __builtin_arm_ssat(x, y)
+#define __usat(x, y) __builtin_arm_usat(x, y)
+
+static __inline__ int32_t __attribute__((always_inline, nodebug))
+  __qadd(int32_t t, int32_t v) {
+  return __builtin_arm_qadd(t, v);
+}
+
+static __inline__ int32_t __attribute__((always_inline, nodebug))
+  __qsub(int32_t t, int32_t v) {
+  return __builtin_arm_qsub(t, v);
+}
+
+static __inline__ int32_t __attribute__((always_inline, nodebug))
+__qdbl(int32_t t) {
+  return __builtin_arm_qadd(t, t);
+}
+#endif
+
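+/*
+ * Illustrative usage sketch (not part of the upstream header): on AArch32
+ * targets the saturating helpers clamp instead of wrapping, e.g.
+ *
+ *   int32_t s = __qadd(INT32_MAX, 1);   // saturates to INT32_MAX
+ *   int32_t u = __usat(-5, 8);          // clamped to [0, 255] -> 0
+ */
+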
+/* CRC32 intrinsics */
+#if __ARM_FEATURE_CRC32
+static __inline__ uint32_t __attribute__((always_inline, nodebug))
+  __crc32b(uint32_t a, uint8_t b) {
+  return __builtin_arm_crc32b(a, b);
+}
+
+static __inline__ uint32_t __attribute__((always_inline, nodebug))
+  __crc32h(uint32_t a, uint16_t b) {
+  return __builtin_arm_crc32h(a, b);
+}
+
+static __inline__ uint32_t __attribute__((always_inline, nodebug))
+  __crc32w(uint32_t a, uint32_t b) {
+  return __builtin_arm_crc32w(a, b);
+}
+
+static __inline__ uint32_t __attribute__((always_inline, nodebug))
+  __crc32d(uint32_t a, uint64_t b) {
+  return __builtin_arm_crc32d(a, b);
+}
+
+static __inline__ uint32_t __attribute__((always_inline, nodebug))
+  __crc32cb(uint32_t a, uint8_t b) {
+  return __builtin_arm_crc32cb(a, b);
+}
+
+static __inline__ uint32_t __attribute__((always_inline, nodebug))
+  __crc32ch(uint32_t a, uint16_t b) {
+  return __builtin_arm_crc32ch(a, b);
+}
+
+static __inline__ uint32_t __attribute__((always_inline, nodebug))
+  __crc32cw(uint32_t a, uint32_t b) {
+  return __builtin_arm_crc32cw(a, b);
+}
+
+static __inline__ uint32_t __attribute__((always_inline, nodebug))
+  __crc32cd(uint32_t a, uint64_t b) {
+  return __builtin_arm_crc32cd(a, b);
+}
+#endif
+
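+/*
+ * Illustrative usage sketch (not part of the upstream header): when
+ * __ARM_FEATURE_CRC32 is available, a checksum can be folded one word at a
+ * time (word0/word1 stand in for data read from a buffer), e.g.
+ *
+ *   uint32_t crc = 0xffffffffu;
+ *   crc = __crc32w(crc, word0);
+ *   crc = __crc32w(crc, word1);
+ */
+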
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /* __ARM_ACLE_H */
diff --git a/22.0.1/clang-include/avx2intrin.h b/22.0.1/clang-include/avx2intrin.h
new file mode 100644
index 0000000..394fdfe
--- /dev/null
+++ b/22.0.1/clang-include/avx2intrin.h
@@ -0,0 +1,1234 @@
+/*===---- avx2intrin.h - AVX2 intrinsics -----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <avx2intrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX2INTRIN_H
+#define __AVX2INTRIN_H
+
+/* SSE4 Multiple Packed Sums of Absolute Difference.  */
+#define _mm256_mpsadbw_epu8(X, Y, M) __builtin_ia32_mpsadbw256((X), (Y), (M))
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_abs_epi8(__m256i __a)
+{
+    return (__m256i)__builtin_ia32_pabsb256((__v32qi)__a);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_abs_epi16(__m256i __a)
+{
+    return (__m256i)__builtin_ia32_pabsw256((__v16hi)__a);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_abs_epi32(__m256i __a)
+{
+    return (__m256i)__builtin_ia32_pabsd256((__v8si)__a);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_packs_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_packsswb256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_packs_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_packssdw256((__v8si)__a, (__v8si)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_packus_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_packuswb256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_packus_epi32(__m256i __V1, __m256i __V2)
+{
+  return (__m256i) __builtin_ia32_packusdw256((__v8si)__V1, (__v8si)__V2);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_add_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v32qi)__a + (__v32qi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_add_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v16hi)__a + (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_add_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v8si)__a + (__v8si)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_add_epi64(__m256i __a, __m256i __b)
+{
+  return __a + __b;
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_adds_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_paddsb256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_adds_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_paddsw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_adds_epu8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_paddusb256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_adds_epu16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_paddusw256((__v16hi)__a, (__v16hi)__b);
+}
+
+#define _mm256_alignr_epi8(a, b, n) __extension__ ({ \
+  __m256i __a = (a); \
+  __m256i __b = (b); \
+  (__m256i)__builtin_ia32_palignr256((__v32qi)__a, (__v32qi)__b, (n)); })
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_and_si256(__m256i __a, __m256i __b)
+{
+  return __a & __b;
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_andnot_si256(__m256i __a, __m256i __b)
+{
+  return ~__a & __b;
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_avg_epu8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pavgb256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_avg_epu16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pavgw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M)
+{
+  return (__m256i)__builtin_ia32_pblendvb256((__v32qi)__V1, (__v32qi)__V2,
+                                              (__v32qi)__M);
+}
+
+#define _mm256_blend_epi16(V1, V2, M) __extension__ ({ \
+  __m256i __V1 = (V1); \
+  __m256i __V2 = (V2); \
+  (__m256i)__builtin_shufflevector((__v16hi)__V1, (__v16hi)__V2, \
+                                   (((M) & 0x01) ? 16 : 0), \
+                                   (((M) & 0x02) ? 17 : 1), \
+                                   (((M) & 0x04) ? 18 : 2), \
+                                   (((M) & 0x08) ? 19 : 3), \
+                                   (((M) & 0x10) ? 20 : 4), \
+                                   (((M) & 0x20) ? 21 : 5), \
+                                   (((M) & 0x40) ? 22 : 6), \
+                                   (((M) & 0x80) ? 23 : 7), \
+                                   (((M) & 0x01) ? 24 : 8), \
+                                   (((M) & 0x02) ? 25 : 9), \
+                                   (((M) & 0x04) ? 26 : 10), \
+                                   (((M) & 0x08) ? 27 : 11), \
+                                   (((M) & 0x10) ? 28 : 12), \
+                                   (((M) & 0x20) ? 29 : 13), \
+                                   (((M) & 0x40) ? 30 : 14), \
+                                   (((M) & 0x80) ? 31 : 15)); })
+
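+/*
+ * Illustrative usage sketch (not part of the upstream header): bit i of the
+ * immediate mask selects word i from V2, otherwise from V1, and the 8-bit
+ * mask is reused for both 128-bit lanes, e.g.
+ *
+ *   __m256i r = _mm256_blend_epi16(a, b, 0x0f);  // low 4 words of each lane from b
+ */
+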
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpeq_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v32qi)__a == (__v32qi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpeq_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v16hi)__a == (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpeq_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v8si)__a == (__v8si)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpeq_epi64(__m256i __a, __m256i __b)
+{
+  return (__m256i)(__a == __b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpgt_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v32qi)__a > (__v32qi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpgt_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v16hi)__a > (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpgt_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v8si)__a > (__v8si)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpgt_epi64(__m256i __a, __m256i __b)
+{
+  return (__m256i)(__a > __b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_hadd_epi16(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_phaddw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_hadd_epi32(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_hadds_epi16(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_hsub_epi16(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_phsubw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_hsub_epi32(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_hsubs_epi16(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_maddubs_epi16(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_pmaddubsw256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_madd_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmaddwd256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_max_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmaxsb256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_max_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmaxsw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_max_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmaxsd256((__v8si)__a, (__v8si)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_max_epu8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmaxub256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_max_epu16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmaxuw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_max_epu32(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmaxud256((__v8si)__a, (__v8si)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_min_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pminsb256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_min_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pminsw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_min_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pminsd256((__v8si)__a, (__v8si)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_min_epu8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pminub256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_min_epu16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pminuw256 ((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_min_epu32(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pminud256((__v8si)__a, (__v8si)__b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm256_movemask_epi8(__m256i __a)
+{
+  return __builtin_ia32_pmovmskb256((__v32qi)__a);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cvtepi8_epi16(__m128i __V)
+{
+  return (__m256i)__builtin_ia32_pmovsxbw256((__v16qi)__V);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cvtepi8_epi32(__m128i __V)
+{
+  return (__m256i)__builtin_ia32_pmovsxbd256((__v16qi)__V);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cvtepi8_epi64(__m128i __V)
+{
+  return (__m256i)__builtin_ia32_pmovsxbq256((__v16qi)__V);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cvtepi16_epi32(__m128i __V)
+{
+  return (__m256i)__builtin_ia32_pmovsxwd256((__v8hi)__V);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cvtepi16_epi64(__m128i __V)
+{
+  return (__m256i)__builtin_ia32_pmovsxwq256((__v8hi)__V);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cvtepi32_epi64(__m128i __V)
+{
+  return (__m256i)__builtin_ia32_pmovsxdq256((__v4si)__V);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cvtepu8_epi16(__m128i __V)
+{
+  return (__m256i)__builtin_ia32_pmovzxbw256((__v16qi)__V);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cvtepu8_epi32(__m128i __V)
+{
+  return (__m256i)__builtin_ia32_pmovzxbd256((__v16qi)__V);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cvtepu8_epi64(__m128i __V)
+{
+  return (__m256i)__builtin_ia32_pmovzxbq256((__v16qi)__V);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cvtepu16_epi32(__m128i __V)
+{
+  return (__m256i)__builtin_ia32_pmovzxwd256((__v8hi)__V);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cvtepu16_epi64(__m128i __V)
+{
+  return (__m256i)__builtin_ia32_pmovzxwq256((__v8hi)__V);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cvtepu32_epi64(__m128i __V)
+{
+  return (__m256i)__builtin_ia32_pmovzxdq256((__v4si)__V);
+}
+
+static __inline__  __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_mul_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmuldq256((__v8si)__a, (__v8si)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_mulhrs_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmulhrsw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_mulhi_epu16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmulhuw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_mulhi_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmulhw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_mullo_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v16hi)__a * (__v16hi)__b);
+}
+
+static __inline__  __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_mullo_epi32 (__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v8si)__a * (__v8si)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_mul_epu32(__m256i __a, __m256i __b)
+{
+  return __builtin_ia32_pmuludq256((__v8si)__a, (__v8si)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_or_si256(__m256i __a, __m256i __b)
+{
+  return __a | __b;
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_sad_epu8(__m256i __a, __m256i __b)
+{
+  return __builtin_ia32_psadbw256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_shuffle_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pshufb256((__v32qi)__a, (__v32qi)__b);
+}
+
+#define _mm256_shuffle_epi32(a, imm) __extension__ ({ \
+  __m256i __a = (a); \
+  (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)_mm256_set1_epi32(0), \
+                                   (imm) & 0x3, ((imm) & 0xc) >> 2, \
+                                   ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \
+                                   4 + (((imm) & 0x03) >> 0), \
+                                   4 + (((imm) & 0x0c) >> 2), \
+                                   4 + (((imm) & 0x30) >> 4), \
+                                   4 + (((imm) & 0xc0) >> 6)); })
+
+#define _mm256_shufflehi_epi16(a, imm) __extension__ ({ \
+  __m256i __a = (a); \
+  (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)_mm256_set1_epi16(0), \
+                                   0, 1, 2, 3, \
+                                   4 + (((imm) & 0x03) >> 0), \
+                                   4 + (((imm) & 0x0c) >> 2), \
+                                   4 + (((imm) & 0x30) >> 4), \
+                                   4 + (((imm) & 0xc0) >> 6), \
+                                   8, 9, 10, 11, \
+                                   12 + (((imm) & 0x03) >> 0), \
+                                   12 + (((imm) & 0x0c) >> 2), \
+                                   12 + (((imm) & 0x30) >> 4), \
+                                   12 + (((imm) & 0xc0) >> 6)); })
+
+#define _mm256_shufflelo_epi16(a, imm) __extension__ ({ \
+  __m256i __a = (a); \
+  (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)_mm256_set1_epi16(0), \
+                                   (imm) & 0x3,((imm) & 0xc) >> 2, \
+                                   ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \
+                                   4, 5, 6, 7, \
+                                   8 + (((imm) & 0x03) >> 0), \
+                                   8 + (((imm) & 0x0c) >> 2), \
+                                   8 + (((imm) & 0x30) >> 4), \
+                                   8 + (((imm) & 0xc0) >> 6), \
+                                   12, 13, 14, 15); })
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_sign_epi8(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_psignb256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_sign_epi16(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_psignw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_sign_epi32(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_psignd256((__v8si)__a, (__v8si)__b);
+}
+
+#define _mm256_slli_si256(a, count) __extension__ ({ \
+  __m256i __a = (a); \
+  (__m256i)__builtin_ia32_pslldqi256(__a, (count)*8); })
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_slli_epi16(__m256i __a, int __count)
+{
+  return (__m256i)__builtin_ia32_psllwi256((__v16hi)__a, __count);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_sll_epi16(__m256i __a, __m128i __count)
+{
+  return (__m256i)__builtin_ia32_psllw256((__v16hi)__a, (__v8hi)__count);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_slli_epi32(__m256i __a, int __count)
+{
+  return (__m256i)__builtin_ia32_pslldi256((__v8si)__a, __count);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_sll_epi32(__m256i __a, __m128i __count)
+{
+  return (__m256i)__builtin_ia32_pslld256((__v8si)__a, (__v4si)__count);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_slli_epi64(__m256i __a, int __count)
+{
+  return __builtin_ia32_psllqi256(__a, __count);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_sll_epi64(__m256i __a, __m128i __count)
+{
+  return __builtin_ia32_psllq256(__a, __count);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_srai_epi16(__m256i __a, int __count)
+{
+  return (__m256i)__builtin_ia32_psrawi256((__v16hi)__a, __count);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_sra_epi16(__m256i __a, __m128i __count)
+{
+  return (__m256i)__builtin_ia32_psraw256((__v16hi)__a, (__v8hi)__count);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_srai_epi32(__m256i __a, int __count)
+{
+  return (__m256i)__builtin_ia32_psradi256((__v8si)__a, __count);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_sra_epi32(__m256i __a, __m128i __count)
+{
+  return (__m256i)__builtin_ia32_psrad256((__v8si)__a, (__v4si)__count);
+}
+
+#define _mm256_srli_si256(a, count) __extension__ ({ \
+  __m256i __a = (a); \
+  (__m256i)__builtin_ia32_psrldqi256(__a, (count)*8); })
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_srli_epi16(__m256i __a, int __count)
+{
+  return (__m256i)__builtin_ia32_psrlwi256((__v16hi)__a, __count);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_srl_epi16(__m256i __a, __m128i __count)
+{
+  return (__m256i)__builtin_ia32_psrlw256((__v16hi)__a, (__v8hi)__count);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_srli_epi32(__m256i __a, int __count)
+{
+  return (__m256i)__builtin_ia32_psrldi256((__v8si)__a, __count);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_srl_epi32(__m256i __a, __m128i __count)
+{
+  return (__m256i)__builtin_ia32_psrld256((__v8si)__a, (__v4si)__count);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_srli_epi64(__m256i __a, int __count)
+{
+  return __builtin_ia32_psrlqi256(__a, __count);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_srl_epi64(__m256i __a, __m128i __count)
+{
+  return __builtin_ia32_psrlq256(__a, __count);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_sub_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v32qi)__a - (__v32qi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_sub_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v16hi)__a - (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_sub_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v8si)__a - (__v8si)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_sub_epi64(__m256i __a, __m256i __b)
+{
+  return __a - __b;
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_subs_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_psubsb256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_subs_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_psubsw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_subs_epu8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_psubusb256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_subs_epu16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_psubusw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_unpackhi_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 8, 32+8, 9, 32+9, 10, 32+10, 11, 32+11, 12, 32+12, 13, 32+13, 14, 32+14, 15, 32+15, 24, 32+24, 25, 32+25, 26, 32+26, 27, 32+27, 28, 32+28, 29, 32+29, 30, 32+30, 31, 32+31);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_unpackhi_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_unpackhi_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 2, 8+2, 3, 8+3, 6, 8+6, 7, 8+7);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_unpackhi_epi64(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_shufflevector(__a, __b, 1, 4+1, 3, 4+3);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_unpacklo_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 0, 32+0, 1, 32+1, 2, 32+2, 3, 32+3, 4, 32+4, 5, 32+5, 6, 32+6, 7, 32+7, 16, 32+16, 17, 32+17, 18, 32+18, 19, 32+19, 20, 32+20, 21, 32+21, 22, 32+22, 23, 32+23);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_unpacklo_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_unpacklo_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 0, 8+0, 1, 8+1, 4, 8+4, 5, 8+5);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_unpacklo_epi64(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_shufflevector(__a, __b, 0, 4+0, 2, 4+2);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_xor_si256(__m256i __a, __m256i __b)
+{
+  return __a ^ __b;
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_stream_load_si256(__m256i *__V)
+{
+  return (__m256i)__builtin_ia32_movntdqa256((__v4di *)__V);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_broadcastss_ps(__m128 __X)
+{
+  return (__m128)__builtin_ia32_vbroadcastss_ps((__v4sf)__X);
+}
+
+static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_broadcastss_ps(__m128 __X)
+{
+  return (__m256)__builtin_ia32_vbroadcastss_ps256((__v4sf)__X);
+}
+
+static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_broadcastsd_pd(__m128d __X)
+{
+  return (__m256d)__builtin_ia32_vbroadcastsd_pd256((__v2df)__X);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_broadcastsi128_si256(__m128i __X)
+{
+  return (__m256i)__builtin_ia32_vbroadcastsi256(__X);
+}
+
+#define _mm_blend_epi32(V1, V2, M) __extension__ ({ \
+  __m128i __V1 = (V1); \
+  __m128i __V2 = (V2); \
+  (__m128i)__builtin_shufflevector((__v4si)__V1, (__v4si)__V2, \
+                                   (((M) & 0x01) ? 4 : 0), \
+                                   (((M) & 0x02) ? 5 : 1), \
+                                   (((M) & 0x04) ? 6 : 2), \
+                                   (((M) & 0x08) ? 7 : 3)); })
+
+#define _mm256_blend_epi32(V1, V2, M) __extension__ ({ \
+  __m256i __V1 = (V1); \
+  __m256i __V2 = (V2); \
+  (__m256i)__builtin_shufflevector((__v8si)__V1, (__v8si)__V2, \
+                                   (((M) & 0x01) ?  8 : 0), \
+                                   (((M) & 0x02) ?  9 : 1), \
+                                   (((M) & 0x04) ? 10 : 2), \
+                                   (((M) & 0x08) ? 11 : 3), \
+                                   (((M) & 0x10) ? 12 : 4), \
+                                   (((M) & 0x20) ? 13 : 5), \
+                                   (((M) & 0x40) ? 14 : 6), \
+                                   (((M) & 0x80) ? 15 : 7)); })
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_broadcastb_epi8(__m128i __X)
+{
+  return (__m256i)__builtin_ia32_pbroadcastb256((__v16qi)__X);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_broadcastw_epi16(__m128i __X)
+{
+  return (__m256i)__builtin_ia32_pbroadcastw256((__v8hi)__X);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_broadcastd_epi32(__m128i __X)
+{
+  return (__m256i)__builtin_ia32_pbroadcastd256((__v4si)__X);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_broadcastq_epi64(__m128i __X)
+{
+  return (__m256i)__builtin_ia32_pbroadcastq256(__X);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_broadcastb_epi8(__m128i __X)
+{
+  return (__m128i)__builtin_ia32_pbroadcastb128((__v16qi)__X);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_broadcastw_epi16(__m128i __X)
+{
+  return (__m128i)__builtin_ia32_pbroadcastw128((__v8hi)__X);
+}
+
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_broadcastd_epi32(__m128i __X)
+{
+  return (__m128i)__builtin_ia32_pbroadcastd128((__v4si)__X);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_broadcastq_epi64(__m128i __X)
+{
+  return (__m128i)__builtin_ia32_pbroadcastq128(__X);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_permutevar8x32_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_permvarsi256((__v8si)__a, (__v8si)__b);
+}
+
+#define _mm256_permute4x64_pd(V, M) __extension__ ({ \
+  __m256d __V = (V); \
+  (__m256d)__builtin_shufflevector((__v4df)__V, (__v4df) _mm256_setzero_pd(), \
+                                   (M) & 0x3, ((M) & 0xc) >> 2, \
+                                   ((M) & 0x30) >> 4, ((M) & 0xc0) >> 6); })
+
+static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_permutevar8x32_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)__builtin_ia32_permvarsf256((__v8sf)__a, (__v8sf)__b);
+}
+
+#define _mm256_permute4x64_epi64(V, M) __extension__ ({ \
+  __m256i __V = (V); \
+  (__m256i)__builtin_shufflevector((__v4di)__V, (__v4di) _mm256_setzero_si256(), \
+                                   (M) & 0x3, ((M) & 0xc) >> 2, \
+                                   ((M) & 0x30) >> 4, ((M) & 0xc0) >> 6); })
+
+#define _mm256_permute2x128_si256(V1, V2, M) __extension__ ({ \
+  __m256i __V1 = (V1); \
+  __m256i __V2 = (V2); \
+  (__m256i)__builtin_ia32_permti256(__V1, __V2, (M)); })
+
+#define _mm256_extracti128_si256(A, O) __extension__ ({ \
+  __m256i __A = (A); \
+  (__m128i)__builtin_ia32_extract128i256(__A, (O)); })
+
+#define _mm256_inserti128_si256(V1, V2, O) __extension__ ({ \
+  __m256i __V1 = (V1); \
+  __m128i __V2 = (V2); \
+  (__m256i)__builtin_ia32_insert128i256(__V1, __V2, (O)); })
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_maskload_epi32(int const *__X, __m256i __M)
+{
+  return (__m256i)__builtin_ia32_maskloadd256((const __v8si *)__X, (__v8si)__M);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_maskload_epi64(long long const *__X, __m256i __M)
+{
+  return (__m256i)__builtin_ia32_maskloadq256((const __v4di *)__X, __M);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_maskload_epi32(int const *__X, __m128i __M)
+{
+  return (__m128i)__builtin_ia32_maskloadd((const __v4si *)__X, (__v4si)__M);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_maskload_epi64(long long const *__X, __m128i __M)
+{
+  return (__m128i)__builtin_ia32_maskloadq((const __v2di *)__X, (__v2di)__M);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm256_maskstore_epi32(int *__X, __m256i __M, __m256i __Y)
+{
+  __builtin_ia32_maskstored256((__v8si *)__X, (__v8si)__M, (__v8si)__Y);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm256_maskstore_epi64(long long *__X, __m256i __M, __m256i __Y)
+{
+  __builtin_ia32_maskstoreq256((__v4di *)__X, __M, __Y);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_maskstore_epi32(int *__X, __m128i __M, __m128i __Y)
+{
+  __builtin_ia32_maskstored((__v4si *)__X, (__v4si)__M, (__v4si)__Y);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y)
+{
+  __builtin_ia32_maskstoreq(( __v2di *)__X, __M, __Y);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_sllv_epi32(__m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_psllv8si((__v8si)__X, (__v8si)__Y);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sllv_epi32(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_psllv4si((__v4si)__X, (__v4si)__Y);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_sllv_epi64(__m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_psllv4di(__X, __Y);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sllv_epi64(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_psllv2di(__X, __Y);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_srav_epi32(__m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_psrav8si((__v8si)__X, (__v8si)__Y);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_srav_epi32(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_psrav4si((__v4si)__X, (__v4si)__Y);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_srlv_epi32(__m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_psrlv8si((__v8si)__X, (__v8si)__Y);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_srlv_epi32(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_psrlv4si((__v4si)__X, (__v4si)__Y);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_srlv_epi64(__m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_psrlv4di(__X, __Y);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_srlv_epi64(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_psrlv2di(__X, __Y);
+}
+
+#define _mm_mask_i32gather_pd(a, m, i, mask, s) __extension__ ({ \
+  __m128d __a = (a); \
+  double const *__m = (m); \
+  __m128i __i = (i); \
+  __m128d __mask = (mask); \
+  (__m128d)__builtin_ia32_gatherd_pd((__v2df)__a, (const __v2df *)__m, \
+             (__v4si)__i, (__v2df)__mask, (s)); })
+
+#define _mm256_mask_i32gather_pd(a, m, i, mask, s) __extension__ ({ \
+  __m256d __a = (a); \
+  double const *__m = (m); \
+  __m128i __i = (i); \
+  __m256d __mask = (mask); \
+  (__m256d)__builtin_ia32_gatherd_pd256((__v4df)__a, (const __v4df *)__m, \
+             (__v4si)__i, (__v4df)__mask, (s)); })
+
+#define _mm_mask_i64gather_pd(a, m, i, mask, s) __extension__ ({ \
+  __m128d __a = (a); \
+  double const *__m = (m); \
+  __m128i __i = (i); \
+  __m128d __mask = (mask); \
+  (__m128d)__builtin_ia32_gatherq_pd((__v2df)__a, (const __v2df *)__m, \
+             (__v2di)__i, (__v2df)__mask, (s)); })
+
+#define _mm256_mask_i64gather_pd(a, m, i, mask, s) __extension__ ({ \
+  __m256d __a = (a); \
+  double const *__m = (m); \
+  __m256i __i = (i); \
+  __m256d __mask = (mask); \
+  (__m256d)__builtin_ia32_gatherq_pd256((__v4df)__a, (const __v4df *)__m, \
+             (__v4di)__i, (__v4df)__mask, (s)); })
+
+#define _mm_mask_i32gather_ps(a, m, i, mask, s) __extension__ ({ \
+  __m128 __a = (a); \
+  float const *__m = (m); \
+  __m128i __i = (i); \
+  __m128 __mask = (mask); \
+  (__m128)__builtin_ia32_gatherd_ps((__v4sf)__a, (const __v4sf *)__m, \
+            (__v4si)__i, (__v4sf)__mask, (s)); })
+
+#define _mm256_mask_i32gather_ps(a, m, i, mask, s) __extension__ ({ \
+  __m256 __a = (a); \
+  float const *__m = (m); \
+  __m256i __i = (i); \
+  __m256 __mask = (mask); \
+  (__m256)__builtin_ia32_gatherd_ps256((__v8sf)__a, (const __v8sf *)__m, \
+            (__v8si)__i, (__v8sf)__mask, (s)); })
+
+#define _mm_mask_i64gather_ps(a, m, i, mask, s) __extension__ ({ \
+  __m128 __a = (a); \
+  float const *__m = (m); \
+  __m128i __i = (i); \
+  __m128 __mask = (mask); \
+  (__m128)__builtin_ia32_gatherq_ps((__v4sf)__a, (const __v4sf *)__m, \
+            (__v2di)__i, (__v4sf)__mask, (s)); })
+
+#define _mm256_mask_i64gather_ps(a, m, i, mask, s) __extension__ ({ \
+  __m128 __a = (a); \
+  float const *__m = (m); \
+  __m256i __i = (i); \
+  __m128 __mask = (mask); \
+  (__m128)__builtin_ia32_gatherq_ps256((__v4sf)__a, (const __v4sf *)__m, \
+            (__v4di)__i, (__v4sf)__mask, (s)); })
+
+#define _mm_mask_i32gather_epi32(a, m, i, mask, s) __extension__ ({ \
+  __m128i __a = (a); \
+  int const *__m = (m); \
+  __m128i __i = (i); \
+  __m128i __mask = (mask); \
+  (__m128i)__builtin_ia32_gatherd_d((__v4si)__a, (const __v4si *)__m, \
+            (__v4si)__i, (__v4si)__mask, (s)); })
+
+#define _mm256_mask_i32gather_epi32(a, m, i, mask, s) __extension__ ({ \
+  __m256i __a = (a); \
+  int const *__m = (m); \
+  __m256i __i = (i); \
+  __m256i __mask = (mask); \
+  (__m256i)__builtin_ia32_gatherd_d256((__v8si)__a, (const __v8si *)__m, \
+            (__v8si)__i, (__v8si)__mask, (s)); })
+
+#define _mm_mask_i64gather_epi32(a, m, i, mask, s) __extension__ ({ \
+  __m128i __a = (a); \
+  int const *__m = (m); \
+  __m128i __i = (i); \
+  __m128i __mask = (mask); \
+  (__m128i)__builtin_ia32_gatherq_d((__v4si)__a, (const __v4si *)__m, \
+            (__v2di)__i, (__v4si)__mask, (s)); })
+
+#define _mm256_mask_i64gather_epi32(a, m, i, mask, s) __extension__ ({ \
+  __m128i __a = (a); \
+  int const *__m = (m); \
+  __m256i __i = (i); \
+  __m128i __mask = (mask); \
+  (__m128i)__builtin_ia32_gatherq_d256((__v4si)__a, (const __v4si *)__m, \
+            (__v4di)__i, (__v4si)__mask, (s)); })
+
+#define _mm_mask_i32gather_epi64(a, m, i, mask, s) __extension__ ({ \
+  __m128i __a = (a); \
+  long long const *__m = (m); \
+  __m128i __i = (i); \
+  __m128i __mask = (mask); \
+  (__m128i)__builtin_ia32_gatherd_q((__v2di)__a, (const __v2di *)__m, \
+             (__v4si)__i, (__v2di)__mask, (s)); })
+
+#define _mm256_mask_i32gather_epi64(a, m, i, mask, s) __extension__ ({ \
+  __m256i __a = (a); \
+  long long const *__m = (m); \
+  __m128i __i = (i); \
+  __m256i __mask = (mask); \
+  (__m256i)__builtin_ia32_gatherd_q256((__v4di)__a, (const __v4di *)__m, \
+             (__v4si)__i, (__v4di)__mask, (s)); })
+
+#define _mm_mask_i64gather_epi64(a, m, i, mask, s) __extension__ ({ \
+  __m128i __a = (a); \
+  long long const *__m = (m); \
+  __m128i __i = (i); \
+  __m128i __mask = (mask); \
+  (__m128i)__builtin_ia32_gatherq_q((__v2di)__a, (const __v2di *)__m, \
+             (__v2di)__i, (__v2di)__mask, (s)); })
+
+#define _mm256_mask_i64gather_epi64(a, m, i, mask, s) __extension__ ({ \
+  __m256i __a = (a); \
+  long long const *__m = (m); \
+  __m256i __i = (i); \
+  __m256i __mask = (mask); \
+  (__m256i)__builtin_ia32_gatherq_q256((__v4di)__a, (const __v4di *)__m, \
+             (__v4di)__i, (__v4di)__mask, (s)); })
+
+#define _mm_i32gather_pd(m, i, s) __extension__ ({ \
+  double const *__m = (m); \
+  __m128i __i = (i); \
+  (__m128d)__builtin_ia32_gatherd_pd((__v2df)_mm_setzero_pd(), \
+             (const __v2df *)__m, (__v4si)__i, \
+             (__v2df)_mm_set1_pd((double)(long long int)-1), (s)); })
+
+#define _mm256_i32gather_pd(m, i, s) __extension__ ({ \
+  double const *__m = (m); \
+  __m128i __i = (i); \
+  (__m256d)__builtin_ia32_gatherd_pd256((__v4df)_mm256_setzero_pd(), \
+             (const __v4df *)__m, (__v4si)__i, \
+             (__v4df)_mm256_set1_pd((double)(long long int)-1), (s)); })
+
+#define _mm_i64gather_pd(m, i, s) __extension__ ({ \
+  double const *__m = (m); \
+  __m128i __i = (i); \
+  (__m128d)__builtin_ia32_gatherq_pd((__v2df)_mm_setzero_pd(), \
+             (const __v2df *)__m, (__v2di)__i, \
+             (__v2df)_mm_set1_pd((double)(long long int)-1), (s)); })
+
+#define _mm256_i64gather_pd(m, i, s) __extension__ ({ \
+  double const *__m = (m); \
+  __m256i __i = (i); \
+  (__m256d)__builtin_ia32_gatherq_pd256((__v4df)_mm256_setzero_pd(), \
+             (const __v4df *)__m, (__v4di)__i, \
+             (__v4df)_mm256_set1_pd((double)(long long int)-1), (s)); })
+
+#define _mm_i32gather_ps(m, i, s) __extension__ ({ \
+  float const *__m = (m); \
+  __m128i __i = (i); \
+  (__m128)__builtin_ia32_gatherd_ps((__v4sf)_mm_setzero_ps(), \
+             (const __v4sf *)__m, (__v4si)__i, \
+             (__v4sf)_mm_set1_ps((float)(int)-1), (s)); })
+
+#define _mm256_i32gather_ps(m, i, s) __extension__ ({ \
+  float const *__m = (m); \
+  __m256i __i = (i); \
+  (__m256)__builtin_ia32_gatherd_ps256((__v8sf)_mm256_setzero_ps(), \
+             (const __v8sf *)__m, (__v8si)__i, \
+             (__v8sf)_mm256_set1_ps((float)(int)-1), (s)); })
+
+#define _mm_i64gather_ps(m, i, s) __extension__ ({ \
+  float const *__m = (m); \
+  __m128i __i = (i); \
+  (__m128)__builtin_ia32_gatherq_ps((__v4sf)_mm_setzero_ps(), \
+             (const __v4sf *)__m, (__v2di)__i, \
+             (__v4sf)_mm_set1_ps((float)(int)-1), (s)); })
+
+#define _mm256_i64gather_ps(m, i, s) __extension__ ({ \
+  float const *__m = (m); \
+  __m256i __i = (i); \
+  (__m128)__builtin_ia32_gatherq_ps256((__v4sf)_mm_setzero_ps(), \
+             (const __v4sf *)__m, (__v4di)__i, \
+             (__v4sf)_mm_set1_ps((float)(int)-1), (s)); })
+
+#define _mm_i32gather_epi32(m, i, s) __extension__ ({ \
+  int const *__m = (m); \
+  __m128i __i = (i); \
+  (__m128i)__builtin_ia32_gatherd_d((__v4si)_mm_setzero_si128(), \
+            (const __v4si *)__m, (__v4si)__i, \
+            (__v4si)_mm_set1_epi32(-1), (s)); })
+
+#define _mm256_i32gather_epi32(m, i, s) __extension__ ({ \
+  int const *__m = (m); \
+  __m256i __i = (i); \
+  (__m256i)__builtin_ia32_gatherd_d256((__v8si)_mm256_setzero_si256(), \
+            (const __v8si *)__m, (__v8si)__i, \
+            (__v8si)_mm256_set1_epi32(-1), (s)); })
+
+#define _mm_i64gather_epi32(m, i, s) __extension__ ({ \
+  int const *__m = (m); \
+  __m128i __i = (i); \
+  (__m128i)__builtin_ia32_gatherq_d((__v4si)_mm_setzero_si128(), \
+            (const __v4si *)__m, (__v2di)__i, \
+            (__v4si)_mm_set1_epi32(-1), (s)); })
+
+#define _mm256_i64gather_epi32(m, i, s) __extension__ ({ \
+  int const *__m = (m); \
+  __m256i __i = (i); \
+  (__m128i)__builtin_ia32_gatherq_d256((__v4si)_mm_setzero_si128(), \
+            (const __v4si *)__m, (__v4di)__i, \
+            (__v4si)_mm_set1_epi32(-1), (s)); })
+
+#define _mm_i32gather_epi64(m, i, s) __extension__ ({ \
+  long long const *__m = (m); \
+  __m128i __i = (i); \
+  (__m128i)__builtin_ia32_gatherd_q((__v2di)_mm_setzero_si128(), \
+             (const __v2di *)__m, (__v4si)__i, \
+             (__v2di)_mm_set1_epi64x(-1), (s)); })
+
+#define _mm256_i32gather_epi64(m, i, s) __extension__ ({ \
+  long long const *__m = (m); \
+  __m128i __i = (i); \
+  (__m256i)__builtin_ia32_gatherd_q256((__v4di)_mm256_setzero_si256(), \
+             (const __v4di *)__m, (__v4si)__i, \
+             (__v4di)_mm256_set1_epi64x(-1), (s)); })
+
+#define _mm_i64gather_epi64(m, i, s) __extension__ ({ \
+  long long const *__m = (m); \
+  __m128i __i = (i); \
+  (__m128i)__builtin_ia32_gatherq_q((__v2di)_mm_setzero_si128(), \
+             (const __v2di *)__m, (__v2di)__i, \
+             (__v2di)_mm_set1_epi64x(-1), (s)); })
+
+#define _mm256_i64gather_epi64(m, i, s) __extension__ ({ \
+  long long const *__m = (m); \
+  __m256i __i = (i); \
+  (__m256i)__builtin_ia32_gatherq_q256((__v4di)_mm256_setzero_si256(), \
+             (const __v4di *)__m, (__v4di)__i, \
+             (__v4di)_mm256_set1_epi64x(-1), (s)); })
+
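+/* Illustrative usage sketch (an editorial addition, not part of the upstream
+ * header): the gather macros above take a base pointer, an index vector, and
+ * a byte scale that must be a constant 1, 2, 4, or 8.  Loading four ints at
+ * arbitrary indices might look like:
+ *
+ *   int table[16] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
+ *   __m128i idx = _mm_setr_epi32(0, 5, 2, 7);
+ *   __m128i v   = _mm_i32gather_epi32(table, idx, 4);  // {0, 5, 2, 7}
+ */
+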
+#endif /* __AVX2INTRIN_H */
diff --git a/22.0.1/clang-include/avxintrin.h b/22.0.1/clang-include/avxintrin.h
new file mode 100644
index 0000000..4e1044a
--- /dev/null
+++ b/22.0.1/clang-include/avxintrin.h
@@ -0,0 +1,1239 @@
+/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVXINTRIN_H
+#define __AVXINTRIN_H
+
+typedef double __v4df __attribute__ ((__vector_size__ (32)));
+typedef float __v8sf __attribute__ ((__vector_size__ (32)));
+typedef long long __v4di __attribute__ ((__vector_size__ (32)));
+typedef int __v8si __attribute__ ((__vector_size__ (32)));
+typedef short __v16hi __attribute__ ((__vector_size__ (32)));
+typedef char __v32qi __attribute__ ((__vector_size__ (32)));
+
+typedef float __m256 __attribute__ ((__vector_size__ (32)));
+typedef double __m256d __attribute__((__vector_size__(32)));
+typedef long long __m256i __attribute__((__vector_size__(32)));
+
+/* Arithmetic */
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_add_pd(__m256d __a, __m256d __b)
+{
+  return __a+__b;
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_add_ps(__m256 __a, __m256 __b)
+{
+  return __a+__b;
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_sub_pd(__m256d __a, __m256d __b)
+{
+  return __a-__b;
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_sub_ps(__m256 __a, __m256 __b)
+{
+  return __a-__b;
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_addsub_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b);
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_addsub_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b);
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_div_pd(__m256d __a, __m256d __b)
+{
+  return __a / __b;
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_div_ps(__m256 __a, __m256 __b)
+{
+  return __a / __b;
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_max_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b);
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_max_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b);
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_min_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b);
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_min_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b);
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_mul_pd(__m256d __a, __m256d __b)
+{
+  return __a * __b;
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_mul_ps(__m256 __a, __m256 __b)
+{
+  return __a * __b;
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_sqrt_pd(__m256d __a)
+{
+  return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_sqrt_ps(__m256 __a)
+{
+  return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_rsqrt_ps(__m256 __a)
+{
+  return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a);
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_rcp_ps(__m256 __a)
+{
+  return (__m256)__builtin_ia32_rcpps256((__v8sf)__a);
+}
+
+#define _mm256_round_pd(V, M) __extension__ ({ \
+    __m256d __V = (V); \
+    (__m256d)__builtin_ia32_roundpd256((__v4df)__V, (M)); })
+
+#define _mm256_round_ps(V, M) __extension__ ({ \
+  __m256 __V = (V); \
+  (__m256)__builtin_ia32_roundps256((__v8sf)__V, (M)); })
+
+#define _mm256_ceil_pd(V)  _mm256_round_pd((V), _MM_FROUND_CEIL)
+#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)
+#define _mm256_ceil_ps(V)  _mm256_round_ps((V), _MM_FROUND_CEIL)
+#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)
+
+/* Logical */
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_and_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)((__v4di)__a & (__v4di)__b);
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_and_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)((__v8si)__a & (__v8si)__b);
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_andnot_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)(~(__v4di)__a & (__v4di)__b);
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_andnot_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)(~(__v8si)__a & (__v8si)__b);
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_or_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)((__v4di)__a | (__v4di)__b);
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_or_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)((__v8si)__a | (__v8si)__b);
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_xor_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)((__v4di)__a ^ (__v4di)__b);
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_xor_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)((__v8si)__a ^ (__v8si)__b);
+}
+
+/* Horizontal arithmetic */
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_hadd_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b);
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_hadd_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_hsub_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_hsub_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b);
+}
+
+/* Vector permutations */
+static __inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_permutevar_pd(__m128d __a, __m128i __c)
+{
+  return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c);
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_permutevar_pd(__m256d __a, __m256i __c)
+{
+  return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c);
+}
+
+static __inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_permutevar_ps(__m128 __a, __m128i __c)
+{
+  return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c);
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_permutevar_ps(__m256 __a, __m256i __c)
+{
+  return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a,
+						  (__v8si)__c);
+}
+
+#define _mm_permute_pd(A, C) __extension__ ({ \
+  __m128d __A = (A); \
+  (__m128d)__builtin_shufflevector((__v2df)__A, (__v2df) _mm_setzero_pd(), \
+                                   (C) & 0x1, ((C) & 0x2) >> 1); })
+
+#define _mm256_permute_pd(A, C) __extension__ ({ \
+  __m256d __A = (A); \
+  (__m256d)__builtin_shufflevector((__v4df)__A, (__v4df) _mm256_setzero_pd(), \
+                                   (C) & 0x1, ((C) & 0x2) >> 1, \
+                                   2 + (((C) & 0x4) >> 2), \
+                                   2 + (((C) & 0x8) >> 3)); })
+
+#define _mm_permute_ps(A, C) __extension__ ({ \
+  __m128 __A = (A); \
+  (__m128)__builtin_shufflevector((__v4sf)__A, (__v4sf) _mm_setzero_ps(), \
+                                   (C) & 0x3, ((C) & 0xc) >> 2, \
+                                   ((C) & 0x30) >> 4, ((C) & 0xc0) >> 6); })
+
+#define _mm256_permute_ps(A, C) __extension__ ({ \
+  __m256 __A = (A); \
+  (__m256)__builtin_shufflevector((__v8sf)__A, (__v8sf) _mm256_setzero_ps(), \
+                                  (C) & 0x3, ((C) & 0xc) >> 2, \
+                                  ((C) & 0x30) >> 4, ((C) & 0xc0) >> 6, \
+                                  4 + (((C) & 0x03) >> 0), \
+                                  4 + (((C) & 0x0c) >> 2), \
+                                  4 + (((C) & 0x30) >> 4), \
+                                  4 + (((C) & 0xc0) >> 6)); })
+
+#define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ \
+  __m256d __V1 = (V1); \
+  __m256d __V2 = (V2); \
+  (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)__V1, (__v4df)__V2, (M)); })
+
+#define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \
+  __m256 __V1 = (V1); \
+  __m256 __V2 = (V2); \
+  (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)__V1, (__v8sf)__V2, (M)); })
+
+#define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \
+  __m256i __V1 = (V1); \
+  __m256i __V2 = (V2); \
+  (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)__V1, (__v8si)__V2, (M)); })
+
+/* Vector Blend */
+#define _mm256_blend_pd(V1, V2, M) __extension__ ({ \
+  __m256d __V1 = (V1); \
+  __m256d __V2 = (V2); \
+  (__m256d)__builtin_shufflevector((__v4df)__V1, (__v4df)__V2, \
+                                   (((M) & 0x01) ? 4 : 0), \
+                                   (((M) & 0x02) ? 5 : 1), \
+                                   (((M) & 0x04) ? 6 : 2), \
+                                   (((M) & 0x08) ? 7 : 3)); })
+
+#define _mm256_blend_ps(V1, V2, M) __extension__ ({ \
+  __m256 __V1 = (V1); \
+  __m256 __V2 = (V2); \
+  (__m256)__builtin_shufflevector((__v8sf)__V1, (__v8sf)__V2, \
+                                  (((M) & 0x01) ?  8 : 0), \
+                                  (((M) & 0x02) ?  9 : 1), \
+                                  (((M) & 0x04) ? 10 : 2), \
+                                  (((M) & 0x08) ? 11 : 3), \
+                                  (((M) & 0x10) ? 12 : 4), \
+                                  (((M) & 0x20) ? 13 : 5), \
+                                  (((M) & 0x40) ? 14 : 6), \
+                                  (((M) & 0x80) ? 15 : 7)); })
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
+{
+  return (__m256d)__builtin_ia32_blendvpd256(
+    (__v4df)__a, (__v4df)__b, (__v4df)__c);
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
+{
+  return (__m256)__builtin_ia32_blendvps256(
+    (__v8sf)__a, (__v8sf)__b, (__v8sf)__c);
+}
+
+/* Vector Dot Product */
+#define _mm256_dp_ps(V1, V2, M) __extension__ ({ \
+  __m256 __V1 = (V1); \
+  __m256 __V2 = (V2); \
+  (__m256)__builtin_ia32_dpps256((__v8sf)__V1, (__v8sf)__V2, (M)); })
+
+/* Vector shuffle */
+#define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \
+        __m256 __a = (a); \
+        __m256 __b = (b); \
+        (__m256)__builtin_shufflevector((__v8sf)__a, (__v8sf)__b, \
+        (mask) & 0x3,                ((mask) & 0xc) >> 2, \
+        (((mask) & 0x30) >> 4) + 8,  (((mask) & 0xc0) >> 6) + 8, \
+        ((mask) & 0x3) + 4,          (((mask) & 0xc) >> 2) + 4, \
+        (((mask) & 0x30) >> 4) + 12, (((mask) & 0xc0) >> 6) + 12); })
+
+#define _mm256_shuffle_pd(a, b, mask) __extension__ ({ \
+        __m256d __a = (a); \
+        __m256d __b = (b); \
+        (__m256d)__builtin_shufflevector((__v4df)__a, (__v4df)__b, \
+        (mask) & 0x1, \
+        (((mask) & 0x2) >> 1) + 4, \
+        (((mask) & 0x4) >> 2) + 2, \
+        (((mask) & 0x8) >> 3) + 6); })
+
+/* Compare */
+#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
+#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling)  */
+#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling)  */
+#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling)  */
+#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling)  */
+#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling)  */
+#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling)  */
+#define _CMP_ORD_Q    0x07 /* Ordered (nonsignaling)   */
+#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling)  */
+#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unord, signaling)  */
+#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling)  */
+#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling)  */
+#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling)  */
+#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling)  */
+#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling)  */
+#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling)  */
+#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling)  */
+#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling)  */
+#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling)  */
+#define _CMP_UNORD_S  0x13 /* Unordered (signaling)  */
+#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling)  */
+#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling)  */
+#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unord, non-signaling)  */
+#define _CMP_ORD_S    0x17 /* Ordered (signaling)  */
+#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling)  */
+#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unord, non-sign)  */
+#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling)  */
+#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling)  */
+#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling)  */
+#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling)  */
+#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling)  */
+#define _CMP_TRUE_US  0x1f /* True (unordered, signaling)  */
+
+#define _mm_cmp_pd(a, b, c) __extension__ ({ \
+  __m128d __a = (a); \
+  __m128d __b = (b); \
+  (__m128d)__builtin_ia32_cmppd((__v2df)__a, (__v2df)__b, (c)); })
+
+#define _mm_cmp_ps(a, b, c) __extension__ ({ \
+  __m128 __a = (a); \
+  __m128 __b = (b); \
+  (__m128)__builtin_ia32_cmpps((__v4sf)__a, (__v4sf)__b, (c)); })
+
+#define _mm256_cmp_pd(a, b, c) __extension__ ({ \
+  __m256d __a = (a); \
+  __m256d __b = (b); \
+  (__m256d)__builtin_ia32_cmppd256((__v4df)__a, (__v4df)__b, (c)); })
+
+#define _mm256_cmp_ps(a, b, c) __extension__ ({ \
+  __m256 __a = (a); \
+  __m256 __b = (b); \
+  (__m256)__builtin_ia32_cmpps256((__v8sf)__a, (__v8sf)__b, (c)); })
+
+#define _mm_cmp_sd(a, b, c) __extension__ ({ \
+  __m128d __a = (a); \
+  __m128d __b = (b); \
+  (__m128d)__builtin_ia32_cmpsd((__v2df)__a, (__v2df)__b, (c)); })
+
+#define _mm_cmp_ss(a, b, c) __extension__ ({ \
+  __m128 __a = (a); \
+  __m128 __b = (b); \
+  (__m128)__builtin_ia32_cmpss((__v4sf)__a, (__v4sf)__b, (c)); })
+
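+/* Illustrative usage sketch (an editorial addition, not part of the upstream
+ * header): the _CMP_* predicates select the comparison performed by the
+ * _mm*_cmp_* macros, which produce an all-ones or all-zeros mask per lane:
+ *
+ *   __m256 a  = _mm256_set1_ps(1.0f);
+ *   __m256 b  = _mm256_set1_ps(2.0f);
+ *   __m256 lt = _mm256_cmp_ps(a, b, _CMP_LT_OS);  // every lane all-ones
+ *   int bits  = _mm256_movemask_ps(lt);           // 0xff
+ */
+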
+/* Vector extract */
+#define _mm256_extractf128_pd(A, O) __extension__ ({ \
+  __m256d __A = (A); \
+  (__m128d)__builtin_ia32_vextractf128_pd256((__v4df)__A, (O)); })
+
+#define _mm256_extractf128_ps(A, O) __extension__ ({ \
+  __m256 __A = (A); \
+  (__m128)__builtin_ia32_vextractf128_ps256((__v8sf)__A, (O)); })
+
+#define _mm256_extractf128_si256(A, O) __extension__ ({ \
+  __m256i __A = (A); \
+  (__m128i)__builtin_ia32_vextractf128_si256((__v8si)__A, (O)); })
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+_mm256_extract_epi32(__m256i __a, int const __imm)
+{
+  __v8si __b = (__v8si)__a;
+  return __b[__imm & 7];
+}
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+_mm256_extract_epi16(__m256i __a, int const __imm)
+{
+  __v16hi __b = (__v16hi)__a;
+  return __b[__imm & 15];
+}
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+_mm256_extract_epi8(__m256i __a, int const __imm)
+{
+  __v32qi __b = (__v32qi)__a;
+  return __b[__imm & 31];
+}
+
+#ifdef __x86_64__
+static __inline long long  __attribute__((__always_inline__, __nodebug__))
+_mm256_extract_epi64(__m256i __a, const int __imm)
+{
+  __v4di __b = (__v4di)__a;
+  return __b[__imm & 3];
+}
+#endif
+
+/* Vector insert */
+#define _mm256_insertf128_pd(V1, V2, O) __extension__ ({ \
+  __m256d __V1 = (V1); \
+  __m128d __V2 = (V2); \
+  (__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)__V1, (__v2df)__V2, (O)); })
+
+#define _mm256_insertf128_ps(V1, V2, O) __extension__ ({ \
+  __m256 __V1 = (V1); \
+  __m128 __V2 = (V2); \
+  (__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)__V1, (__v4sf)__V2, (O)); })
+
+#define _mm256_insertf128_si256(V1, V2, O) __extension__ ({ \
+  __m256i __V1 = (V1); \
+  __m128i __V2 = (V2); \
+  (__m256i)__builtin_ia32_vinsertf128_si256((__v8si)__V1, (__v4si)__V2, (O)); })
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_insert_epi32(__m256i __a, int __b, int const __imm)
+{
+  __v8si __c = (__v8si)__a;
+  __c[__imm & 7] = __b;
+  return (__m256i)__c;
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_insert_epi16(__m256i __a, int __b, int const __imm)
+{
+  __v16hi __c = (__v16hi)__a;
+  __c[__imm & 15] = __b;
+  return (__m256i)__c;
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_insert_epi8(__m256i __a, int __b, int const __imm)
+{
+  __v32qi __c = (__v32qi)__a;
+  __c[__imm & 31] = __b;
+  return (__m256i)__c;
+}
+
+#ifdef __x86_64__
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_insert_epi64(__m256i __a, long long __b, int const __imm)
+{
+  __v4di __c = (__v4di)__a;
+  __c[__imm & 3] = __b;
+  return (__m256i)__c;
+}
+#endif
+
+/* Conversion */
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_cvtepi32_pd(__m128i __a)
+{
+  return (__m256d)__builtin_ia32_cvtdq2pd256((__v4si) __a);
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_cvtepi32_ps(__m256i __a)
+{
+  return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) __a);
+}
+
+static __inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm256_cvtpd_ps(__m256d __a)
+{
+  return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a);
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cvtps_epi32(__m256 __a)
+{
+  return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a);
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_cvtps_pd(__m128 __a)
+{
+  return (__m256d)__builtin_ia32_cvtps2pd256((__v4sf) __a);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm256_cvttpd_epi32(__m256d __a)
+{
+  return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm256_cvtpd_epi32(__m256d __a)
+{
+  return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cvttps_epi32(__m256 __a)
+{
+  return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a);
+}
+
+/* Vector replicate */
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_movehdup_ps(__m256 __a)
+{
+  return __builtin_shufflevector(__a, __a, 1, 1, 3, 3, 5, 5, 7, 7);
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_moveldup_ps(__m256 __a)
+{
+  return __builtin_shufflevector(__a, __a, 0, 0, 2, 2, 4, 4, 6, 6);
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_movedup_pd(__m256d __a)
+{
+  return __builtin_shufflevector(__a, __a, 0, 0, 2, 2);
+}
+
+/* Unpack and Interleave */
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_unpackhi_pd(__m256d __a, __m256d __b)
+{
+  return __builtin_shufflevector(__a, __b, 1, 5, 1+2, 5+2);
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_unpacklo_pd(__m256d __a, __m256d __b)
+{
+  return __builtin_shufflevector(__a, __b, 0, 4, 0+2, 4+2);
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_unpackhi_ps(__m256 __a, __m256 __b)
+{
+  return __builtin_shufflevector(__a, __b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_unpacklo_ps(__m256 __a, __m256 __b)
+{
+  return __builtin_shufflevector(__a, __b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
+}
+
+/* Bit Test */
+static __inline int __attribute__((__always_inline__, __nodebug__))
+_mm_testz_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b);
+}
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+_mm_testc_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b);
+}
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+_mm_testnzc_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b);
+}
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+_mm_testz_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b);
+}
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+_mm_testc_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b);
+}
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+_mm_testnzc_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b);
+}
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+_mm256_testz_pd(__m256d __a, __m256d __b)
+{
+  return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b);
+}
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+_mm256_testc_pd(__m256d __a, __m256d __b)
+{
+  return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b);
+}
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+_mm256_testnzc_pd(__m256d __a, __m256d __b)
+{
+  return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b);
+}
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+_mm256_testz_ps(__m256 __a, __m256 __b)
+{
+  return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b);
+}
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+_mm256_testc_ps(__m256 __a, __m256 __b)
+{
+  return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b);
+}
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+_mm256_testnzc_ps(__m256 __a, __m256 __b)
+{
+  return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b);
+}
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+_mm256_testz_si256(__m256i __a, __m256i __b)
+{
+  return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b);
+}
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+_mm256_testc_si256(__m256i __a, __m256i __b)
+{
+  return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b);
+}
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+_mm256_testnzc_si256(__m256i __a, __m256i __b)
+{
+  return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b);
+}
+
+/* Vector extract sign mask */
+static __inline int __attribute__((__always_inline__, __nodebug__))
+_mm256_movemask_pd(__m256d __a)
+{
+  return __builtin_ia32_movmskpd256((__v4df)__a);
+}
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+_mm256_movemask_ps(__m256 __a)
+{
+  return __builtin_ia32_movmskps256((__v8sf)__a);
+}
+
+/* Vector zero */
+static __inline void __attribute__((__always_inline__, __nodebug__))
+_mm256_zeroall(void)
+{
+  __builtin_ia32_vzeroall();
+}
+
+static __inline void __attribute__((__always_inline__, __nodebug__))
+_mm256_zeroupper(void)
+{
+  __builtin_ia32_vzeroupper();
+}
+
+/* Vector load with broadcast */
+static __inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_broadcast_ss(float const *__a)
+{
+  float __f = *__a;
+  return (__m128)(__v4sf){ __f, __f, __f, __f };
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_broadcast_sd(double const *__a)
+{
+  double __d = *__a;
+  return (__m256d)(__v4df){ __d, __d, __d, __d };
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_broadcast_ss(float const *__a)
+{
+  float __f = *__a;
+  return (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_broadcast_pd(__m128d const *__a)
+{
+  return (__m256d)__builtin_ia32_vbroadcastf128_pd256(__a);
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_broadcast_ps(__m128 const *__a)
+{
+  return (__m256)__builtin_ia32_vbroadcastf128_ps256(__a);
+}
+
+/* SIMD load ops */
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_load_pd(double const *__p)
+{
+  return *(__m256d *)__p;
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_load_ps(float const *__p)
+{
+  return *(__m256 *)__p;
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_loadu_pd(double const *__p)
+{
+  struct __loadu_pd {
+    __m256d __v;
+  } __attribute__((packed, may_alias));
+  return ((struct __loadu_pd*)__p)->__v;
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_loadu_ps(float const *__p)
+{
+  struct __loadu_ps {
+    __m256 __v;
+  } __attribute__((packed, may_alias));
+  return ((struct __loadu_ps*)__p)->__v;
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_load_si256(__m256i const *__p)
+{
+  return *__p;
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_loadu_si256(__m256i const *__p)
+{
+  struct __loadu_si256 {
+    __m256i __v;
+  } __attribute__((packed, may_alias));
+  return ((struct __loadu_si256*)__p)->__v;
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_lddqu_si256(__m256i const *__p)
+{
+  return (__m256i)__builtin_ia32_lddqu256((char const *)__p);
+}
+
+/* SIMD store ops */
+static __inline void __attribute__((__always_inline__, __nodebug__))
+_mm256_store_pd(double *__p, __m256d __a)
+{
+  *(__m256d *)__p = __a;
+}
+
+static __inline void __attribute__((__always_inline__, __nodebug__))
+_mm256_store_ps(float *__p, __m256 __a)
+{
+  *(__m256 *)__p = __a;
+}
+
+static __inline void __attribute__((__always_inline__, __nodebug__))
+_mm256_storeu_pd(double *__p, __m256d __a)
+{
+  __builtin_ia32_storeupd256(__p, (__v4df)__a);
+}
+
+static __inline void __attribute__((__always_inline__, __nodebug__))
+_mm256_storeu_ps(float *__p, __m256 __a)
+{
+  __builtin_ia32_storeups256(__p, (__v8sf)__a);
+}
+
+static __inline void __attribute__((__always_inline__, __nodebug__))
+_mm256_store_si256(__m256i *__p, __m256i __a)
+{
+  *__p = __a;
+}
+
+static __inline void __attribute__((__always_inline__, __nodebug__))
+_mm256_storeu_si256(__m256i *__p, __m256i __a)
+{
+  __builtin_ia32_storedqu256((char *)__p, (__v32qi)__a);
+}
+
+/* Conditional load ops */
+static __inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_maskload_pd(double const *__p, __m128d __m)
+{
+  return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2df)__m);
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_maskload_pd(double const *__p, __m256d __m)
+{
+  return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p,
+                                               (__v4df)__m);
+}
+
+static __inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_maskload_ps(float const *__p, __m128 __m)
+{
+  return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4sf)__m);
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_maskload_ps(float const *__p, __m256 __m)
+{
+  return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8sf)__m);
+}
+
+/* Conditional store ops */
+static __inline void __attribute__((__always_inline__, __nodebug__))
+_mm256_maskstore_ps(float *__p, __m256 __m, __m256 __a)
+{
+  __builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8sf)__m, (__v8sf)__a);
+}
+
+static __inline void __attribute__((__always_inline__, __nodebug__))
+_mm_maskstore_pd(double *__p, __m128d __m, __m128d __a)
+{
+  __builtin_ia32_maskstorepd((__v2df *)__p, (__v2df)__m, (__v2df)__a);
+}
+
+static __inline void __attribute__((__always_inline__, __nodebug__))
+_mm256_maskstore_pd(double *__p, __m256d __m, __m256d __a)
+{
+  __builtin_ia32_maskstorepd256((__v4df *)__p, (__v4df)__m, (__v4df)__a);
+}
+
+static __inline void __attribute__((__always_inline__, __nodebug__))
+_mm_maskstore_ps(float *__p, __m128 __m, __m128 __a)
+{
+  __builtin_ia32_maskstoreps((__v4sf *)__p, (__v4sf)__m, (__v4sf)__a);
+}
+
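+/* Illustrative usage sketch (an editorial addition, not part of the upstream
+ * header): only lanes whose mask element has its sign bit set are loaded;
+ * the remaining lanes read as zero.
+ *
+ *   double src[4] = { 1.0, 2.0, 3.0, 4.0 };
+ *   __m256d m = _mm256_castsi256_pd(_mm256_set_epi64x(0, -1, 0, -1));
+ *   __m256d v = _mm256_maskload_pd(src, m);  // { 1.0, 0.0, 3.0, 0.0 }
+ */
+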
+/* Cacheability support ops */
+static __inline void __attribute__((__always_inline__, __nodebug__))
+_mm256_stream_si256(__m256i *__a, __m256i __b)
+{
+  __builtin_ia32_movntdq256((__v4di *)__a, (__v4di)__b);
+}
+
+static __inline void __attribute__((__always_inline__, __nodebug__))
+_mm256_stream_pd(double *__a, __m256d __b)
+{
+  __builtin_ia32_movntpd256(__a, (__v4df)__b);
+}
+
+static __inline void __attribute__((__always_inline__, __nodebug__))
+_mm256_stream_ps(float *__p, __m256 __a)
+{
+  __builtin_ia32_movntps256(__p, (__v8sf)__a);
+}
+
+/* Create vectors */
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_set_pd(double __a, double __b, double __c, double __d)
+{
+  return (__m256d){ __d, __c, __b, __a };
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_set_ps(float __a, float __b, float __c, float __d,
+	            float __e, float __f, float __g, float __h)
+{
+  return (__m256){ __h, __g, __f, __e, __d, __c, __b, __a };
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
+		             int __i4, int __i5, int __i6, int __i7)
+{
+  return (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 };
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
+		             short __w11, short __w10, short __w09, short __w08,
+		             short __w07, short __w06, short __w05, short __w04,
+		             short __w03, short __w02, short __w01, short __w00)
+{
+  return (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06,
+    __w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 };
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
+		            char __b27, char __b26, char __b25, char __b24,
+		            char __b23, char __b22, char __b21, char __b20,
+		            char __b19, char __b18, char __b17, char __b16,
+		            char __b15, char __b14, char __b13, char __b12,
+		            char __b11, char __b10, char __b09, char __b08,
+		            char __b07, char __b06, char __b05, char __b04,
+		            char __b03, char __b02, char __b01, char __b00)
+{
+  return (__m256i)(__v32qi){
+    __b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
+    __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
+    __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
+    __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31
+  };
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
+{
+  return (__m256i)(__v4di){ __d, __c, __b, __a };
+}
+
+/* Create vectors with elements in reverse order */
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_setr_pd(double __a, double __b, double __c, double __d)
+{
+  return (__m256d){ __a, __b, __c, __d };
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_setr_ps(float __a, float __b, float __c, float __d,
+		           float __e, float __f, float __g, float __h)
+{
+  return (__m256){ __a, __b, __c, __d, __e, __f, __g, __h };
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
+		              int __i4, int __i5, int __i6, int __i7)
+{
+  return (__m256i)(__v8si){ __i0, __i1, __i2, __i3, __i4, __i5, __i6, __i7 };
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
+		   short __w11, short __w10, short __w09, short __w08,
+		   short __w07, short __w06, short __w05, short __w04,
+		   short __w03, short __w02, short __w01, short __w00)
+{
+  return (__m256i)(__v16hi){ __w15, __w14, __w13, __w12, __w11, __w10, __w09,
+    __w08, __w07, __w06, __w05, __w04, __w03, __w02, __w01, __w00 };
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
+		             char __b27, char __b26, char __b25, char __b24,
+		             char __b23, char __b22, char __b21, char __b20,
+		             char __b19, char __b18, char __b17, char __b16,
+		             char __b15, char __b14, char __b13, char __b12,
+		             char __b11, char __b10, char __b09, char __b08,
+		             char __b07, char __b06, char __b05, char __b04,
+		             char __b03, char __b02, char __b01, char __b00)
+{
+  return (__m256i)(__v32qi){
+    __b31, __b30, __b29, __b28, __b27, __b26, __b25, __b24,
+		__b23, __b22, __b21, __b20, __b19, __b18, __b17, __b16,
+		__b15, __b14, __b13, __b12, __b11, __b10, __b09, __b08,
+		__b07, __b06, __b05, __b04, __b03, __b02, __b01, __b00 };
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
+{
+  return (__m256i)(__v4di){ __a, __b, __c, __d };
+}
+
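+/* Illustrative note (an editorial addition, not part of the upstream header):
+ * _mm256_set_* takes arguments from the highest element down, while
+ * _mm256_setr_* takes them in memory order, so these build the same vector:
+ *
+ *   __m256d x = _mm256_set_pd(3.0, 2.0, 1.0, 0.0);
+ *   __m256d y = _mm256_setr_pd(0.0, 1.0, 2.0, 3.0);
+ */
+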
+/* Create vectors with repeated elements */
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_set1_pd(double __w)
+{
+  return (__m256d){ __w, __w, __w, __w };
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_set1_ps(float __w)
+{
+  return (__m256){ __w, __w, __w, __w, __w, __w, __w, __w };
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_set1_epi32(int __i)
+{
+  return (__m256i)(__v8si){ __i, __i, __i, __i, __i, __i, __i, __i };
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_set1_epi16(short __w)
+{
+  return (__m256i)(__v16hi){ __w, __w, __w, __w, __w, __w, __w, __w, __w, __w,
+    __w, __w, __w, __w, __w, __w };
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_set1_epi8(char __b)
+{
+  return (__m256i)(__v32qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
+    __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
+    __b, __b, __b, __b, __b, __b, __b };
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_set1_epi64x(long long __q)
+{
+  return (__m256i)(__v4di){ __q, __q, __q, __q };
+}
+
+/* Create zeroed vectors */
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_setzero_pd(void)
+{
+  return (__m256d){ 0, 0, 0, 0 };
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_setzero_ps(void)
+{
+  return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_setzero_si256(void)
+{
+  return (__m256i){ 0LL, 0LL, 0LL, 0LL };
+}
+
+/* Cast between vector types */
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_castpd_ps(__m256d __a)
+{
+  return (__m256)__a;
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_castpd_si256(__m256d __a)
+{
+  return (__m256i)__a;
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_castps_pd(__m256 __a)
+{
+  return (__m256d)__a;
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_castps_si256(__m256 __a)
+{
+  return (__m256i)__a;
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_castsi256_ps(__m256i __a)
+{
+  return (__m256)__a;
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_castsi256_pd(__m256i __a)
+{
+  return (__m256d)__a;
+}
+
+static __inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm256_castpd256_pd128(__m256d __a)
+{
+  return __builtin_shufflevector(__a, __a, 0, 1);
+}
+
+static __inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm256_castps256_ps128(__m256 __a)
+{
+  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm256_castsi256_si128(__m256i __a)
+{
+  return __builtin_shufflevector(__a, __a, 0, 1);
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_castpd128_pd256(__m128d __a)
+{
+  return __builtin_shufflevector(__a, __a, 0, 1, -1, -1);
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_castps128_ps256(__m128 __a)
+{
+  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, -1, -1, -1, -1);
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_castsi128_si256(__m128i __a)
+{
+  return __builtin_shufflevector(__a, __a, 0, 1, -1, -1);
+}
+
+/* SIMD load ops (unaligned) */
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
+{
+  struct __loadu_ps {
+    __m128 __v;
+  } __attribute__((__packed__, __may_alias__));
+
+  __m256 __v256 = _mm256_castps128_ps256(((struct __loadu_ps*)__addr_lo)->__v);
+  return _mm256_insertf128_ps(__v256, ((struct __loadu_ps*)__addr_hi)->__v, 1);
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
+{
+  struct __loadu_pd {
+    __m128d __v;
+  } __attribute__((__packed__, __may_alias__));
+  
+  __m256d __v256 = _mm256_castpd128_pd256(((struct __loadu_pd*)__addr_lo)->__v);
+  return _mm256_insertf128_pd(__v256, ((struct __loadu_pd*)__addr_hi)->__v, 1);
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_loadu2_m128i(__m128i const *__addr_hi, __m128i const *__addr_lo)
+{
+  struct __loadu_si128 {
+    __m128i __v;
+  } __attribute__((packed, may_alias));
+  __m256i __v256 = _mm256_castsi128_si256(
+    ((struct __loadu_si128*)__addr_lo)->__v);
+  return _mm256_insertf128_si256(__v256,
+                                 ((struct __loadu_si128*)__addr_hi)->__v, 1);
+}
+
+/* SIMD store ops (unaligned) */
+static __inline void __attribute__((__always_inline__, __nodebug__))
+_mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
+{
+  __m128 __v128;
+
+  __v128 = _mm256_castps256_ps128(__a);
+  __builtin_ia32_storeups(__addr_lo, __v128);
+  __v128 = _mm256_extractf128_ps(__a, 1);
+  __builtin_ia32_storeups(__addr_hi, __v128);
+}
+
+static __inline void __attribute__((__always_inline__, __nodebug__))
+_mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
+{
+  __m128d __v128;
+
+  __v128 = _mm256_castpd256_pd128(__a);
+  __builtin_ia32_storeupd(__addr_lo, __v128);
+  __v128 = _mm256_extractf128_pd(__a, 1);
+  __builtin_ia32_storeupd(__addr_hi, __v128);
+}
+
+static __inline void __attribute__((__always_inline__, __nodebug__))
+_mm256_storeu2_m128i(__m128i *__addr_hi, __m128i *__addr_lo, __m256i __a)
+{
+  __m128i __v128;
+
+  __v128 = _mm256_castsi256_si128(__a);
+  __builtin_ia32_storedqu((char *)__addr_lo, (__v16qi)__v128);
+  __v128 = _mm256_extractf128_si256(__a, 1);
+  __builtin_ia32_storedqu((char *)__addr_hi, (__v16qi)__v128);
+}
+
+#endif /* __AVXINTRIN_H */
diff --git a/22.0.1/clang-include/bmi2intrin.h b/22.0.1/clang-include/bmi2intrin.h
new file mode 100644
index 0000000..a05cfad
--- /dev/null
+++ b/22.0.1/clang-include/bmi2intrin.h
@@ -0,0 +1,94 @@
+/*===---- bmi2intrin.h - BMI2 intrinsics -----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
+#error "Never use <bmi2intrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __BMI2__
+# error "BMI2 instruction set not enabled"
+#endif /* __BMI2__ */
+
+#ifndef __BMI2INTRIN_H
+#define __BMI2INTRIN_H
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+_bzhi_u32(unsigned int __X, unsigned int __Y)
+{
+  return __builtin_ia32_bzhi_si(__X, __Y);
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+_pdep_u32(unsigned int __X, unsigned int __Y)
+{
+  return __builtin_ia32_pdep_si(__X, __Y);
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+_pext_u32(unsigned int __X, unsigned int __Y)
+{
+  return __builtin_ia32_pext_si(__X, __Y);
+}
+
+#ifdef  __x86_64__
+
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+_bzhi_u64(unsigned long long __X, unsigned long long __Y)
+{
+  return __builtin_ia32_bzhi_di(__X, __Y);
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+_pdep_u64(unsigned long long __X, unsigned long long __Y)
+{
+  return __builtin_ia32_pdep_di(__X, __Y);
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+_pext_u64(unsigned long long __X, unsigned long long __Y)
+{
+  return __builtin_ia32_pext_di(__X, __Y);
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+_mulx_u64 (unsigned long long __X, unsigned long long __Y,
+	   unsigned long long *__P)
+{
+  unsigned __int128 __res = (unsigned __int128) __X * __Y;
+  *__P = (unsigned long long) (__res >> 64);
+  return (unsigned long long) __res;
+}
+
+#else /* !__x86_64__ */
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+_mulx_u32 (unsigned int __X, unsigned int __Y, unsigned int *__P)
+{
+  unsigned long long __res = (unsigned long long) __X * __Y;
+  *__P = (unsigned int) (__res >> 32);
+  return (unsigned int) __res;
+}
+
+#endif /* !__x86_64__  */
+
+#endif /* __BMI2INTRIN_H */
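Usage note (not part of the prebuilt header): a minimal sketch of the BMI2 bit scatter/gather intrinsics defined above, assuming a BMI2-capable target (e.g. clang -mbmi2).

#include <x86intrin.h>
#include <stdio.h>

int main(void) {
  /* PEXT gathers the source bits selected by the mask into the low bits,
   * PDEP is the inverse scatter, and BZHI clears the bits at and above an index. */
  unsigned int packed = _pext_u32(0xABCDu, 0x0F0Fu);  /* 0xBD   */
  unsigned int spread = _pdep_u32(0xBDu,   0x0F0Fu);  /* 0x0B0D */
  unsigned int low8   = _bzhi_u32(0xFFFFFFFFu, 8);    /* 0xFF   */
  printf("%#x %#x %#x\n", packed, spread, low8);
  return 0;
}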
diff --git a/22.0.1/clang-include/bmiintrin.h b/22.0.1/clang-include/bmiintrin.h
new file mode 100644
index 0000000..43c4a5e
--- /dev/null
+++ b/22.0.1/clang-include/bmiintrin.h
@@ -0,0 +1,148 @@
+/*===---- bmiintrin.h - BMI intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
+#error "Never use <bmiintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __BMI__
+# error "BMI instruction set not enabled"
+#endif /* __BMI__ */
+
+#ifndef __BMIINTRIN_H
+#define __BMIINTRIN_H
+
+#define _tzcnt_u16(a)     (__tzcnt_u16((a)))
+#define _andn_u32(a, b)   (__andn_u32((a), (b)))
+/* _bextr_u32 != __bextr_u32 */
+#define _blsi_u32(a)      (__blsi_u32((a)))
+#define _blsmsk_u32(a)    (__blsmsk_u32((a)))
+#define _blsr_u32(a)      (__blsr_u32((a)))
+#define _tzcnt_u32(a)     (__tzcnt_u32((a)))
+
+static __inline__ unsigned short __attribute__((__always_inline__, __nodebug__))
+__tzcnt_u16(unsigned short __X)
+{
+  return __builtin_ctzs(__X);
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+__andn_u32(unsigned int __X, unsigned int __Y)
+{
+  return ~__X & __Y;
+}
+
+/* AMD-specified, double-leading-underscore version of BEXTR */
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+__bextr_u32(unsigned int __X, unsigned int __Y)
+{
+  return __builtin_ia32_bextr_u32(__X, __Y);
+}
+
+/* Intel-specified, single-leading-underscore version of BEXTR */
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+_bextr_u32(unsigned int __X, unsigned int __Y, unsigned int __Z)
+{
+  return __builtin_ia32_bextr_u32 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+__blsi_u32(unsigned int __X)
+{
+  return __X & -__X;
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+__blsmsk_u32(unsigned int __X)
+{
+  return __X ^ (__X - 1);
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+__blsr_u32(unsigned int __X)
+{
+  return __X & (__X - 1);
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+__tzcnt_u32(unsigned int __X)
+{
+  return __builtin_ctz(__X);
+}
+
+#ifdef __x86_64__
+
+#define _andn_u64(a, b)   (__andn_u64((a), (b)))
+/* _bextr_u64 != __bextr_u64 */
+#define _blsi_u64(a)      (__blsi_u64((a)))
+#define _blsmsk_u64(a)    (__blsmsk_u64((a)))
+#define _blsr_u64(a)      (__blsr_u64((a)))
+#define _tzcnt_u64(a)     (__tzcnt_u64((a)))
+
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+__andn_u64 (unsigned long long __X, unsigned long long __Y)
+{
+  return ~__X & __Y;
+}
+
+/* AMD-specified, double-leading-underscore version of BEXTR */
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+__bextr_u64(unsigned long long __X, unsigned long long __Y)
+{
+  return __builtin_ia32_bextr_u64(__X, __Y);
+}
+
+/* Intel-specified, single-leading-underscore version of BEXTR */
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+_bextr_u64(unsigned long long __X, unsigned int __Y, unsigned int __Z)
+{
+  return __builtin_ia32_bextr_u64 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+__blsi_u64(unsigned long long __X)
+{
+  return __X & -__X;
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+__blsmsk_u64(unsigned long long __X)
+{
+  return __X ^ (__X - 1);
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+__blsr_u64(unsigned long long __X)
+{
+  return __X & (__X - 1);
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+__tzcnt_u64(unsigned long long __X)
+{
+  return __builtin_ctzll(__X);
+}
+
+#endif /* __x86_64__ */
+
+#endif /* __BMIINTRIN_H */
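Usage note (not part of the prebuilt header): a minimal sketch of the BMI bit-manipulation intrinsics defined above, assuming a BMI-capable target (e.g. clang -mbmi).

#include <x86intrin.h>
#include <stdio.h>

int main(void) {
  unsigned int x = 0x58u;               /* 0b01011000 */
  printf("%#x\n", _blsi_u32(x));        /* 0x08: isolate lowest set bit     */
  printf("%#x\n", _blsr_u32(x));        /* 0x50: clear lowest set bit       */
  printf("%#x\n", _blsmsk_u32(x));      /* 0x0f: mask up to lowest set bit  */
  printf("%u\n",  _tzcnt_u32(x));       /* 3: trailing zero count           */
  printf("%#x\n", _bextr_u32(x, 3, 4)); /* 0xb: extract 4 bits from bit 3   */
  return 0;
}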
diff --git a/22.0.1/clang-include/cpuid.h b/22.0.1/clang-include/cpuid.h
new file mode 100644
index 0000000..f9254e9
--- /dev/null
+++ b/22.0.1/clang-include/cpuid.h
@@ -0,0 +1,157 @@
+/*===---- cpuid.h - X86 cpu model detection --------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !(__x86_64__ || __i386__)
+#error this header is for x86 only
+#endif
+
+/* Features in %ecx for level 1 */
+#define bit_SSE3        0x00000001
+#define bit_PCLMULQDQ   0x00000002
+#define bit_DTES64      0x00000004
+#define bit_MONITOR     0x00000008
+#define bit_DSCPL       0x00000010
+#define bit_VMX         0x00000020
+#define bit_SMX         0x00000040
+#define bit_EIST        0x00000080
+#define bit_TM2         0x00000100
+#define bit_SSSE3       0x00000200
+#define bit_CNXTID      0x00000400
+#define bit_FMA         0x00001000
+#define bit_CMPXCHG16B  0x00002000
+#define bit_xTPR        0x00004000
+#define bit_PDCM        0x00008000
+#define bit_PCID        0x00020000
+#define bit_DCA         0x00040000
+#define bit_SSE41       0x00080000
+#define bit_SSE42       0x00100000
+#define bit_x2APIC      0x00200000
+#define bit_MOVBE       0x00400000
+#define bit_POPCNT      0x00800000
+#define bit_TSCDeadline 0x01000000
+#define bit_AESNI       0x02000000
+#define bit_XSAVE       0x04000000
+#define bit_OSXSAVE     0x08000000
+#define bit_AVX         0x10000000
+#define bit_RDRAND      0x40000000
+
+/* Features in %edx for level 1 */
+#define bit_FPU         0x00000001
+#define bit_VME         0x00000002
+#define bit_DE          0x00000004
+#define bit_PSE         0x00000008
+#define bit_TSC         0x00000010
+#define bit_MSR         0x00000020
+#define bit_PAE         0x00000040
+#define bit_MCE         0x00000080
+#define bit_CX8         0x00000100
+#define bit_APIC        0x00000200
+#define bit_SEP         0x00000800
+#define bit_MTRR        0x00001000
+#define bit_PGE         0x00002000
+#define bit_MCA         0x00004000
+#define bit_CMOV        0x00008000
+#define bit_PAT         0x00010000
+#define bit_PSE36       0x00020000
+#define bit_PSN         0x00040000
+#define bit_CLFSH       0x00080000
+#define bit_DS          0x00200000
+#define bit_ACPI        0x00400000
+#define bit_MMX         0x00800000
+#define bit_FXSR        0x01000000
+#define bit_FXSAVE      bit_FXSR    /* for gcc compat */
+#define bit_SSE         0x02000000
+#define bit_SSE2        0x04000000
+#define bit_SS          0x08000000
+#define bit_HTT         0x10000000
+#define bit_TM          0x20000000
+#define bit_PBE         0x80000000
+
+/* Features in %ebx for level 7 sub-leaf 0 */
+#define bit_FSGSBASE    0x00000001
+#define bit_SMEP        0x00000080
+#define bit_ENH_MOVSB   0x00000200
+
+/* PIC on i386 uses %ebx, so preserve it. */
+#if __i386__
+#define __cpuid(__level, __eax, __ebx, __ecx, __edx) \
+    __asm("  pushl  %%ebx\n" \
+          "  cpuid\n" \
+          "  mov    %%ebx,%1\n" \
+          "  popl   %%ebx" \
+        : "=a"(__eax), "=r" (__ebx), "=c"(__ecx), "=d"(__edx) \
+        : "0"(__level))
+
+#define __cpuid_count(__level, __count, __eax, __ebx, __ecx, __edx) \
+    __asm("  pushl  %%ebx\n" \
+          "  cpuid\n" \
+          "  mov    %%ebx,%1\n" \
+          "  popl   %%ebx" \
+        : "=a"(__eax), "=r" (__ebx), "=c"(__ecx), "=d"(__edx) \
+        : "0"(__level), "2"(__count))
+#else
+#define __cpuid(__level, __eax, __ebx, __ecx, __edx) \
+    __asm("cpuid" : "=a"(__eax), "=b" (__ebx), "=c"(__ecx), "=d"(__edx) \
+                  : "0"(__level))
+
+#define __cpuid_count(__level, __count, __eax, __ebx, __ecx, __edx) \
+    __asm("cpuid" : "=a"(__eax), "=b" (__ebx), "=c"(__ecx), "=d"(__edx) \
+                  : "0"(__level), "2"(__count))
+#endif
+
+static __inline int __get_cpuid (unsigned int __level, unsigned int *__eax,
+                                 unsigned int *__ebx, unsigned int *__ecx,
+                                 unsigned int *__edx) {
+    __cpuid(__level, *__eax, *__ebx, *__ecx, *__edx);
+    return 1;
+}
+
+static __inline int __get_cpuid_max (unsigned int __level, unsigned int *__sig)
+{
+    unsigned int __eax, __ebx, __ecx, __edx;
+#if __i386__
+    int __cpuid_supported;
+
+    __asm("  pushfl\n"
+          "  popl   %%eax\n"
+          "  movl   %%eax,%%ecx\n"
+          "  xorl   $0x00200000,%%eax\n"
+          "  pushl  %%eax\n"
+          "  popfl\n"
+          "  pushfl\n"
+          "  popl   %%eax\n"
+          "  movl   $0,%0\n"
+          "  cmpl   %%eax,%%ecx\n"
+          "  je     1f\n"
+          "  movl   $1,%0\n"
+          "1:"
+        : "=r" (__cpuid_supported) : : "eax", "ecx");
+    if (!__cpuid_supported)
+        return 0;
+#endif
+
+    __cpuid(__level, __eax, __ebx, __ecx, __edx);
+    if (__sig)
+        *__sig = __ebx;
+    return __eax;
+}
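Usage note (not part of the prebuilt header): a minimal feature probe built on __get_cpuid and the bit_* masks defined above; leaf 1 reports the baseline feature flags in %ecx and %edx.

#include <cpuid.h>
#include <stdio.h>

int main(void) {
  unsigned int eax, ebx, ecx, edx;
  if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
    printf("SSE2:   %s\n", (edx & bit_SSE2)   ? "yes" : "no");
    printf("SSE4.2: %s\n", (ecx & bit_SSE42)  ? "yes" : "no");
    printf("POPCNT: %s\n", (ecx & bit_POPCNT) ? "yes" : "no");
    printf("AVX:    %s\n", (ecx & bit_AVX)    ? "yes" : "no");
  }
  return 0;
}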
diff --git a/22.0.1/clang-include/emmintrin.h b/22.0.1/clang-include/emmintrin.h
new file mode 100644
index 0000000..b3f8569
--- /dev/null
+++ b/22.0.1/clang-include/emmintrin.h
@@ -0,0 +1,1451 @@
+/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __EMMINTRIN_H
+#define __EMMINTRIN_H
+
+#ifndef __SSE2__
+#error "SSE2 instruction set not enabled"
+#else
+
+#include <xmmintrin.h>
+
+typedef double __m128d __attribute__((__vector_size__(16)));
+typedef long long __m128i __attribute__((__vector_size__(16)));
+
+/* Type defines.  */
+typedef double __v2df __attribute__ ((__vector_size__ (16)));
+typedef long long __v2di __attribute__ ((__vector_size__ (16)));
+typedef short __v8hi __attribute__((__vector_size__(16)));
+typedef char __v16qi __attribute__((__vector_size__(16)));
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_add_sd(__m128d __a, __m128d __b)
+{
+  __a[0] += __b[0];
+  return __a;
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_add_pd(__m128d __a, __m128d __b)
+{
+  return __a + __b;
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_sub_sd(__m128d __a, __m128d __b)
+{
+  __a[0] -= __b[0];
+  return __a;
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_sub_pd(__m128d __a, __m128d __b)
+{
+  return __a - __b;
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_mul_sd(__m128d __a, __m128d __b)
+{
+  __a[0] *= __b[0];
+  return __a;
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_mul_pd(__m128d __a, __m128d __b)
+{
+  return __a * __b;
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_div_sd(__m128d __a, __m128d __b)
+{
+  __a[0] /= __b[0];
+  return __a;
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_div_pd(__m128d __a, __m128d __b)
+{
+  return __a / __b;
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_sqrt_sd(__m128d __a, __m128d __b)
+{
+  __m128d __c = __builtin_ia32_sqrtsd(__b);
+  return (__m128d) { __c[0], __a[1] };
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_sqrt_pd(__m128d __a)
+{
+  return __builtin_ia32_sqrtpd(__a);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_min_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_minsd(__a, __b);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_min_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_minpd(__a, __b);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_max_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_maxsd(__a, __b);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_max_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_maxpd(__a, __b);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_and_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)((__v4si)__a & (__v4si)__b);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_andnot_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)(~(__v4si)__a & (__v4si)__b);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_or_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)((__v4si)__a | (__v4si)__b);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_xor_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)((__v4si)__a ^ (__v4si)__b);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpeq_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmppd(__a, __b, 0);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmplt_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmppd(__a, __b, 1);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmple_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmppd(__a, __b, 2);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpgt_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmppd(__b, __a, 1);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpge_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmppd(__b, __a, 2);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpord_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmppd(__a, __b, 7);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpunord_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmppd(__a, __b, 3);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpneq_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmppd(__a, __b, 4);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpnlt_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmppd(__a, __b, 5);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpnle_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmppd(__a, __b, 6);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpngt_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmppd(__b, __a, 5);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpnge_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmppd(__b, __a, 6);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpeq_sd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpsd(__a, __b, 0);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmplt_sd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpsd(__a, __b, 1);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmple_sd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpsd(__a, __b, 2);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpgt_sd(__m128d __a, __m128d __b)
+{
+  __m128d __c = __builtin_ia32_cmpsd(__b, __a, 1);
+  return (__m128d) { __c[0], __a[1] };
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpge_sd(__m128d __a, __m128d __b)
+{
+  __m128d __c = __builtin_ia32_cmpsd(__b, __a, 2);
+  return (__m128d) { __c[0], __a[1] };
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpord_sd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpsd(__a, __b, 7);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpunord_sd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpsd(__a, __b, 3);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpneq_sd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpsd(__a, __b, 4);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpnlt_sd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpsd(__a, __b, 5);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpnle_sd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpsd(__a, __b, 6);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpngt_sd(__m128d __a, __m128d __b)
+{
+  __m128d __c = __builtin_ia32_cmpsd(__b, __a, 5);
+  return (__m128d) { __c[0], __a[1] };
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpnge_sd(__m128d __a, __m128d __b)
+{
+  __m128d __c = __builtin_ia32_cmpsd(__b, __a, 6);
+  return (__m128d) { __c[0], __a[1] };
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_comieq_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_comisdeq(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_comilt_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_comisdlt(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_comile_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_comisdle(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_comigt_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_comisdgt(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_comige_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_comisdge(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_comineq_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_comisdneq(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_ucomieq_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_ucomisdeq(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_ucomilt_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_ucomisdlt(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_ucomile_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_ucomisdle(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_ucomigt_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_ucomisdgt(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_ucomige_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_ucomisdge(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_ucomineq_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_ucomisdneq(__a, __b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtpd_ps(__m128d __a)
+{
+  return __builtin_ia32_cvtpd2ps(__a);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cvtps_pd(__m128 __a)
+{
+  return __builtin_ia32_cvtps2pd(__a);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cvtepi32_pd(__m128i __a)
+{
+  return __builtin_ia32_cvtdq2pd((__v4si)__a);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cvtpd_epi32(__m128d __a)
+{
+  return __builtin_ia32_cvtpd2dq(__a);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_cvtsd_si32(__m128d __a)
+{
+  return __builtin_ia32_cvtsd2si(__a);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtsd_ss(__m128 __a, __m128d __b)
+{
+  __a[0] = __b[0];
+  return __a;
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cvtsi32_sd(__m128d __a, int __b)
+{
+  __a[0] = __b;
+  return __a;
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cvtss_sd(__m128d __a, __m128 __b)
+{
+  __a[0] = __b[0];
+  return __a;
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cvttpd_epi32(__m128d __a)
+{
+  return (__m128i)__builtin_ia32_cvttpd2dq(__a);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_cvttsd_si32(__m128d __a)
+{
+  return __a[0];
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtpd_pi32(__m128d __a)
+{
+  return (__m64)__builtin_ia32_cvtpd2pi(__a);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_cvttpd_pi32(__m128d __a)
+{
+  return (__m64)__builtin_ia32_cvttpd2pi(__a);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cvtpi32_pd(__m64 __a)
+{
+  return __builtin_ia32_cvtpi2pd((__v2si)__a);
+}
+
+static __inline__ double __attribute__((__always_inline__, __nodebug__))
+_mm_cvtsd_f64(__m128d __a)
+{
+  return __a[0];
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_load_pd(double const *__dp)
+{
+  return *(__m128d*)__dp;
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_load1_pd(double const *__dp)
+{
+  struct __mm_load1_pd_struct {
+    double __u;
+  } __attribute__((__packed__, __may_alias__));
+  double __u = ((struct __mm_load1_pd_struct*)__dp)->__u;
+  return (__m128d){ __u, __u };
+}
+
+#define        _mm_load_pd1(dp)        _mm_load1_pd(dp)
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_loadr_pd(double const *__dp)
+{
+  __m128d __u = *(__m128d*)__dp;
+  return __builtin_shufflevector(__u, __u, 1, 0);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_loadu_pd(double const *__dp)
+{
+  struct __loadu_pd {
+    __m128d __v;
+  } __attribute__((packed, may_alias));
+  return ((struct __loadu_pd*)__dp)->__v;
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_load_sd(double const *__dp)
+{
+  struct __mm_load_sd_struct {
+    double __u;
+  } __attribute__((__packed__, __may_alias__));
+  double __u = ((struct __mm_load_sd_struct*)__dp)->__u;
+  return (__m128d){ __u, 0 };
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_loadh_pd(__m128d __a, double const *__dp)
+{
+  struct __mm_loadh_pd_struct {
+    double __u;
+  } __attribute__((__packed__, __may_alias__));
+  double __u = ((struct __mm_loadh_pd_struct*)__dp)->__u;
+  return (__m128d){ __a[0], __u };
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_loadl_pd(__m128d __a, double const *__dp)
+{
+  struct __mm_loadl_pd_struct {
+    double __u;
+  } __attribute__((__packed__, __may_alias__));
+  double __u = ((struct __mm_loadl_pd_struct*)__dp)->__u;
+  return (__m128d){ __u, __a[1] };
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_set_sd(double __w)
+{
+  return (__m128d){ __w, 0 };
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_set1_pd(double __w)
+{
+  return (__m128d){ __w, __w };
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_set_pd(double __w, double __x)
+{
+  return (__m128d){ __x, __w };
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_setr_pd(double __w, double __x)
+{
+  return (__m128d){ __w, __x };
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_setzero_pd(void)
+{
+  return (__m128d){ 0, 0 };
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_move_sd(__m128d __a, __m128d __b)
+{
+  return (__m128d){ __b[0], __a[1] };
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_store_sd(double *__dp, __m128d __a)
+{
+  struct __mm_store_sd_struct {
+    double __u;
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __mm_store_sd_struct*)__dp)->__u = __a[0];
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_store1_pd(double *__dp, __m128d __a)
+{
+  struct __mm_store1_pd_struct {
+    double __u[2];
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __mm_store1_pd_struct*)__dp)->__u[0] = __a[0];
+  ((struct __mm_store1_pd_struct*)__dp)->__u[1] = __a[0];
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_store_pd(double *__dp, __m128d __a)
+{
+  *(__m128d *)__dp = __a;
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_storeu_pd(double *__dp, __m128d __a)
+{
+  __builtin_ia32_storeupd(__dp, __a);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_storer_pd(double *__dp, __m128d __a)
+{
+  __a = __builtin_shufflevector(__a, __a, 1, 0);
+  *(__m128d *)__dp = __a;
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_storeh_pd(double *__dp, __m128d __a)
+{
+  struct __mm_storeh_pd_struct {
+    double __u;
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1];
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_storel_pd(double *__dp, __m128d __a)
+{
+  struct __mm_storeh_pd_struct {
+    double __u;
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0];
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_add_epi8(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v16qi)__a + (__v16qi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_add_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v8hi)__a + (__v8hi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_add_epi32(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v4si)__a + (__v4si)__b);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_add_si64(__m64 __a, __m64 __b)
+{
+  return __a + __b;
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_add_epi64(__m128i __a, __m128i __b)
+{
+  return __a + __b;
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_adds_epi8(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_adds_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_adds_epu8(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_adds_epu16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_avg_epu8(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_avg_epu16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_madd_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_max_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_max_epu8(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_min_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_min_epu8(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_mulhi_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_mulhi_epu16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_mullo_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v8hi)__a * (__v8hi)__b);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_mul_su32(__m64 __a, __m64 __b)
+{
+  return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_mul_epu32(__m128i __a, __m128i __b)
+{
+  return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sad_epu8(__m128i __a, __m128i __b)
+{
+  return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sub_epi8(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v16qi)__a - (__v16qi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sub_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v8hi)__a - (__v8hi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sub_epi32(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v4si)__a - (__v4si)__b);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_sub_si64(__m64 __a, __m64 __b)
+{
+  return __a - __b;
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sub_epi64(__m128i __a, __m128i __b)
+{
+  return __a - __b;
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_subs_epi8(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_psubsb128((__v16qi)__a, (__v16qi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_subs_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_psubsw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_subs_epu8(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_psubusb128((__v16qi)__a, (__v16qi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_subs_epu16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_psubusw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_and_si128(__m128i __a, __m128i __b)
+{
+  return __a & __b;
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_andnot_si128(__m128i __a, __m128i __b)
+{
+  return ~__a & __b;
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_or_si128(__m128i __a, __m128i __b)
+{
+  return __a | __b;
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_xor_si128(__m128i __a, __m128i __b)
+{
+  return __a ^ __b;
+}
+
+#define _mm_slli_si128(a, count) __extension__ ({ \
+  _Pragma("clang diagnostic push") _Pragma("clang diagnostic ignored \"-Wshadow\""); \
+  __m128i __a = (a); \
+   _Pragma("clang diagnostic pop"); \
+  (__m128i)__builtin_ia32_pslldqi128(__a, (count)*8); })
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_slli_epi16(__m128i __a, int __count)
+{
+  return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sll_epi16(__m128i __a, __m128i __count)
+{
+  return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_slli_epi32(__m128i __a, int __count)
+{
+  return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sll_epi32(__m128i __a, __m128i __count)
+{
+  return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_slli_epi64(__m128i __a, int __count)
+{
+  return __builtin_ia32_psllqi128(__a, __count);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sll_epi64(__m128i __a, __m128i __count)
+{
+  return __builtin_ia32_psllq128(__a, __count);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_srai_epi16(__m128i __a, int __count)
+{
+  return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sra_epi16(__m128i __a, __m128i __count)
+{
+  return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_srai_epi32(__m128i __a, int __count)
+{
+  return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sra_epi32(__m128i __a, __m128i __count)
+{
+  return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
+}
+
+
+#define _mm_srli_si128(a, count) __extension__ ({ \
+  _Pragma("clang diagnostic push") _Pragma("clang diagnostic ignored \"-Wshadow\""); \
+  __m128i __a = (a); \
+  _Pragma("clang diagnostic pop"); \
+  (__m128i)__builtin_ia32_psrldqi128(__a, (count)*8); })
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_srli_epi16(__m128i __a, int __count)
+{
+  return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_srl_epi16(__m128i __a, __m128i __count)
+{
+  return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_srli_epi32(__m128i __a, int __count)
+{
+  return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_srl_epi32(__m128i __a, __m128i __count)
+{
+  return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_srli_epi64(__m128i __a, int __count)
+{
+  return __builtin_ia32_psrlqi128(__a, __count);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_srl_epi64(__m128i __a, __m128i __count)
+{
+  return __builtin_ia32_psrlq128(__a, __count);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cmpeq_epi8(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v16qi)__a == (__v16qi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cmpeq_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v8hi)__a == (__v8hi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cmpeq_epi32(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v4si)__a == (__v4si)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cmpgt_epi8(__m128i __a, __m128i __b)
+{
+  /* This function always performs a signed comparison, but __v16qi is a char
+     which may be signed or unsigned. */
+  typedef signed char __v16qs __attribute__((__vector_size__(16)));
+  return (__m128i)((__v16qs)__a > (__v16qs)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cmpgt_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v8hi)__a > (__v8hi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cmpgt_epi32(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v4si)__a > (__v4si)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cmplt_epi8(__m128i __a, __m128i __b)
+{
+  return _mm_cmpgt_epi8(__b, __a);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cmplt_epi16(__m128i __a, __m128i __b)
+{
+  return _mm_cmpgt_epi16(__b, __a);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cmplt_epi32(__m128i __a, __m128i __b)
+{
+  return _mm_cmpgt_epi32(__b, __a);
+}
+
+#ifdef __x86_64__
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cvtsi64_sd(__m128d __a, long long __b)
+{
+  __a[0] = __b;
+  return __a;
+}
+
+static __inline__ long long __attribute__((__always_inline__, __nodebug__))
+_mm_cvtsd_si64(__m128d __a)
+{
+  return __builtin_ia32_cvtsd2si64(__a);
+}
+
+static __inline__ long long __attribute__((__always_inline__, __nodebug__))
+_mm_cvttsd_si64(__m128d __a)
+{
+  return __a[0];
+}
+#endif
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtepi32_ps(__m128i __a)
+{
+  return __builtin_ia32_cvtdq2ps((__v4si)__a);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cvtps_epi32(__m128 __a)
+{
+  return (__m128i)__builtin_ia32_cvtps2dq(__a);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cvttps_epi32(__m128 __a)
+{
+  return (__m128i)__builtin_ia32_cvttps2dq(__a);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cvtsi32_si128(int __a)
+{
+  return (__m128i)(__v4si){ __a, 0, 0, 0 };
+}
+
+#ifdef __x86_64__
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cvtsi64_si128(long long __a)
+{
+  return (__m128i){ __a, 0 };
+}
+#endif
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_cvtsi128_si32(__m128i __a)
+{
+  __v4si __b = (__v4si)__a;
+  return __b[0];
+}
+
+#ifdef __x86_64__
+static __inline__ long long __attribute__((__always_inline__, __nodebug__))
+_mm_cvtsi128_si64(__m128i __a)
+{
+  return __a[0];
+}
+#endif
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_load_si128(__m128i const *__p)
+{
+  return *__p;
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_loadu_si128(__m128i const *__p)
+{
+  struct __loadu_si128 {
+    __m128i __v;
+  } __attribute__((packed, may_alias));
+  return ((struct __loadu_si128*)__p)->__v;
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_loadl_epi64(__m128i const *__p)
+{
+  struct __mm_loadl_epi64_struct {
+    long long __u;
+  } __attribute__((__packed__, __may_alias__));
+  return (__m128i) { ((struct __mm_loadl_epi64_struct*)__p)->__u, 0};
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_set_epi64x(long long q1, long long q0)
+{
+  return (__m128i){ q0, q1 };
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_set_epi64(__m64 q1, __m64 q0)
+{
+  return (__m128i){ (long long)q0, (long long)q1 };
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_set_epi32(int i3, int i2, int i1, int i0)
+{
+  return (__m128i)(__v4si){ i0, i1, i2, i3};
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_set_epi16(short w7, short w6, short w5, short w4, short w3, short w2, short w1, short w0)
+{
+  return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0)
+{
+  return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_set1_epi64x(long long __q)
+{
+  return (__m128i){ __q, __q };
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_set1_epi64(__m64 __q)
+{
+  return (__m128i){ (long long)__q, (long long)__q };
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_set1_epi32(int __i)
+{
+  return (__m128i)(__v4si){ __i, __i, __i, __i };
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_set1_epi16(short __w)
+{
+  return (__m128i)(__v8hi){ __w, __w, __w, __w, __w, __w, __w, __w };
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_set1_epi8(char __b)
+{
+  return (__m128i)(__v16qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b };
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_setr_epi64(__m64 q0, __m64 q1)
+{
+  return (__m128i){ (long long)q0, (long long)q1 };
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_setr_epi32(int i0, int i1, int i2, int i3)
+{
+  return (__m128i)(__v4si){ i0, i1, i2, i3};
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7)
+{
+  return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_setr_epi8(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15)
+{
+  return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_setzero_si128(void)
+{
+  return (__m128i){ 0LL, 0LL };
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_store_si128(__m128i *__p, __m128i __b)
+{
+  *__p = __b;
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_storeu_si128(__m128i *__p, __m128i __b)
+{
+  __builtin_ia32_storedqu((char *)__p, (__v16qi)__b);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
+{
+  __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_storel_epi64(__m128i *__p, __m128i __a)
+{
+  struct __mm_storel_epi64_struct {
+    long long __u;
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __mm_storel_epi64_struct*)__p)->__u = __a[0];
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_stream_pd(double *__p, __m128d __a)
+{
+  __builtin_ia32_movntpd(__p, __a);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_stream_si128(__m128i *__p, __m128i __a)
+{
+  __builtin_ia32_movntdq(__p, __a);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_stream_si32(int *__p, int __a)
+{
+  __builtin_ia32_movnti(__p, __a);
+}
+
+#ifdef __x86_64__
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_stream_si64(long long *__p, long long __a)
+{
+  __builtin_ia32_movnti64(__p, __a);
+}
+#endif
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_clflush(void const *__p)
+{
+  __builtin_ia32_clflush(__p);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_lfence(void)
+{
+  __builtin_ia32_lfence();
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_mfence(void)
+{
+  __builtin_ia32_mfence();
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_packs_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_packs_epi32(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_packus_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_extract_epi16(__m128i __a, int __imm)
+{
+  __v8hi __b = (__v8hi)__a;
+  return (unsigned short)__b[__imm & 7];
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_insert_epi16(__m128i __a, int __b, int __imm)
+{
+  __v8hi __c = (__v8hi)__a;
+  __c[__imm & 7] = __b;
+  return (__m128i)__c;
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_movemask_epi8(__m128i __a)
+{
+  return __builtin_ia32_pmovmskb128((__v16qi)__a);
+}
+
+#define _mm_shuffle_epi32(a, imm) __extension__ ({ \
+  _Pragma("clang diagnostic push") _Pragma("clang diagnostic ignored \"-Wshadow\""); \
+  __m128i __a = (a); \
+  _Pragma("clang diagnostic pop"); \
+  (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si) _mm_set1_epi32(0), \
+                                   (imm) & 0x3, ((imm) & 0xc) >> 2, \
+                                   ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6); })
+
+#define _mm_shufflelo_epi16(a, imm) __extension__ ({ \
+  _Pragma("clang diagnostic push") _Pragma("clang diagnostic ignored \"-Wshadow\""); \
+  __m128i __a = (a); \
+  _Pragma("clang diagnostic pop"); \
+  (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi) _mm_set1_epi16(0), \
+                                   (imm) & 0x3, ((imm) & 0xc) >> 2, \
+                                   ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \
+                                   4, 5, 6, 7); })
+
+#define _mm_shufflehi_epi16(a, imm) __extension__ ({ \
+  _Pragma("clang diagnostic push") _Pragma("clang diagnostic ignored \"-Wshadow\""); \
+  __m128i __a = (a); \
+  _Pragma("clang diagnostic pop"); \
+  (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi) _mm_set1_epi16(0), \
+                                   0, 1, 2, 3, \
+                                   4 + (((imm) & 0x03) >> 0), \
+                                   4 + (((imm) & 0x0c) >> 2), \
+                                   4 + (((imm) & 0x30) >> 4), \
+                                   4 + (((imm) & 0xc0) >> 6)); })
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_unpackhi_epi8(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_unpackhi_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_unpackhi_epi32(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_unpackhi_epi64(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_shufflevector(__a, __b, 1, 2+1);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_unpacklo_epi8(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_unpacklo_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_unpacklo_epi32(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_unpacklo_epi64(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_shufflevector(__a, __b, 0, 2+0);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_movepi64_pi64(__m128i __a)
+{
+  return (__m64)__a[0];
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_movpi64_epi64(__m64 __a)
+{
+  return (__m128i){ (long long)__a, 0 };
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_move_epi64(__m128i __a)
+{
+  return __builtin_shufflevector(__a, (__m128i){ 0 }, 0, 2);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_unpackhi_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_shufflevector(__a, __b, 1, 2+1);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_unpacklo_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_shufflevector(__a, __b, 0, 2+0);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_movemask_pd(__m128d __a)
+{
+  return __builtin_ia32_movmskpd(__a);
+}
+
+#define _mm_shuffle_pd(a, b, i) __extension__ ({ \
+  _Pragma("clang diagnostic push") _Pragma("clang diagnostic ignored \"-Wshadow\""); \
+  __m128d __a = (a); \
+  __m128d __b = (b); \
+  _Pragma("clang diagnostic pop"); \
+  __builtin_shufflevector(__a, __b, (i) & 1, (((i) & 2) >> 1) + 2); })
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_castpd_ps(__m128d __a)
+{
+  return (__m128)__a;
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_castpd_si128(__m128d __a)
+{
+  return (__m128i)__a;
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_castps_pd(__m128 __a)
+{
+  return (__m128d)__a;
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_castps_si128(__m128 __a)
+{
+  return (__m128i)__a;
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_castsi128_ps(__m128i __a)
+{
+  return (__m128)__a;
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_castsi128_pd(__m128i __a)
+{
+  return (__m128d)__a;
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_pause(void)
+{
+  __asm__ volatile ("pause");
+}
+
+#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
+
+#endif /* __SSE2__ */
+
+#endif /* __EMMINTRIN_H */
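Usage note (not part of the prebuilt header): a minimal sketch of the SSE2 intrinsics defined above (packed-double arithmetic plus integer shifts), assuming SSE2 is enabled (default on x86-64, clang -msse2 on i386).

#include <emmintrin.h>
#include <stdio.h>

int main(void) {
  __m128d a = _mm_set_pd(2.0, 1.0);       /* lanes: {1.0, 2.0} */
  __m128d b = _mm_set1_pd(3.0);           /* lanes: {3.0, 3.0} */
  double out[2];
  _mm_storeu_pd(out, _mm_add_pd(a, b));   /* {4.0, 5.0} */
  printf("%f %f\n", out[0], out[1]);

  __m128i v = _mm_set_epi32(4, 3, 2, 1);  /* lanes: {1, 2, 3, 4} */
  v = _mm_slli_epi32(v, 1);               /* {2, 4, 6, 8} */
  printf("%d\n", _mm_cvtsi128_si32(v));   /* low lane: 2 */
  return 0;
}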
diff --git a/22.0.1/clang-include/f16cintrin.h b/22.0.1/clang-include/f16cintrin.h
new file mode 100644
index 0000000..f3614c0
--- /dev/null
+++ b/22.0.1/clang-include/f16cintrin.h
@@ -0,0 +1,58 @@
+/*===---- f16cintrin.h - F16C intrinsics -----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
+#error "Never use <f16cintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __F16C__
+# error "F16C instruction is not enabled"
+#endif /* __F16C__ */
+
+#ifndef __F16CINTRIN_H
+#define __F16CINTRIN_H
+
+typedef float __v8sf __attribute__ ((__vector_size__ (32)));
+typedef float __m256 __attribute__ ((__vector_size__ (32)));
+
+#define _mm_cvtps_ph(a, imm) __extension__ ({ \
+  __m128 __a = (a); \
+ (__m128i)__builtin_ia32_vcvtps2ph((__v4sf)__a, (imm)); })
+
+#define _mm256_cvtps_ph(a, imm) __extension__ ({ \
+  __m256 __a = (a); \
+ (__m128i)__builtin_ia32_vcvtps2ph256((__v8sf)__a, (imm)); })
+
+static __inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtph_ps(__m128i __a)
+{
+  return (__m128)__builtin_ia32_vcvtph2ps((__v8hi)__a);
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_cvtph_ps(__m128i __a)
+{
+  return (__m256)__builtin_ia32_vcvtph2ps256((__v8hi)__a);
+}
+
+#endif /* __F16CINTRIN_H */
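
For orientation, a minimal usage sketch of the F16C conversions declared above; it assumes a build with -mf16c (so __F16C__ is defined) and is illustrative only, not part of the committed header.

#include <x86intrin.h>

/* Round four floats through 16-bit half precision and back.
 * imm 0 selects round-to-nearest-even for the narrowing step. */
static void halve_and_restore(const float in[4], float out[4])
{
    __m128  f = _mm_loadu_ps(in);
    __m128i h = _mm_cvtps_ph(f, 0);   /* float -> half */
    __m128  r = _mm_cvtph_ps(h);      /* half  -> float */
    _mm_storeu_ps(out, r);
}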
diff --git a/22.0.1/clang-include/float.h b/22.0.1/clang-include/float.h
new file mode 100644
index 0000000..02ef6bf
--- /dev/null
+++ b/22.0.1/clang-include/float.h
@@ -0,0 +1,124 @@
+/*===---- float.h - Characteristics of floating point types ----------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __FLOAT_H
+#define __FLOAT_H
+
+/* If we're on MinGW, fall back to the system's float.h, which might have
+ * additional definitions provided for Windows.
+ * For more details see http://msdn.microsoft.com/en-us/library/y0ybw9fy.aspx
+ */
+#if (defined(__MINGW32__) || defined(_MSC_VER)) && \
+    __has_include_next(<float.h>)
+#  include_next <float.h>
+
+/* Undefine anything that we'll be redefining below. */
+#  undef FLT_EVAL_METHOD
+#  undef FLT_ROUNDS
+#  undef FLT_RADIX
+#  undef FLT_MANT_DIG
+#  undef DBL_MANT_DIG
+#  undef LDBL_MANT_DIG
+#  undef DECIMAL_DIG
+#  undef FLT_DIG
+#  undef DBL_DIG
+#  undef LDBL_DIG
+#  undef FLT_MIN_EXP
+#  undef DBL_MIN_EXP
+#  undef LDBL_MIN_EXP
+#  undef FLT_MIN_10_EXP
+#  undef DBL_MIN_10_EXP
+#  undef LDBL_MIN_10_EXP
+#  undef FLT_MAX_EXP
+#  undef DBL_MAX_EXP
+#  undef LDBL_MAX_EXP
+#  undef FLT_MAX_10_EXP
+#  undef DBL_MAX_10_EXP
+#  undef LDBL_MAX_10_EXP
+#  undef FLT_MAX
+#  undef DBL_MAX
+#  undef LDBL_MAX
+#  undef FLT_EPSILON
+#  undef DBL_EPSILON
+#  undef LDBL_EPSILON
+#  undef FLT_MIN
+#  undef DBL_MIN
+#  undef LDBL_MIN
+#  if __STDC_VERSION__ >= 201112L || !defined(__STRICT_ANSI__)
+#    undef FLT_TRUE_MIN
+#    undef DBL_TRUE_MIN
+#    undef LDBL_TRUE_MIN
+#  endif
+#endif
+
+/* Characteristics of floating point types, C99 5.2.4.2.2 */
+
+#define FLT_EVAL_METHOD __FLT_EVAL_METHOD__
+#define FLT_ROUNDS (__builtin_flt_rounds())
+#define FLT_RADIX __FLT_RADIX__
+
+#define FLT_MANT_DIG __FLT_MANT_DIG__
+#define DBL_MANT_DIG __DBL_MANT_DIG__
+#define LDBL_MANT_DIG __LDBL_MANT_DIG__
+
+#define DECIMAL_DIG __DECIMAL_DIG__
+
+#define FLT_DIG __FLT_DIG__
+#define DBL_DIG __DBL_DIG__
+#define LDBL_DIG __LDBL_DIG__
+
+#define FLT_MIN_EXP __FLT_MIN_EXP__
+#define DBL_MIN_EXP __DBL_MIN_EXP__
+#define LDBL_MIN_EXP __LDBL_MIN_EXP__
+
+#define FLT_MIN_10_EXP __FLT_MIN_10_EXP__
+#define DBL_MIN_10_EXP __DBL_MIN_10_EXP__
+#define LDBL_MIN_10_EXP __LDBL_MIN_10_EXP__
+
+#define FLT_MAX_EXP __FLT_MAX_EXP__
+#define DBL_MAX_EXP __DBL_MAX_EXP__
+#define LDBL_MAX_EXP __LDBL_MAX_EXP__
+
+#define FLT_MAX_10_EXP __FLT_MAX_10_EXP__
+#define DBL_MAX_10_EXP __DBL_MAX_10_EXP__
+#define LDBL_MAX_10_EXP __LDBL_MAX_10_EXP__
+
+#define FLT_MAX __FLT_MAX__
+#define DBL_MAX __DBL_MAX__
+#define LDBL_MAX __LDBL_MAX__
+
+#define FLT_EPSILON __FLT_EPSILON__
+#define DBL_EPSILON __DBL_EPSILON__
+#define LDBL_EPSILON __LDBL_EPSILON__
+
+#define FLT_MIN __FLT_MIN__
+#define DBL_MIN __DBL_MIN__
+#define LDBL_MIN __LDBL_MIN__
+
+#if __STDC_VERSION__ >= 201112L || !defined(__STRICT_ANSI__)
+#  define FLT_TRUE_MIN __FLT_DENORM_MIN__
+#  define DBL_TRUE_MIN __DBL_DENORM_MIN__
+#  define LDBL_TRUE_MIN __LDBL_DENORM_MIN__
+#endif
+
+#endif /* __FLOAT_H */
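
A small, hedged example of how the constants above are typically consumed; it relies only on FLT_EPSILON as defined in this float.h, plus standard fabsf/fmaxf from math.h.

#include <float.h>
#include <math.h>

/* Compare two floats with a relative tolerance scaled from FLT_EPSILON. */
static int nearly_equal(float a, float b)
{
    float diff  = fabsf(a - b);
    float scale = fmaxf(fabsf(a), fabsf(b));
    return diff <= 4.0f * FLT_EPSILON * scale;
}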
diff --git a/22.0.1/clang-include/fma4intrin.h b/22.0.1/clang-include/fma4intrin.h
new file mode 100644
index 0000000..c30920d
--- /dev/null
+++ b/22.0.1/clang-include/fma4intrin.h
@@ -0,0 +1,231 @@
+/*===---- fma4intrin.h - FMA4 intrinsics -----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __X86INTRIN_H
+#error "Never use <fma4intrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __FMA4INTRIN_H
+#define __FMA4INTRIN_H
+
+#ifndef __FMA4__
+# error "FMA4 instruction set is not enabled"
+#else
+
+#include <pmmintrin.h>
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_macc_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddps(__A, __B, __C);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_macc_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddpd(__A, __B, __C);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_macc_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddss(__A, __B, __C);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_macc_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddsd(__A, __B, __C);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_msub_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmsubps(__A, __B, __C);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_msub_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmsubpd(__A, __B, __C);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_msub_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmsubss(__A, __B, __C);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_msub_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmsubsd(__A, __B, __C);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_nmacc_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfnmaddps(__A, __B, __C);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_nmacc_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfnmaddpd(__A, __B, __C);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_nmacc_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfnmaddss(__A, __B, __C);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_nmacc_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfnmaddsd(__A, __B, __C);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_nmsub_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfnmsubps(__A, __B, __C);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_nmsub_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfnmsubpd(__A, __B, __C);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_nmsub_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfnmsubss(__A, __B, __C);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_nmsub_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfnmsubsd(__A, __B, __C);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_maddsub_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddsubps(__A, __B, __C);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_maddsub_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddsubpd(__A, __B, __C);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_msubadd_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmsubaddps(__A, __B, __C);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_msubadd_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmsubaddpd(__A, __B, __C);
+}
+
+static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_macc_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmaddps256(__A, __B, __C);
+}
+
+static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_macc_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmaddpd256(__A, __B, __C);
+}
+
+static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_msub_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmsubps256(__A, __B, __C);
+}
+
+static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_msub_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmsubpd256(__A, __B, __C);
+}
+
+static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_nmacc_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfnmaddps256(__A, __B, __C);
+}
+
+static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_nmacc_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfnmaddpd256(__A, __B, __C);
+}
+
+static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_nmsub_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfnmsubps256(__A, __B, __C);
+}
+
+static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_nmsub_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfnmsubpd256(__A, __B, __C);
+}
+
+static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_maddsub_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmaddsubps256(__A, __B, __C);
+}
+
+static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_maddsub_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmaddsubpd256(__A, __B, __C);
+}
+
+static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_msubadd_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmsubaddps256(__A, __B, __C);
+}
+
+static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_msubadd_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmsubaddpd256(__A, __B, __C);
+}
+
+#endif /* __FMA4__ */
+
+#endif /* __FMA4INTRIN_H */
diff --git a/22.0.1/clang-include/fmaintrin.h b/22.0.1/clang-include/fmaintrin.h
new file mode 100644
index 0000000..6bfd5a8
--- /dev/null
+++ b/22.0.1/clang-include/fmaintrin.h
@@ -0,0 +1,229 @@
+/*===---- fmaintrin.h - FMA intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <fmaintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __FMAINTRIN_H
+#define __FMAINTRIN_H
+
+#ifndef __FMA__
+# error "FMA instruction set is not enabled"
+#else
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddps(__A, __B, __C);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddpd(__A, __B, __C);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddss(__A, __B, __C);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddsd(__A, __B, __C);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmsubps(__A, __B, __C);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmsubpd(__A, __B, __C);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmsubss(__A, __B, __C);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmsubsd(__A, __B, __C);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfnmaddps(__A, __B, __C);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfnmaddpd(__A, __B, __C);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfnmaddss(__A, __B, __C);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfnmaddsd(__A, __B, __C);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfnmsubps(__A, __B, __C);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfnmsubpd(__A, __B, __C);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfnmsubss(__A, __B, __C);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfnmsubsd(__A, __B, __C);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddsubps(__A, __B, __C);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddsubpd(__A, __B, __C);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmsubaddps(__A, __B, __C);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmsubaddpd(__A, __B, __C);
+}
+
+static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmaddps256(__A, __B, __C);
+}
+
+static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmaddpd256(__A, __B, __C);
+}
+
+static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmsubps256(__A, __B, __C);
+}
+
+static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmsubpd256(__A, __B, __C);
+}
+
+static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfnmaddps256(__A, __B, __C);
+}
+
+static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfnmaddpd256(__A, __B, __C);
+}
+
+static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfnmsubps256(__A, __B, __C);
+}
+
+static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfnmsubpd256(__A, __B, __C);
+}
+
+static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmaddsubps256(__A, __B, __C);
+}
+
+static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmaddsubpd256(__A, __B, __C);
+}
+
+static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmsubaddps256(__A, __B, __C);
+}
+
+static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmsubaddpd256(__A, __B, __C);
+}
+
+#endif /* __FMA__ */
+
+#endif /* __FMAINTRIN_H */
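
As a quick illustration of the FMA wrappers above, a sketch assuming the translation unit is built with -mfma so __FMA__ is defined.

#include <immintrin.h>

/* Fused multiply-add on four packed floats: a*b + c with a single rounding. */
static __m128 fused_axpy(__m128 a, __m128 b, __m128 c)
{
    return _mm_fmadd_ps(a, b, c);
}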
diff --git a/22.0.1/clang-include/ia32intrin.h b/22.0.1/clang-include/ia32intrin.h
new file mode 100644
index 0000000..5adf3f1
--- /dev/null
+++ b/22.0.1/clang-include/ia32intrin.h
@@ -0,0 +1,101 @@
+/* ===-------- ia32intrin.h ---------------------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __X86INTRIN_H
+#error "Never use <ia32intrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __IA32INTRIN_H
+#define __IA32INTRIN_H
+
+#ifdef __x86_64__
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+__readeflags(void)
+{
+  unsigned long long __res = 0;
+  __asm__ __volatile__ ("pushf\n\t"
+                        "popq %0\n"
+                        :"=r"(__res)
+                        :
+                        :
+                       );
+  return __res;
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+__writeeflags(unsigned long long __f)
+{
+  __asm__ __volatile__ ("pushq %0\n\t"
+                        "popf\n"
+                        :
+                        :"r"(__f)
+                        :"flags"
+                       );
+}
+
+#else /* !__x86_64__ */
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+__readeflags(void)
+{
+  unsigned int __res = 0;
+  __asm__ __volatile__ ("pushf\n\t"
+                        "popl %0\n"
+                        :"=r"(__res)
+                        :
+                        :
+                       );
+  return __res;
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+__writeeflags(unsigned int __f)
+{
+  __asm__ __volatile__ ("pushl %0\n\t"
+                        "popf\n"
+                        :
+                        :"r"(__f)
+                        :"flags"
+                       );
+}
+#endif /* !__x86_64__ */
+
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+__rdpmc(int __A) {
+  return __builtin_ia32_rdpmc(__A);
+}
+
+/* __rdtsc */
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+__rdtsc(void) {
+  return __builtin_ia32_rdtsc();
+}
+
+/* __rdtscp */
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+__rdtscp(unsigned int *__A) {
+  return __builtin_ia32_rdtscp(__A);
+}
+
+#define _rdtsc() __rdtsc()
+
+#endif /* __IA32INTRIN_H */
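
For context, a minimal sketch showing how the __rdtsc/__rdtscp helpers above are commonly used for a coarse cycle measurement; the callback is a placeholder and the result is only meaningful as a relative number.

#include <x86intrin.h>

/* Cycle-count a region with the time-stamp counter. __rdtscp waits for
 * prior instructions to complete before sampling, so the end read is less
 * prone to reordering than a plain __rdtsc. */
static unsigned long long cycles_for(void (*fn)(void))
{
    unsigned int aux;
    unsigned long long start = __rdtsc();
    fn();
    unsigned long long end = __rdtscp(&aux);
    return end - start;
}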
diff --git a/22.0.1/clang-include/immintrin.h b/22.0.1/clang-include/immintrin.h
new file mode 100644
index 0000000..df4bea8
--- /dev/null
+++ b/22.0.1/clang-include/immintrin.h
@@ -0,0 +1,118 @@
+/*===---- immintrin.h - Intel intrinsics -----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#define __IMMINTRIN_H
+
+#ifdef __MMX__
+#include <mmintrin.h>
+#endif
+
+#ifdef __SSE__
+#include <xmmintrin.h>
+#endif
+
+#ifdef __SSE2__
+#include <emmintrin.h>
+#endif
+
+#ifdef __SSE3__
+#include <pmmintrin.h>
+#endif
+
+#ifdef __SSSE3__
+#include <tmmintrin.h>
+#endif
+
+#if defined (__SSE4_2__) || defined (__SSE4_1__)
+#include <smmintrin.h>
+#endif
+
+#if defined (__AES__) || defined (__PCLMUL__)
+#include <wmmintrin.h>
+#endif
+
+#ifdef __AVX__
+#include <avxintrin.h>
+#endif
+
+#ifdef __AVX2__
+#include <avx2intrin.h>
+#endif
+
+#ifdef __BMI__
+#include <bmiintrin.h>
+#endif
+
+#ifdef __BMI2__
+#include <bmi2intrin.h>
+#endif
+
+#ifdef __LZCNT__
+#include <lzcntintrin.h>
+#endif
+
+#ifdef __FMA__
+#include <fmaintrin.h>
+#endif
+
+#ifdef __RDRND__
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_rdrand16_step(unsigned short *__p)
+{
+  return __builtin_ia32_rdrand16_step(__p);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_rdrand32_step(unsigned int *__p)
+{
+  return __builtin_ia32_rdrand32_step(__p);
+}
+
+#ifdef __x86_64__
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_rdrand64_step(unsigned long long *__p)
+{
+  return __builtin_ia32_rdrand64_step(__p);
+}
+#endif
+#endif /* __RDRND__ */
+
+#ifdef __RTM__
+#include <rtmintrin.h>
+#endif
+
+/* FIXME: check __HLE__ as well when HLE is supported. */
+#if defined (__RTM__)
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_xtest(void)
+{
+  return __builtin_ia32_xtest();
+}
+#endif
+
+#ifdef __SHA__
+#include <shaintrin.h>
+#endif
+
+#endif /* __IMMINTRIN_H */
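
A short usage sketch for the _rdrand32_step wrapper declared above; it assumes a build with -mrdrnd and retries because the instruction can transiently fail (return 0).

#include <immintrin.h>

/* Fetch one hardware random 32-bit value, retrying on transient failure.
 * Returns 1 on success (writing *out), 0 if RDRAND kept failing. */
static int hw_random_u32(unsigned int *out)
{
    for (int i = 0; i < 10; ++i)
        if (_rdrand32_step(out))
            return 1;
    return 0;
}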
diff --git a/22.0.1/clang-include/iso646.h b/22.0.1/clang-include/iso646.h
new file mode 100644
index 0000000..dca13c5
--- /dev/null
+++ b/22.0.1/clang-include/iso646.h
@@ -0,0 +1,43 @@
+/*===---- iso646.h - Standard header for alternate spellings of operators---===
+ *
+ * Copyright (c) 2008 Eli Friedman
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __ISO646_H
+#define __ISO646_H
+
+#ifndef __cplusplus
+#define and    &&
+#define and_eq &=
+#define bitand &
+#define bitor  |
+#define compl  ~
+#define not    !
+#define not_eq !=
+#define or     ||
+#define or_eq  |=
+#define xor    ^
+#define xor_eq ^=
+#endif
+
+#endif /* __ISO646_H */
diff --git a/22.0.1/clang-include/limits.h b/22.0.1/clang-include/limits.h
new file mode 100644
index 0000000..f04187c
--- /dev/null
+++ b/22.0.1/clang-include/limits.h
@@ -0,0 +1,118 @@
+/*===---- limits.h - Standard header for integer sizes --------------------===*\
+ *
+ * Copyright (c) 2009 Chris Lattner
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+\*===----------------------------------------------------------------------===*/
+
+#ifndef __CLANG_LIMITS_H
+#define __CLANG_LIMITS_H
+
+/* The system's limits.h may, in turn, try to #include_next GCC's limits.h.
+   Avert this #include_next madness. */
+#if defined __GNUC__ && !defined _GCC_LIMITS_H_
+#define _GCC_LIMITS_H_
+#endif
+
+/* System headers include a number of constants from POSIX in <limits.h>.
+   Include it if we're hosted. */
+#if __STDC_HOSTED__ && __has_include_next(<limits.h>)
+#include_next <limits.h>
+#endif
+
+/* Many system headers try to "help us out" by defining these.  No really, we
+   know how big each datatype is. */
+#undef  SCHAR_MIN
+#undef  SCHAR_MAX
+#undef  UCHAR_MAX
+#undef  SHRT_MIN
+#undef  SHRT_MAX
+#undef  USHRT_MAX
+#undef  INT_MIN
+#undef  INT_MAX
+#undef  UINT_MAX
+#undef  LONG_MIN
+#undef  LONG_MAX
+#undef  ULONG_MAX
+
+#undef  CHAR_BIT
+#undef  CHAR_MIN
+#undef  CHAR_MAX
+
+/* C90/99 5.2.4.2.1 */
+#define SCHAR_MAX __SCHAR_MAX__
+#define SHRT_MAX  __SHRT_MAX__
+#define INT_MAX   __INT_MAX__
+#define LONG_MAX  __LONG_MAX__
+
+#define SCHAR_MIN (-__SCHAR_MAX__-1)
+#define SHRT_MIN  (-__SHRT_MAX__ -1)
+#define INT_MIN   (-__INT_MAX__  -1)
+#define LONG_MIN  (-__LONG_MAX__ -1L)
+
+#define UCHAR_MAX (__SCHAR_MAX__*2  +1)
+#define USHRT_MAX (__SHRT_MAX__ *2  +1)
+#define UINT_MAX  (__INT_MAX__  *2U +1U)
+#define ULONG_MAX (__LONG_MAX__ *2UL+1UL)
+
+#ifndef MB_LEN_MAX
+#define MB_LEN_MAX 1
+#endif
+
+#define CHAR_BIT  __CHAR_BIT__
+
+#ifdef __CHAR_UNSIGNED__  /* -funsigned-char */
+#define CHAR_MIN 0
+#define CHAR_MAX UCHAR_MAX
+#else
+#define CHAR_MIN SCHAR_MIN
+#define CHAR_MAX __SCHAR_MAX__
+#endif
+
+/* C99 5.2.4.2.1: Added long long.
+   C++11 18.3.3.2: same contents as the Standard C Library header <limits.h>.
+ */
+#if __STDC_VERSION__ >= 199901L || __cplusplus >= 201103L
+
+#undef  LLONG_MIN
+#undef  LLONG_MAX
+#undef  ULLONG_MAX
+
+#define LLONG_MAX  __LONG_LONG_MAX__
+#define LLONG_MIN  (-__LONG_LONG_MAX__-1LL)
+#define ULLONG_MAX (__LONG_LONG_MAX__*2ULL+1ULL)
+#endif
+
+/* LONG_LONG_MIN/LONG_LONG_MAX/ULONG_LONG_MAX are a GNU extension.  It's too bad
+   that we don't have something like #pragma poison that could be used to
+   deprecate a macro - the code should just use LLONG_MAX and friends.
+ */
+#if defined(__GNU_LIBRARY__) ? defined(__USE_GNU) : !defined(__STRICT_ANSI__)
+
+#undef   LONG_LONG_MIN
+#undef   LONG_LONG_MAX
+#undef   ULONG_LONG_MAX
+
+#define LONG_LONG_MAX  __LONG_LONG_MAX__
+#define LONG_LONG_MIN  (-__LONG_LONG_MAX__-1LL)
+#define ULONG_LONG_MAX (__LONG_LONG_MAX__*2ULL+1ULL)
+#endif
+
+#endif /* __CLANG_LIMITS_H */
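
A brief sketch of the usual consumer of these macros: guarding signed arithmetic against overflow using INT_MAX/INT_MIN from the header above.

#include <limits.h>

/* Add two ints, refusing the operation instead of overflowing.
 * Returns 1 and writes *sum on success, 0 if the add would overflow. */
static int checked_add(int a, int b, int *sum)
{
    if ((b > 0 && a > INT_MAX - b) || (b < 0 && a < INT_MIN - b))
        return 0;
    *sum = a + b;
    return 1;
}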
diff --git a/22.0.1/clang-include/lzcntintrin.h b/22.0.1/clang-include/lzcntintrin.h
new file mode 100644
index 0000000..62ab5ca
--- /dev/null
+++ b/22.0.1/clang-include/lzcntintrin.h
@@ -0,0 +1,55 @@
+/*===---- lzcntintrin.h - LZCNT intrinsics ---------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
+#error "Never use <lzcntintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __LZCNT__
+# error "LZCNT instruction is not enabled"
+#endif /* __LZCNT__ */
+
+#ifndef __LZCNTINTRIN_H
+#define __LZCNTINTRIN_H
+
+static __inline__ unsigned short __attribute__((__always_inline__, __nodebug__))
+__lzcnt16(unsigned short __X)
+{
+  return __builtin_clzs(__X);
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+__lzcnt32(unsigned int __X)
+{
+  return __builtin_clz(__X);
+}
+
+#ifdef __x86_64__
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+__lzcnt64(unsigned long long __X)
+{
+  return __builtin_clzll(__X);
+}
+#endif
+
+#endif /* __LZCNTINTRIN_H */
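
An illustrative wrapper around __lzcnt32 from the header above; it assumes -mlzcnt, and it guards the zero case because this header maps __lzcnt32 to __builtin_clz, which is undefined for an input of 0.

#include <x86intrin.h>

/* Count leading zero bits of a 32-bit value, defining the zero case as 32. */
static unsigned int leading_zeros(unsigned int x)
{
    return x ? __lzcnt32(x) : 32u;
}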
diff --git a/22.0.1/clang-include/mm3dnow.h b/22.0.1/clang-include/mm3dnow.h
new file mode 100644
index 0000000..5242d99
--- /dev/null
+++ b/22.0.1/clang-include/mm3dnow.h
@@ -0,0 +1,162 @@
+/*===---- mm3dnow.h - 3DNow! intrinsics ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef _MM3DNOW_H_INCLUDED
+#define _MM3DNOW_H_INCLUDED
+
+#include <mmintrin.h>
+#include <prfchwintrin.h>
+
+typedef float __v2sf __attribute__((__vector_size__(8)));
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_m_femms() {
+  __builtin_ia32_femms();
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pavgusb(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pavgusb((__v8qi)__m1, (__v8qi)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pf2id(__m64 __m) {
+  return (__m64)__builtin_ia32_pf2id((__v2sf)__m);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pfacc(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfacc((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pfadd(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfadd((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pfcmpeq(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfcmpeq((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pfcmpge(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfcmpge((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pfcmpgt(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfcmpgt((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pfmax(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfmax((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pfmin(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfmin((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pfmul(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfmul((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pfrcp(__m64 __m) {
+  return (__m64)__builtin_ia32_pfrcp((__v2sf)__m);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pfrcpit1(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfrcpit1((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pfrcpit2(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfrcpit2((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pfrsqrt(__m64 __m) {
+  return (__m64)__builtin_ia32_pfrsqrt((__v2sf)__m);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pfrsqrtit1(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfrsqit1((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pfsub(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfsub((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pfsubr(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfsubr((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pi2fd(__m64 __m) {
+  return (__m64)__builtin_ia32_pi2fd((__v2si)__m);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pmulhrw(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pmulhrw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pf2iw(__m64 __m) {
+  return (__m64)__builtin_ia32_pf2iw((__v2sf)__m);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pfnacc(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfnacc((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pfpnacc(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfpnacc((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pi2fw(__m64 __m) {
+  return (__m64)__builtin_ia32_pi2fw((__v2si)__m);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pswapdsf(__m64 __m) {
+  return (__m64)__builtin_ia32_pswapdsf((__v2sf)__m);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pswapdsi(__m64 __m) {
+  return (__m64)__builtin_ia32_pswapdsi((__v2si)__m);
+}
+
+#endif
diff --git a/22.0.1/clang-include/mm_malloc.h b/22.0.1/clang-include/mm_malloc.h
new file mode 100644
index 0000000..305afd3
--- /dev/null
+++ b/22.0.1/clang-include/mm_malloc.h
@@ -0,0 +1,75 @@
+/*===---- mm_malloc.h - Allocating and Freeing Aligned Memory Blocks -------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __MM_MALLOC_H
+#define __MM_MALLOC_H
+
+#include <stdlib.h>
+
+#ifdef _WIN32
+#include <malloc.h>
+#else
+#ifndef __cplusplus
+extern int posix_memalign(void **__memptr, size_t __alignment, size_t __size);
+#else
+// Some systems (e.g. those with GNU libc) declare posix_memalign with an
+// exception specifier. Via an "egregious workaround" in
+// Sema::CheckEquivalentExceptionSpec, Clang accepts the following as a valid
+// redeclaration of glibc's declaration.
+extern "C" int posix_memalign(void **__memptr, size_t __alignment, size_t __size);
+#endif
+#endif
+
+#if !(defined(_WIN32) && defined(_mm_malloc))
+static __inline__ void *__attribute__((__always_inline__, __nodebug__,
+                                       __malloc__))
+_mm_malloc(size_t __size, size_t __align)
+{
+  if (__align == 1) {
+    return malloc(__size);
+  }
+
+  if (!(__align & (__align - 1)) && __align < sizeof(void *))
+    __align = sizeof(void *);
+
+  void *__mallocedMemory;
+#if defined(__MINGW32__)
+  __mallocedMemory = __mingw_aligned_malloc(__size, __align);
+#elif defined(_WIN32)
+  __mallocedMemory = _aligned_malloc(__size, __align);
+#else
+  if (posix_memalign(&__mallocedMemory, __align, __size))
+    return 0;
+#endif
+
+  return __mallocedMemory;
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_free(void *__p)
+{
+  free(__p);
+}
+#endif
+
+#endif /* __MM_MALLOC_H */
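
Finally, a hedged sketch of the intended _mm_malloc/_mm_free pairing from the header above, here requesting 32-byte alignment as would suit AVX loads; the size and alignment are illustrative.

#include <mm_malloc.h>
#include <string.h>

/* Allocate a 32-byte aligned buffer, use it, and release it with the
 * matching _mm_free (never plain free for memory from _mm_malloc). */
static void aligned_buffer_demo(void)
{
    float *buf = (float *)_mm_malloc(1024 * sizeof(float), 32);
    if (!buf)
        return;
    memset(buf, 0, 1024 * sizeof(float));
    _mm_free(buf);
}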
diff --git a/22.0.1/clang-include/mmintrin.h b/22.0.1/clang-include/mmintrin.h
new file mode 100644
index 0000000..986870a
--- /dev/null
+++ b/22.0.1/clang-include/mmintrin.h
@@ -0,0 +1,503 @@
+/*===---- mmintrin.h - MMX intrinsics --------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __MMINTRIN_H
+#define __MMINTRIN_H
+
+#ifndef __MMX__
+#error "MMX instruction set not enabled"
+#else
+
+typedef long long __m64 __attribute__((__vector_size__(8)));
+
+typedef int __v2si __attribute__((__vector_size__(8)));
+typedef short __v4hi __attribute__((__vector_size__(8)));
+typedef char __v8qi __attribute__((__vector_size__(8)));
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_empty(void)
+{
+    __builtin_ia32_emms();
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtsi32_si64(int __i)
+{
+    return (__m64)__builtin_ia32_vec_init_v2si(__i, 0);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_cvtsi64_si32(__m64 __m)
+{
+    return __builtin_ia32_vec_ext_v2si((__v2si)__m, 0);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtsi64_m64(long long __i)
+{
+    return (__m64)__i;
+}
+
+static __inline__ long long __attribute__((__always_inline__, __nodebug__))
+_mm_cvtm64_si64(__m64 __m)
+{
+    return (long long)__m;
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_packs_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_packsswb((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_packs_pi32(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_packssdw((__v2si)__m1, (__v2si)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_packs_pu16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_packuswb((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_unpackhi_pi8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_punpckhbw((__v8qi)__m1, (__v8qi)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_punpckhwd((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_unpackhi_pi32(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_punpckhdq((__v2si)__m1, (__v2si)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_punpcklbw((__v8qi)__m1, (__v8qi)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_punpcklwd((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_unpacklo_pi32(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_punpckldq((__v2si)__m1, (__v2si)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_add_pi8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_paddb((__v8qi)__m1, (__v8qi)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_add_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_paddw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_add_pi32(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_paddd((__v2si)__m1, (__v2si)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_adds_pi8(__m64 __m1, __m64 __m2) 
+{
+    return (__m64)__builtin_ia32_paddsb((__v8qi)__m1, (__v8qi)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_adds_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_paddsw((__v4hi)__m1, (__v4hi)__m2);    
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_adds_pu8(__m64 __m1, __m64 __m2) 
+{
+    return (__m64)__builtin_ia32_paddusb((__v8qi)__m1, (__v8qi)__m2);
+}
+ 
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_adds_pu16(__m64 __m1, __m64 __m2) 
+{
+    return (__m64)__builtin_ia32_paddusw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_sub_pi8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_psubb((__v8qi)__m1, (__v8qi)__m2);
+}
+ 
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_sub_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_psubw((__v4hi)__m1, (__v4hi)__m2);
+}
+ 
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_sub_pi32(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_psubd((__v2si)__m1, (__v2si)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_subs_pi8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_psubsb((__v8qi)__m1, (__v8qi)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_subs_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_psubsw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_subs_pu8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_psubusb((__v8qi)__m1, (__v8qi)__m2);
+}
+ 
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_subs_pu16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_psubusw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_madd_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_pmaddwd((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_mulhi_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_pmulhw((__v4hi)__m1, (__v4hi)__m2);
+}
+ 
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_mullo_pi16(__m64 __m1, __m64 __m2) 
+{
+    return (__m64)__builtin_ia32_pmullw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_sll_pi16(__m64 __m, __m64 __count)
+{
+    return (__m64)__builtin_ia32_psllw((__v4hi)__m, __count);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_slli_pi16(__m64 __m, int __count)
+{
+    return (__m64)__builtin_ia32_psllwi((__v4hi)__m, __count);    
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_sll_pi32(__m64 __m, __m64 __count)
+{
+    return (__m64)__builtin_ia32_pslld((__v2si)__m, __count);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_slli_pi32(__m64 __m, int __count)
+{
+    return (__m64)__builtin_ia32_pslldi((__v2si)__m, __count);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_sll_si64(__m64 __m, __m64 __count)
+{
+    return (__m64)__builtin_ia32_psllq(__m, __count);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_slli_si64(__m64 __m, int __count)
+{
+    return (__m64)__builtin_ia32_psllqi(__m, __count);    
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_sra_pi16(__m64 __m, __m64 __count)
+{
+    return (__m64)__builtin_ia32_psraw((__v4hi)__m, __count);    
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_srai_pi16(__m64 __m, int __count)
+{
+    return (__m64)__builtin_ia32_psrawi((__v4hi)__m, __count);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_sra_pi32(__m64 __m, __m64 __count)
+{
+    return (__m64)__builtin_ia32_psrad((__v2si)__m, __count);    
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_srai_pi32(__m64 __m, int __count)
+{
+    return (__m64)__builtin_ia32_psradi((__v2si)__m, __count);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_srl_pi16(__m64 __m, __m64 __count)
+{
+    return (__m64)__builtin_ia32_psrlw((__v4hi)__m, __count);    
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_srli_pi16(__m64 __m, int __count)
+{
+    return (__m64)__builtin_ia32_psrlwi((__v4hi)__m, __count);    
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_srl_pi32(__m64 __m, __m64 __count)
+{
+    return (__m64)__builtin_ia32_psrld((__v2si)__m, __count);       
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_srli_pi32(__m64 __m, int __count)
+{
+    return (__m64)__builtin_ia32_psrldi((__v2si)__m, __count);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_srl_si64(__m64 __m, __m64 __count)
+{
+    return (__m64)__builtin_ia32_psrlq(__m, __count);    
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_srli_si64(__m64 __m, int __count)
+{
+    return (__m64)__builtin_ia32_psrlqi(__m, __count);    
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_and_si64(__m64 __m1, __m64 __m2)
+{
+    return __builtin_ia32_pand(__m1, __m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_andnot_si64(__m64 __m1, __m64 __m2)
+{
+    return __builtin_ia32_pandn(__m1, __m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_or_si64(__m64 __m1, __m64 __m2)
+{
+    return __builtin_ia32_por(__m1, __m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_xor_si64(__m64 __m1, __m64 __m2)
+{
+    return __builtin_ia32_pxor(__m1, __m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_pcmpeqb((__v8qi)__m1, (__v8qi)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_pcmpeqw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_pcmpeqd((__v2si)__m1, (__v2si)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_pcmpgtb((__v8qi)__m1, (__v8qi)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_pcmpgtw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_pcmpgtd((__v2si)__m1, (__v2si)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_setzero_si64(void)
+{
+    return (__m64){ 0LL };
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_set_pi32(int __i1, int __i0)
+{
+    return (__m64)__builtin_ia32_vec_init_v2si(__i0, __i1);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_set_pi16(short __s3, short __s2, short __s1, short __s0)
+{
+    return (__m64)__builtin_ia32_vec_init_v4hi(__s0, __s1, __s2, __s3);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2,
+            char __b1, char __b0)
+{
+    return (__m64)__builtin_ia32_vec_init_v8qi(__b0, __b1, __b2, __b3,
+                                               __b4, __b5, __b6, __b7);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_set1_pi32(int __i)
+{
+    return _mm_set_pi32(__i, __i);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_set1_pi16(short __w)
+{
+    return _mm_set_pi16(__w, __w, __w, __w);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_set1_pi8(char __b)
+{
+    return _mm_set_pi8(__b, __b, __b, __b, __b, __b, __b, __b);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_setr_pi32(int __i0, int __i1)
+{
+    return _mm_set_pi32(__i1, __i0);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_setr_pi16(short __w0, short __w1, short __w2, short __w3)
+{
+    return _mm_set_pi16(__w3, __w2, __w1, __w0);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5,
+             char __b6, char __b7)
+{
+    return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
+}
+
+
+/* Aliases for compatibility. */
+#define _m_empty _mm_empty
+#define _m_from_int _mm_cvtsi32_si64
+#define _m_to_int _mm_cvtsi64_si32
+#define _m_packsswb _mm_packs_pi16
+#define _m_packssdw _mm_packs_pi32
+#define _m_packuswb _mm_packs_pu16
+#define _m_punpckhbw _mm_unpackhi_pi8
+#define _m_punpckhwd _mm_unpackhi_pi16
+#define _m_punpckhdq _mm_unpackhi_pi32
+#define _m_punpcklbw _mm_unpacklo_pi8
+#define _m_punpcklwd _mm_unpacklo_pi16
+#define _m_punpckldq _mm_unpacklo_pi32
+#define _m_paddb _mm_add_pi8
+#define _m_paddw _mm_add_pi16
+#define _m_paddd _mm_add_pi32
+#define _m_paddsb _mm_adds_pi8
+#define _m_paddsw _mm_adds_pi16
+#define _m_paddusb _mm_adds_pu8
+#define _m_paddusw _mm_adds_pu16
+#define _m_psubb _mm_sub_pi8
+#define _m_psubw _mm_sub_pi16
+#define _m_psubd _mm_sub_pi32
+#define _m_psubsb _mm_subs_pi8
+#define _m_psubsw _mm_subs_pi16
+#define _m_psubusb _mm_subs_pu8
+#define _m_psubusw _mm_subs_pu16
+#define _m_pmaddwd _mm_madd_pi16
+#define _m_pmulhw _mm_mulhi_pi16
+#define _m_pmullw _mm_mullo_pi16
+#define _m_psllw _mm_sll_pi16
+#define _m_psllwi _mm_slli_pi16
+#define _m_pslld _mm_sll_pi32
+#define _m_pslldi _mm_slli_pi32
+#define _m_psllq _mm_sll_si64
+#define _m_psllqi _mm_slli_si64
+#define _m_psraw _mm_sra_pi16
+#define _m_psrawi _mm_srai_pi16
+#define _m_psrad _mm_sra_pi32
+#define _m_psradi _mm_srai_pi32
+#define _m_psrlw _mm_srl_pi16
+#define _m_psrlwi _mm_srli_pi16
+#define _m_psrld _mm_srl_pi32
+#define _m_psrldi _mm_srli_pi32
+#define _m_psrlq _mm_srl_si64
+#define _m_psrlqi _mm_srli_si64
+#define _m_pand _mm_and_si64
+#define _m_pandn _mm_andnot_si64
+#define _m_por _mm_or_si64
+#define _m_pxor _mm_xor_si64
+#define _m_pcmpeqb _mm_cmpeq_pi8
+#define _m_pcmpeqw _mm_cmpeq_pi16
+#define _m_pcmpeqd _mm_cmpeq_pi32
+#define _m_pcmpgtb _mm_cmpgt_pi8
+#define _m_pcmpgtw _mm_cmpgt_pi16
+#define _m_pcmpgtd _mm_cmpgt_pi32
+
+#endif /* __MMX__ */
+
+#endif /* __MMINTRIN_H */
+
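Note (illustrative, not part of the patch): a minimal sketch of how the MMX helpers defined in this header are typically exercised. The function body and values are examples only, and it assumes an x86 build with -mmmx so that __MMX__ is defined.

    #include <mmintrin.h>

    int main(void) {
        __m64 a  = _mm_set_pi16(4, 3, 2, 1);   /* four packed 16-bit lanes       */
        __m64 b  = _mm_set1_pi16(3);           /* broadcast 3 to all four lanes  */
        __m64 eq = _mm_cmpeq_pi16(a, b);       /* 0xFFFF where a lane equals 3   */
        __m64 m  = _mm_and_si64(a, eq);        /* keep only the matching lanes   */
        (void)m;
        _mm_empty();                           /* clear MMX state before x87 use */
        return 0;
    }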
diff --git a/22.0.1/clang-include/module.modulemap b/22.0.1/clang-include/module.modulemap
new file mode 100644
index 0000000..9f7944d
--- /dev/null
+++ b/22.0.1/clang-include/module.modulemap
@@ -0,0 +1,156 @@
+module _Builtin_intrinsics [system] {
+  explicit module altivec {
+    requires altivec
+    header "altivec.h"
+  }
+
+  explicit module arm {
+    requires arm
+
+    explicit module neon {
+      requires neon
+      header "arm_neon.h"
+      export *
+    }
+  }
+
+  explicit module intel {
+    requires x86
+    export *
+
+    header "immintrin.h"
+    header "x86intrin.h"
+
+    explicit module mm_malloc {
+      header "mm_malloc.h"
+      export * // note: for <stdlib.h> dependency
+    }
+
+    explicit module cpuid {
+      requires x86
+      header "cpuid.h"
+    }
+
+    explicit module mmx {
+      requires mmx
+      header "mmintrin.h"
+    }
+
+    explicit module f16c {
+      requires f16c
+      header "f16cintrin.h"
+    }
+
+    explicit module sse {
+      requires sse
+      export mmx
+      export * // note: for hackish <emmintrin.h> dependency
+      header "xmmintrin.h"
+    }
+
+    explicit module sse2 {
+      requires sse2
+      export sse
+      header "emmintrin.h"
+    }
+
+    explicit module sse3 {
+      requires sse3
+      export sse2
+      header "pmmintrin.h"
+    }
+
+    explicit module ssse3 {
+      requires ssse3
+      export sse3
+      header "tmmintrin.h"
+    }
+
+    explicit module sse4_1 {
+      requires sse41
+      export ssse3
+      header "smmintrin.h"
+    }
+
+    explicit module sse4_2 {
+      requires sse42
+      export sse4_1
+      header "nmmintrin.h"
+    }
+
+    explicit module sse4a {
+      requires sse4a
+      export sse3
+      header "ammintrin.h"
+    }
+
+    explicit module avx {
+      requires avx
+      export sse4_2
+      header "avxintrin.h"
+    }
+
+    explicit module avx2 {
+      requires avx2
+      export avx
+      header "avx2intrin.h"
+    }
+
+    explicit module bmi {
+      requires bmi
+      header "bmiintrin.h"
+    }
+
+    explicit module bmi2 {
+      requires bmi2
+      header "bmi2intrin.h"
+    }
+
+    explicit module fma {
+      requires fma
+      header "fmaintrin.h"
+    }
+
+    explicit module fma4 {
+      requires fma4
+      export sse3
+      header "fma4intrin.h"
+    }
+
+    explicit module lzcnt {
+      requires lzcnt
+      header "lzcntintrin.h"
+    }
+
+    explicit module popcnt {
+      requires popcnt
+      header "popcntintrin.h"
+    }
+
+    explicit module mm3dnow {
+      requires mm3dnow
+      header "mm3dnow.h"
+    }
+
+    explicit module xop {
+      requires xop
+      export fma4
+      header "xopintrin.h"
+    }
+
+    explicit module aes_pclmul {
+      requires aes, pclmul
+      header "wmmintrin.h"
+    }
+
+    explicit module aes {
+      requires aes
+      header "__wmmintrin_aes.h"
+    }
+
+    explicit module pclmul {
+      requires pclmul
+      header "__wmmintrin_pclmul.h"
+    }
+  }
+}
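Note (illustrative, not part of the patch): with this module map installed next to the headers, a clang build using -fmodules maps textual includes of these files onto the _Builtin_intrinsics submodules declared above. A rough sketch; the file name and compiler flags are examples only.

    /* use_crc.c -- <nmmintrin.h> corresponds to _Builtin_intrinsics.intel.sse4_2 */
    #include <nmmintrin.h>

    unsigned crc_byte(unsigned seed, unsigned char b) {
        return _mm_crc32_u8(seed, b);
    }

    /* built with something like: clang -fmodules -msse4.2 -c use_crc.c */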
diff --git a/22.0.1/clang-include/nmmintrin.h b/22.0.1/clang-include/nmmintrin.h
new file mode 100644
index 0000000..f12622d
--- /dev/null
+++ b/22.0.1/clang-include/nmmintrin.h
@@ -0,0 +1,35 @@
+/*===---- nmmintrin.h - SSE4 intrinsics ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef _NMMINTRIN_H
+#define _NMMINTRIN_H
+
+#ifndef __SSE4_2__
+#error "SSE4.2 instruction set not enabled"
+#else
+
+/* To match the expectations of gcc we put the sse4.2 definitions into
+   smmintrin.h, so just include it here.  */
+#include <smmintrin.h>
+#endif /* __SSE4_2__ */
+#endif /* _NMMINTRIN_H */
diff --git a/22.0.1/clang-include/pmmintrin.h b/22.0.1/clang-include/pmmintrin.h
new file mode 100644
index 0000000..6f1fc32
--- /dev/null
+++ b/22.0.1/clang-include/pmmintrin.h
@@ -0,0 +1,117 @@
+/*===---- pmmintrin.h - SSE3 intrinsics ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+ 
+#ifndef __PMMINTRIN_H
+#define __PMMINTRIN_H
+
+#ifndef __SSE3__
+#error "SSE3 instruction set not enabled"
+#else
+
+#include <emmintrin.h>
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_lddqu_si128(__m128i const *__p)
+{
+  return (__m128i)__builtin_ia32_lddqu((char const *)__p);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_addsub_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_addsubps(__a, __b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_hadd_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_haddps(__a, __b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_hsub_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_hsubps(__a, __b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_movehdup_ps(__m128 __a)
+{
+  return __builtin_shufflevector(__a, __a, 1, 1, 3, 3);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_moveldup_ps(__m128 __a)
+{
+  return __builtin_shufflevector(__a, __a, 0, 0, 2, 2);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_addsub_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_addsubpd(__a, __b);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_hadd_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_haddpd(__a, __b);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_hsub_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_hsubpd(__a, __b);
+}
+
+#define _mm_loaddup_pd(dp)   _mm_load1_pd(dp)
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_movedup_pd(__m128d __a)
+{
+  return __builtin_shufflevector(__a, __a, 0, 0);
+}
+
+#define _MM_DENORMALS_ZERO_ON   (0x0040)
+#define _MM_DENORMALS_ZERO_OFF  (0x0000)
+
+#define _MM_DENORMALS_ZERO_MASK (0x0040)
+
+#define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)
+#define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_monitor(void const *__p, unsigned __extensions, unsigned __hints)
+{
+  __builtin_ia32_monitor((void *)__p, __extensions, __hints);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_mwait(unsigned __extensions, unsigned __hints)
+{
+  __builtin_ia32_mwait(__extensions, __hints);
+}
+
+#endif /* __SSE3__ */
+
+#endif /* __PMMINTRIN_H */
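Note (illustrative, not part of the patch): a small sketch of the SSE3 horizontal-add intrinsic declared above, used to sum the four lanes of a vector. The helper name is hypothetical; it assumes a build with -msse3 (_mm_cvtss_f32 comes in via the <emmintrin.h> include chain).

    #include <pmmintrin.h>

    /* Horizontal sum of the four floats in v. */
    static float hsum_ps(__m128 v) {
        v = _mm_hadd_ps(v, v);      /* (a+b, c+d, a+b, c+d) */
        v = _mm_hadd_ps(v, v);      /* (a+b+c+d, ...)       */
        return _mm_cvtss_f32(v);    /* low lane             */
    }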
diff --git a/22.0.1/clang-include/popcntintrin.h b/22.0.1/clang-include/popcntintrin.h
new file mode 100644
index 0000000..d439daa
--- /dev/null
+++ b/22.0.1/clang-include/popcntintrin.h
@@ -0,0 +1,45 @@
+/*===---- popcntintrin.h - POPCNT intrinsics -------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __POPCNT__
+#error "POPCNT instruction set not enabled"
+#endif
+
+#ifndef _POPCNTINTRIN_H
+#define _POPCNTINTRIN_H
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_popcnt_u32(unsigned int __A)
+{
+  return __builtin_popcount(__A);
+}
+
+#ifdef __x86_64__
+static __inline__ long long __attribute__((__always_inline__, __nodebug__))
+_mm_popcnt_u64(unsigned long long __A)
+{
+  return __builtin_popcountll(__A);
+}
+#endif /* __x86_64__ */
+
+#endif /* _POPCNTINTRIN_H */
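Note (illustrative, not part of the patch): the header above errors out unless __POPCNT__ is defined, so callers build with -mpopcnt. A one-line usage sketch; the wrapper name is hypothetical.

    #include <popcntintrin.h>

    unsigned bits_set(unsigned int x) {
        return _mm_popcnt_u32(x);   /* e.g. bits_set(0xF0) == 4 */
    }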
diff --git a/22.0.1/clang-include/prfchwintrin.h b/22.0.1/clang-include/prfchwintrin.h
new file mode 100644
index 0000000..9825bd8
--- /dev/null
+++ b/22.0.1/clang-include/prfchwintrin.h
@@ -0,0 +1,39 @@
+/*===---- prfchwintrin.h - PREFETCHW intrinsic -----------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined(__X86INTRIN_H) && !defined(_MM3DNOW_H_INCLUDED)
+#error "Never use <prfchwintrin.h> directly; include <x86intrin.h> or <mm3dnow.h> instead."
+#endif
+
+#ifndef __PRFCHWINTRIN_H
+#define __PRFCHWINTRIN_H
+
+#if defined(__PRFCHW__) || defined(__3dNOW__)
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_m_prefetchw(void *__P)
+{
+  __builtin_prefetch (__P, 1, 3 /* _MM_HINT_T0 */);
+}
+#endif
+
+#endif /* __PRFCHWINTRIN_H */
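Note (illustrative, not part of the patch): _m_prefetchw above only becomes visible when __PRFCHW__ or __3dNOW__ is defined (e.g. -mprfchw), and it must be reached through <x86intrin.h> or <mm3dnow.h>. A minimal sketch; the function name is hypothetical.

    #include <x86intrin.h>

    /* Hint that *counter is about to be written, then update it. */
    void bump(int *counter) {
        _m_prefetchw(counter);
        *counter += 1;
    }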
diff --git a/22.0.1/clang-include/rdseedintrin.h b/22.0.1/clang-include/rdseedintrin.h
new file mode 100644
index 0000000..0fef1fa
--- /dev/null
+++ b/22.0.1/clang-include/rdseedintrin.h
@@ -0,0 +1,52 @@
+/*===---- rdseedintrin.h - RDSEED intrinsics -------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __X86INTRIN_H
+#error "Never use <rdseedintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __RDSEEDINTRIN_H
+#define __RDSEEDINTRIN_H
+
+#ifdef __RDSEED__
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_rdseed16_step(unsigned short *__p)
+{
+  return __builtin_ia32_rdseed16_step(__p);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_rdseed32_step(unsigned int *__p)
+{
+  return __builtin_ia32_rdseed32_step(__p);
+}
+
+#ifdef __x86_64__
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_rdseed64_step(unsigned long long *__p)
+{
+  return __builtin_ia32_rdseed64_step(__p);
+}
+#endif
+#endif /* __RDSEED__ */
+#endif /* __RDSEEDINTRIN_H */
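Note (illustrative, not part of the patch): the RDSEED step intrinsics above return 0 when hardware entropy is temporarily unavailable, so callers usually retry a bounded number of times. A sketch assuming a build with -mrdseed; the helper name and retry count are examples only.

    #include <x86intrin.h>

    int get_seed32(unsigned int *out) {
        for (int tries = 0; tries < 10; ++tries) {
            if (_rdseed32_step(out))
                return 1;           /* *out now holds a fresh seed        */
        }
        return 0;                   /* entropy source temporarily drained */
    }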
diff --git a/22.0.1/clang-include/rtmintrin.h b/22.0.1/clang-include/rtmintrin.h
new file mode 100644
index 0000000..26149ca
--- /dev/null
+++ b/22.0.1/clang-include/rtmintrin.h
@@ -0,0 +1,54 @@
+/*===---- rtmintrin.h - RTM intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <rtmintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __RTMINTRIN_H
+#define __RTMINTRIN_H
+
+#define _XBEGIN_STARTED   (~0u)
+#define _XABORT_EXPLICIT  (1 << 0)
+#define _XABORT_RETRY     (1 << 1)
+#define _XABORT_CONFLICT  (1 << 2)
+#define _XABORT_CAPACITY  (1 << 3)
+#define _XABORT_DEBUG     (1 << 4)
+#define _XABORT_NESTED    (1 << 5)
+#define _XABORT_CODE(x)   (((x) >> 24) & 0xFF)
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+_xbegin(void)
+{
+  return __builtin_ia32_xbegin();
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_xend(void)
+{
+  __builtin_ia32_xend();
+}
+
+#define _xabort(imm) __builtin_ia32_xabort((imm))
+
+#endif /* __RTMINTRIN_H */
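Note (illustrative, not part of the patch): a sketch of the usual RTM pattern built on the intrinsics above: start a transaction with _xbegin, commit with _xend, and take a fallback path when the returned status is not _XBEGIN_STARTED. Assumes -mrtm; the function is hypothetical and a real fallback would take a lock.

    #include <immintrin.h>

    void increment(long *counter) {
        unsigned int status = _xbegin();
        if (status == _XBEGIN_STARTED) {
            ++*counter;             /* transactional update */
            _xend();
        } else {
            ++*counter;             /* fallback: non-transactional path */
        }
    }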
diff --git a/22.0.1/clang-include/shaintrin.h b/22.0.1/clang-include/shaintrin.h
new file mode 100644
index 0000000..66ed055
--- /dev/null
+++ b/22.0.1/clang-include/shaintrin.h
@@ -0,0 +1,74 @@
+/*===---- shaintrin.h - SHA intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <shaintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __SHAINTRIN_H
+#define __SHAINTRIN_H
+
+#if !defined (__SHA__)
+#  error "SHA instructions not enabled"
+#endif
+
+#define _mm_sha1rnds4_epu32(V1, V2, M) __extension__ ({ \
+  __builtin_ia32_sha1rnds4((V1), (V2), (M)); })
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sha1nexte_epu32(__m128i __X, __m128i __Y)
+{
+  return __builtin_ia32_sha1nexte(__X, __Y);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sha1msg1_epu32(__m128i __X, __m128i __Y)
+{
+  return __builtin_ia32_sha1msg1(__X, __Y);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sha1msg2_epu32(__m128i __X, __m128i __Y)
+{
+  return __builtin_ia32_sha1msg2(__X, __Y);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sha256rnds2_epu32(__m128i __X, __m128i __Y, __m128i __Z)
+{
+  return __builtin_ia32_sha256rnds2(__X, __Y, __Z);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sha256msg1_epu32(__m128i __X, __m128i __Y)
+{
+  return __builtin_ia32_sha256msg1(__X, __Y);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sha256msg2_epu32(__m128i __X, __m128i __Y)
+{
+  return __builtin_ia32_sha256msg2(__X, __Y);
+}
+
+#endif /* __SHAINTRIN_H */
diff --git a/22.0.1/clang-include/smmintrin.h b/22.0.1/clang-include/smmintrin.h
new file mode 100644
index 0000000..6e35734
--- /dev/null
+++ b/22.0.1/clang-include/smmintrin.h
@@ -0,0 +1,482 @@
+/*===---- smmintrin.h - SSE4 intrinsics ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef _SMMINTRIN_H
+#define _SMMINTRIN_H
+
+#ifndef __SSE4_1__
+#error "SSE4.1 instruction set not enabled"
+#else
+
+#include <tmmintrin.h>
+
+/* SSE4 Rounding macros. */
+#define _MM_FROUND_TO_NEAREST_INT    0x00
+#define _MM_FROUND_TO_NEG_INF        0x01
+#define _MM_FROUND_TO_POS_INF        0x02
+#define _MM_FROUND_TO_ZERO           0x03
+#define _MM_FROUND_CUR_DIRECTION     0x04
+
+#define _MM_FROUND_RAISE_EXC         0x00
+#define _MM_FROUND_NO_EXC            0x08
+
+#define _MM_FROUND_NINT      (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT)
+#define _MM_FROUND_FLOOR     (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF)
+#define _MM_FROUND_CEIL      (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF)
+#define _MM_FROUND_TRUNC     (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO)
+#define _MM_FROUND_RINT      (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION)
+#define _MM_FROUND_NEARBYINT (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_ceil_ps(X)       _mm_round_ps((X), _MM_FROUND_CEIL)
+#define _mm_ceil_pd(X)       _mm_round_pd((X), _MM_FROUND_CEIL)
+#define _mm_ceil_ss(X, Y)    _mm_round_ss((X), (Y), _MM_FROUND_CEIL)
+#define _mm_ceil_sd(X, Y)    _mm_round_sd((X), (Y), _MM_FROUND_CEIL)
+
+#define _mm_floor_ps(X)      _mm_round_ps((X), _MM_FROUND_FLOOR)
+#define _mm_floor_pd(X)      _mm_round_pd((X), _MM_FROUND_FLOOR)
+#define _mm_floor_ss(X, Y)   _mm_round_ss((X), (Y), _MM_FROUND_FLOOR)
+#define _mm_floor_sd(X, Y)   _mm_round_sd((X), (Y), _MM_FROUND_FLOOR)
+
+#define _mm_round_ps(X, M) __extension__ ({ \
+  __m128 __X = (X); \
+  (__m128) __builtin_ia32_roundps((__v4sf)__X, (M)); })
+
+#define _mm_round_ss(X, Y, M) __extension__ ({ \
+  __m128 __X = (X); \
+  __m128 __Y = (Y); \
+  (__m128) __builtin_ia32_roundss((__v4sf)__X, (__v4sf)__Y, (M)); })
+
+#define _mm_round_pd(X, M) __extension__ ({ \
+  __m128d __X = (X); \
+  (__m128d) __builtin_ia32_roundpd((__v2df)__X, (M)); })
+
+#define _mm_round_sd(X, Y, M) __extension__ ({ \
+  __m128d __X = (X); \
+  __m128d __Y = (Y); \
+  (__m128d) __builtin_ia32_roundsd((__v2df)__X, (__v2df)__Y, (M)); })
+
+/* SSE4 Packed Blending Intrinsics.  */
+#define _mm_blend_pd(V1, V2, M) __extension__ ({ \
+  __m128d __V1 = (V1); \
+  __m128d __V2 = (V2); \
+  (__m128d)__builtin_shufflevector((__v2df)__V1, (__v2df)__V2, \
+                                   (((M) & 0x01) ? 2 : 0), \
+                                   (((M) & 0x02) ? 3 : 1)); })
+
+#define _mm_blend_ps(V1, V2, M) __extension__ ({ \
+  __m128 __V1 = (V1); \
+  __m128 __V2 = (V2); \
+  (__m128)__builtin_shufflevector((__v4sf)__V1, (__v4sf)__V2, \
+                                  (((M) & 0x01) ? 4 : 0), \
+                                  (((M) & 0x02) ? 5 : 1), \
+                                  (((M) & 0x04) ? 6 : 2), \
+                                  (((M) & 0x08) ? 7 : 3)); })
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_blendv_pd (__m128d __V1, __m128d __V2, __m128d __M)
+{
+  return (__m128d) __builtin_ia32_blendvpd ((__v2df)__V1, (__v2df)__V2,
+                                            (__v2df)__M);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_blendv_ps (__m128 __V1, __m128 __V2, __m128 __M)
+{
+  return (__m128) __builtin_ia32_blendvps ((__v4sf)__V1, (__v4sf)__V2,
+                                           (__v4sf)__M);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M)
+{
+  return (__m128i) __builtin_ia32_pblendvb128 ((__v16qi)__V1, (__v16qi)__V2,
+                                               (__v16qi)__M);
+}
+
+#define _mm_blend_epi16(V1, V2, M) __extension__ ({ \
+  __m128i __V1 = (V1); \
+  __m128i __V2 = (V2); \
+  (__m128i)__builtin_shufflevector((__v8hi)__V1, (__v8hi)__V2, \
+                                   (((M) & 0x01) ?  8 : 0), \
+                                   (((M) & 0x02) ?  9 : 1), \
+                                   (((M) & 0x04) ? 10 : 2), \
+                                   (((M) & 0x08) ? 11 : 3), \
+                                   (((M) & 0x10) ? 12 : 4), \
+                                   (((M) & 0x20) ? 13 : 5), \
+                                   (((M) & 0x40) ? 14 : 6), \
+                                   (((M) & 0x80) ? 15 : 7)); })
+
+/* SSE4 Dword Multiply Instructions.  */
+static __inline__  __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_mullo_epi32 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) ((__v4si)__V1 * (__v4si)__V2);
+}
+
+static __inline__  __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_mul_epi32 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_pmuldq128 ((__v4si)__V1, (__v4si)__V2);
+}
+
+/* SSE4 Floating Point Dot Product Instructions.  */
+#define _mm_dp_ps(X, Y, M) __extension__ ({ \
+  __m128 __X = (X); \
+  __m128 __Y = (Y); \
+  (__m128) __builtin_ia32_dpps((__v4sf)__X, (__v4sf)__Y, (M)); })
+
+#define _mm_dp_pd(X, Y, M) __extension__ ({\
+  __m128d __X = (X); \
+  __m128d __Y = (Y); \
+  (__m128d) __builtin_ia32_dppd((__v2df)__X, (__v2df)__Y, (M)); })
+
+/* SSE4 Streaming Load Hint Instruction.  */
+static __inline__  __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_stream_load_si128 (__m128i *__V)
+{
+  return (__m128i) __builtin_ia32_movntdqa ((__v2di *) __V);
+}
+
+/* SSE4 Packed Integer Min/Max Instructions.  */
+static __inline__  __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_min_epi8 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_pminsb128 ((__v16qi) __V1, (__v16qi) __V2);
+}
+
+static __inline__  __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_max_epi8 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_pmaxsb128 ((__v16qi) __V1, (__v16qi) __V2);
+}
+
+static __inline__  __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_min_epu16 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_pminuw128 ((__v8hi) __V1, (__v8hi) __V2);
+}
+
+static __inline__  __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_max_epu16 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_pmaxuw128 ((__v8hi) __V1, (__v8hi) __V2);
+}
+
+static __inline__  __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_min_epi32 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_pminsd128 ((__v4si) __V1, (__v4si) __V2);
+}
+
+static __inline__  __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_max_epi32 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_pmaxsd128 ((__v4si) __V1, (__v4si) __V2);
+}
+
+static __inline__  __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_min_epu32 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_pminud128((__v4si) __V1, (__v4si) __V2);
+}
+
+static __inline__  __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_max_epu32 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_pmaxud128((__v4si) __V1, (__v4si) __V2);
+}
+
+/* SSE4 Insertion and Extraction from XMM Register Instructions.  */
+#define _mm_insert_ps(X, Y, N) __builtin_ia32_insertps128((X), (Y), (N))
+#define _mm_extract_ps(X, N) (__extension__                      \
+                              ({ union { int __i; float __f; } __t;  \
+                                 __v4sf __a = (__v4sf)(X);       \
+                                 __t.__f = __a[(N) & 3];                 \
+                                 __t.__i;}))
+
+/* Miscellaneous insert and extract macros.  */
+/* Extract a single-precision float from X at index N into D.  */
+#define _MM_EXTRACT_FLOAT(D, X, N) (__extension__ ({ __v4sf __a = (__v4sf)(X); \
+                                                    (D) = __a[N]; }))
+                                                    
+/* Or together 2 sets of indexes (X and Y) with the zeroing bits (Z) to create
+   an index suitable for _mm_insert_ps.  */
+#define _MM_MK_INSERTPS_NDX(X, Y, Z) (((X) << 6) | ((Y) << 4) | (Z))
+                                           
+/* Extract a float from X at index N into the first index of the return.  */
+#define _MM_PICK_OUT_PS(X, N) _mm_insert_ps (_mm_setzero_ps(), (X),   \
+                                             _MM_MK_INSERTPS_NDX((N), 0, 0x0e))
+                                             
+/* Insert int into packed integer array at index.  */
+#define _mm_insert_epi8(X, I, N) (__extension__ ({ __v16qi __a = (__v16qi)(X); \
+                                                   __a[(N) & 15] = (I);             \
+                                                   __a;}))
+#define _mm_insert_epi32(X, I, N) (__extension__ ({ __v4si __a = (__v4si)(X); \
+                                                    __a[(N) & 3] = (I);           \
+                                                    __a;}))
+#ifdef __x86_64__
+#define _mm_insert_epi64(X, I, N) (__extension__ ({ __v2di __a = (__v2di)(X); \
+                                                    __a[(N) & 1] = (I);           \
+                                                    __a;}))
+#endif /* __x86_64__ */
+
+/* Extract int from packed integer array at index.  This returns the element
+ * as a zero extended value, so it is unsigned.
+ */
+#define _mm_extract_epi8(X, N) (__extension__ ({ __v16qi __a = (__v16qi)(X); \
+                                                 (int)(unsigned char) \
+                                                     __a[(N) & 15];}))
+#define _mm_extract_epi32(X, N) (__extension__ ({ __v4si __a = (__v4si)(X); \
+                                                  __a[(N) & 3];}))
+#ifdef __x86_64__
+#define _mm_extract_epi64(X, N) (__extension__ ({ __v2di __a = (__v2di)(X); \
+                                                  __a[(N) & 1];}))
+#endif /* __x86_64 */
+
+/* SSE4 128-bit Packed Integer Comparisons.  */
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_testz_si128(__m128i __M, __m128i __V)
+{
+  return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_testc_si128(__m128i __M, __m128i __V)
+{
+  return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_testnzc_si128(__m128i __M, __m128i __V)
+{
+  return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V);
+}
+
+#define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_cmpeq_epi32((V), (V)))
+#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V))
+#define _mm_test_all_zeros(M, V) _mm_testz_si128 ((M), (V))
+
+/* SSE4 64-bit Packed Integer Comparisons.  */
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cmpeq_epi64(__m128i __V1, __m128i __V2)
+{
+  return (__m128i)((__v2di)__V1 == (__v2di)__V2);
+}
+
+/* SSE4 Packed Integer Sign-Extension.  */
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cvtepi8_epi16(__m128i __V)
+{
+  return (__m128i) __builtin_ia32_pmovsxbw128((__v16qi) __V);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cvtepi8_epi32(__m128i __V)
+{
+  return (__m128i) __builtin_ia32_pmovsxbd128((__v16qi) __V);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cvtepi8_epi64(__m128i __V)
+{
+  return (__m128i) __builtin_ia32_pmovsxbq128((__v16qi) __V);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cvtepi16_epi32(__m128i __V)
+{
+  return (__m128i) __builtin_ia32_pmovsxwd128((__v8hi) __V); 
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cvtepi16_epi64(__m128i __V)
+{
+  return (__m128i) __builtin_ia32_pmovsxwq128((__v8hi)__V);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cvtepi32_epi64(__m128i __V)
+{
+  return (__m128i) __builtin_ia32_pmovsxdq128((__v4si)__V);
+}
+
+/* SSE4 Packed Integer Zero-Extension.  */
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cvtepu8_epi16(__m128i __V)
+{
+  return (__m128i) __builtin_ia32_pmovzxbw128((__v16qi) __V);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cvtepu8_epi32(__m128i __V)
+{
+  return (__m128i) __builtin_ia32_pmovzxbd128((__v16qi)__V);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cvtepu8_epi64(__m128i __V)
+{
+  return (__m128i) __builtin_ia32_pmovzxbq128((__v16qi)__V);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cvtepu16_epi32(__m128i __V)
+{
+  return (__m128i) __builtin_ia32_pmovzxwd128((__v8hi)__V);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cvtepu16_epi64(__m128i __V)
+{
+  return (__m128i) __builtin_ia32_pmovzxwq128((__v8hi)__V);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cvtepu32_epi64(__m128i __V)
+{
+  return (__m128i) __builtin_ia32_pmovzxdq128((__v4si)__V);
+}
+
+/* SSE4 Pack with Unsigned Saturation.  */
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_packus_epi32(__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_packusdw128((__v4si)__V1, (__v4si)__V2);
+}
+
+/* SSE4 Multiple Packed Sums of Absolute Difference.  */
+#define _mm_mpsadbw_epu8(X, Y, M) __extension__ ({ \
+  __m128i __X = (X); \
+  __m128i __Y = (Y); \
+  (__m128i) __builtin_ia32_mpsadbw128((__v16qi)__X, (__v16qi)__Y, (M)); })
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_minpos_epu16(__m128i __V)
+{
+  return (__m128i) __builtin_ia32_phminposuw128((__v8hi)__V);
+}
+
+/* These definitions are normally in nmmintrin.h, but gcc puts them in here
+   so we'll do the same.  */
+#ifdef __SSE4_2__
+
+/* These specify the type of data that we're comparing.  */
+#define _SIDD_UBYTE_OPS                 0x00
+#define _SIDD_UWORD_OPS                 0x01
+#define _SIDD_SBYTE_OPS                 0x02
+#define _SIDD_SWORD_OPS                 0x03
+
+/* These specify the type of comparison operation.  */
+#define _SIDD_CMP_EQUAL_ANY             0x00
+#define _SIDD_CMP_RANGES                0x04
+#define _SIDD_CMP_EQUAL_EACH            0x08
+#define _SIDD_CMP_EQUAL_ORDERED         0x0c
+
+/* These macros specify the polarity of the operation.  */
+#define _SIDD_POSITIVE_POLARITY         0x00
+#define _SIDD_NEGATIVE_POLARITY         0x10
+#define _SIDD_MASKED_POSITIVE_POLARITY  0x20
+#define _SIDD_MASKED_NEGATIVE_POLARITY  0x30
+
+/* These macros are used in _mm_cmpXstri() to specify the return.  */
+#define _SIDD_LEAST_SIGNIFICANT         0x00
+#define _SIDD_MOST_SIGNIFICANT          0x40
+
+/* These macros are used in _mm_cmpXstrm() to specify the returned mask.  */
+#define _SIDD_BIT_MASK                  0x00
+#define _SIDD_UNIT_MASK                 0x40
+
+/* SSE4.2 Packed Comparison Intrinsics.  */
+#define _mm_cmpistrm(A, B, M) __builtin_ia32_pcmpistrm128((A), (B), (M))
+#define _mm_cmpistri(A, B, M) __builtin_ia32_pcmpistri128((A), (B), (M))
+
+#define _mm_cmpestrm(A, LA, B, LB, M) \
+     __builtin_ia32_pcmpestrm128((A), (LA), (B), (LB), (M))
+#define _mm_cmpestri(A, LA, B, LB, M) \
+     __builtin_ia32_pcmpestri128((A), (LA), (B), (LB), (M))
+     
+/* SSE4.2 Packed Comparison Intrinsics and EFlag Reading.  */
+#define _mm_cmpistra(A, B, M) \
+     __builtin_ia32_pcmpistria128((A), (B), (M))
+#define _mm_cmpistrc(A, B, M) \
+     __builtin_ia32_pcmpistric128((A), (B), (M))
+#define _mm_cmpistro(A, B, M) \
+     __builtin_ia32_pcmpistrio128((A), (B), (M))
+#define _mm_cmpistrs(A, B, M) \
+     __builtin_ia32_pcmpistris128((A), (B), (M))
+#define _mm_cmpistrz(A, B, M) \
+     __builtin_ia32_pcmpistriz128((A), (B), (M))
+
+#define _mm_cmpestra(A, LA, B, LB, M) \
+     __builtin_ia32_pcmpestria128((A), (LA), (B), (LB), (M))
+#define _mm_cmpestrc(A, LA, B, LB, M) \
+     __builtin_ia32_pcmpestric128((A), (LA), (B), (LB), (M))
+#define _mm_cmpestro(A, LA, B, LB, M) \
+     __builtin_ia32_pcmpestrio128((A), (LA), (B), (LB), (M))
+#define _mm_cmpestrs(A, LA, B, LB, M) \
+     __builtin_ia32_pcmpestris128((A), (LA), (B), (LB), (M))
+#define _mm_cmpestrz(A, LA, B, LB, M) \
+     __builtin_ia32_pcmpestriz128((A), (LA), (B), (LB), (M))
+
+/* SSE4.2 Compare Packed Data -- Greater Than.  */
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cmpgt_epi64(__m128i __V1, __m128i __V2)
+{
+  return (__m128i)((__v2di)__V1 > (__v2di)__V2);
+}
+
+/* SSE4.2 Accumulate CRC32.  */
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+_mm_crc32_u8(unsigned int __C, unsigned char __D)
+{
+  return __builtin_ia32_crc32qi(__C, __D);
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+_mm_crc32_u16(unsigned int __C, unsigned short __D)
+{
+  return __builtin_ia32_crc32hi(__C, __D);
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+_mm_crc32_u32(unsigned int __C, unsigned int __D)
+{
+  return __builtin_ia32_crc32si(__C, __D);
+}
+
+#ifdef __x86_64__
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+_mm_crc32_u64(unsigned long long __C, unsigned long long __D)
+{
+  return __builtin_ia32_crc32di(__C, __D);
+}
+#endif /* __x86_64__ */
+
+#ifdef __POPCNT__
+#include <popcntintrin.h>
+#endif
+
+#endif /* __SSE4_2__ */
+#endif /* __SSE4_1__ */
+
+#endif /* _SMMINTRIN_H */
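Note (illustrative, not part of the patch): a small sketch using the SSE4.1 packed min/max intrinsics defined above to clamp each 32-bit lane to a range. The helper name is hypothetical; it assumes a build with -msse4.1.

    #include <smmintrin.h>

    /* Clamp every 32-bit lane of v into [lo, hi]. */
    static __m128i clamp_epi32(__m128i v, __m128i lo, __m128i hi) {
        return _mm_min_epi32(_mm_max_epi32(v, lo), hi);
    }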
diff --git a/22.0.1/clang-include/stdalign.h b/22.0.1/clang-include/stdalign.h
new file mode 100644
index 0000000..3738d12
--- /dev/null
+++ b/22.0.1/clang-include/stdalign.h
@@ -0,0 +1,35 @@
+/*===---- stdalign.h - Standard header for alignment ------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __STDALIGN_H
+#define __STDALIGN_H
+
+#ifndef __cplusplus
+#define alignas _Alignas
+#define alignof _Alignof
+#endif
+
+#define __alignas_is_defined 1
+#define __alignof_is_defined 1
+
+#endif /* __STDALIGN_H */
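Note (illustrative, not part of the patch): in C11 mode this header spells the _Alignas/_Alignof keywords as alignas/alignof. A minimal sketch, assuming -std=c11; the buffer and its size are examples only.

    #include <stdalign.h>
    #include <stdio.h>

    int main(void) {
        alignas(16) unsigned char buffer[64];   /* 16-byte aligned storage */
        printf("alignof(double) = %zu\n", (size_t)alignof(double));
        printf("buffer at %p\n", (void *)buffer);
        return 0;
    }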
diff --git a/22.0.1/clang-include/stdarg.h b/22.0.1/clang-include/stdarg.h
new file mode 100644
index 0000000..a57e183
--- /dev/null
+++ b/22.0.1/clang-include/stdarg.h
@@ -0,0 +1,52 @@
+/*===---- stdarg.h - Variable argument handling ----------------------------===
+ *
+ * Copyright (c) 2008 Eli Friedman
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __STDARG_H
+#define __STDARG_H
+
+#ifndef _VA_LIST
+typedef __builtin_va_list va_list;
+#define _VA_LIST
+#endif
+#define va_start(ap, param) __builtin_va_start(ap, param)
+#define va_end(ap)          __builtin_va_end(ap)
+#define va_arg(ap, type)    __builtin_va_arg(ap, type)
+
+/* GCC always defines __va_copy, but does not define va_copy unless in c99 mode
+ * or -ansi is not specified, since it was not part of C90.
+ */
+#define __va_copy(d,s) __builtin_va_copy(d,s)
+
+#if __STDC_VERSION__ >= 199901L || __cplusplus >= 201103L || !defined(__STRICT_ANSI__)
+#define va_copy(dest, src)  __builtin_va_copy(dest, src)
+#endif
+
+/* Hack required to make standard headers work, at least on Ubuntu */
+#ifndef __GNUC_VA_LIST
+#define __GNUC_VA_LIST 1
+#endif
+typedef __builtin_va_list __gnuc_va_list;
+
+#endif /* __STDARG_H */
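Note (illustrative, not part of the patch): the va_* macros above expand to the clang builtins; a standard variadic-function sketch follows. The function name is hypothetical.

    #include <stdarg.h>

    /* Sum `count` ints passed as variadic arguments. */
    int sum_ints(int count, ...) {
        va_list ap;
        int total = 0;
        va_start(ap, count);
        for (int i = 0; i < count; ++i)
            total += va_arg(ap, int);
        va_end(ap);
        return total;               /* sum_ints(3, 1, 2, 3) == 6 */
    }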
diff --git a/22.0.1/clang-include/stdbool.h b/22.0.1/clang-include/stdbool.h
new file mode 100644
index 0000000..0467893
--- /dev/null
+++ b/22.0.1/clang-include/stdbool.h
@@ -0,0 +1,44 @@
+/*===---- stdbool.h - Standard header for booleans -------------------------===
+ *
+ * Copyright (c) 2008 Eli Friedman
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __STDBOOL_H
+#define __STDBOOL_H
+
+/* Don't define bool, true, and false in C++, except as a GNU extension. */
+#ifndef __cplusplus
+#define bool _Bool
+#define true 1
+#define false 0
+#elif defined(__GNUC__) && !defined(__STRICT_ANSI__)
+/* Define _Bool, bool, false, true as a GNU extension. */
+#define _Bool bool
+#define bool  bool
+#define false false
+#define true  true
+#endif
+
+#define __bool_true_false_are_defined 1
+
+#endif /* __STDBOOL_H */
diff --git a/22.0.1/clang-include/stddef.h b/22.0.1/clang-include/stddef.h
new file mode 100644
index 0000000..2dfe0a2
--- /dev/null
+++ b/22.0.1/clang-include/stddef.h
@@ -0,0 +1,146 @@
+/*===---- stddef.h - Basic type definitions --------------------------------===
+ *
+ * Copyright (c) 2008 Eli Friedman
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined(__STDDEF_H) || defined(__need_ptrdiff_t) ||                       \
+    defined(__need_size_t) || defined(__need_wchar_t) ||                       \
+    defined(__need_NULL) || defined(__need_wint_t)
+
+#if !defined(__need_ptrdiff_t) && !defined(__need_size_t) &&                   \
+    !defined(__need_wchar_t) && !defined(__need_NULL) &&                       \
+    !defined(__need_wint_t)
+#define __STDDEF_H
+#define __need_ptrdiff_t
+#define __need_size_t
+#define __need_wchar_t
+#define __need_NULL
+/* __need_wint_t is intentionally not defined here. */
+#endif
+
+#if defined(__need_ptrdiff_t)
+#if !defined(_PTRDIFF_T) || __has_feature(modules)
+/* Always define ptrdiff_t when modules are available. */
+#if !__has_feature(modules)
+#define _PTRDIFF_T
+#endif
+typedef __PTRDIFF_TYPE__ ptrdiff_t;
+#endif
+#undef __need_ptrdiff_t
+#endif /* defined(__need_ptrdiff_t) */
+
+#if defined(__need_size_t)
+#if !defined(_SIZE_T) || __has_feature(modules)
+/* Always define size_t when modules are available. */
+#if !__has_feature(modules)
+#define _SIZE_T
+#endif
+typedef __SIZE_TYPE__ size_t;
+#endif
+#undef __need_size_t
+#endif /*defined(__need_size_t) */
+
+#if defined(__STDDEF_H)
+/* ISO9899:2011 7.20 (C11 Annex K): Define rsize_t if __STDC_WANT_LIB_EXT1__ is
+ * enabled. */
+#if (defined(__STDC_WANT_LIB_EXT1__) && __STDC_WANT_LIB_EXT1__ >= 1 && \
+     !defined(_RSIZE_T)) || __has_feature(modules)
+/* Always define rsize_t when modules are available. */
+#if !__has_feature(modules)
+#define _RSIZE_T
+#endif
+typedef __SIZE_TYPE__ rsize_t;
+#endif
+#endif /* defined(__STDDEF_H) */
+
+#if defined(__need_wchar_t)
+#ifndef __cplusplus
+/* Always define wchar_t when modules are available. */
+#if !defined(_WCHAR_T) || __has_feature(modules)
+#if !__has_feature(modules)
+#define _WCHAR_T
+#if defined(_MSC_EXTENSIONS)
+#define _WCHAR_T_DEFINED
+#endif
+#endif
+typedef __WCHAR_TYPE__ wchar_t;
+#endif
+#endif
+#undef __need_wchar_t
+#endif /* defined(__need_wchar_t) */
+
+#if defined(__need_NULL)
+#undef NULL
+#ifdef __cplusplus
+#  if !defined(__MINGW32__) && !defined(_MSC_VER)
+#    define NULL __null
+#  else
+#    define NULL 0
+#  endif
+#else
+#  define NULL ((void*)0)
+#endif
+#ifdef __cplusplus
+#if defined(_MSC_EXTENSIONS) && defined(_NATIVE_NULLPTR_SUPPORTED)
+namespace std { typedef decltype(nullptr) nullptr_t; }
+using ::std::nullptr_t;
+#endif
+#endif
+#undef __need_NULL
+#endif /* defined(__need_NULL) */
+
+#if defined(__STDDEF_H)
+
+#if __STDC_VERSION__ >= 201112L || __cplusplus >= 201103L
+#if !defined(__CLANG_MAX_ALIGN_T_DEFINED) || __has_feature(modules)
+#ifndef _MSC_VER
+typedef struct {
+  long long __clang_max_align_nonce1
+      __attribute__((__aligned__(__alignof__(long long))));
+  long double __clang_max_align_nonce2
+      __attribute__((__aligned__(__alignof__(long double))));
+} max_align_t;
+#else
+typedef double max_align_t;
+#endif
+#define __CLANG_MAX_ALIGN_T_DEFINED
+#endif
+#endif
+
+#define offsetof(t, d) __builtin_offsetof(t, d)
+#endif  /* __STDDEF_H */
+
+/* Some C libraries expect to see a wint_t here. Others (notably MinGW) will use
+__WINT_TYPE__ directly; accommodate both by requiring __need_wint_t */
+#if defined(__need_wint_t)
+/* Always define wint_t when modules are available. */
+#if !defined(_WINT_T) || __has_feature(modules)
+#if !__has_feature(modules)
+#define _WINT_T
+#endif
+typedef __WINT_TYPE__ wint_t;
+#endif
+#undef __need_wint_t
+#endif /* __need_wint_t */
+
+#endif
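Note (illustrative, not part of the patch): offsetof above expands to __builtin_offsetof and yields a size_t constant. A minimal sketch; the struct and constant names are examples only.

    #include <stddef.h>

    struct packet {
        unsigned char  kind;
        unsigned short length;
        unsigned int   payload;
    };

    /* Byte offset of the payload field, usable as a constant expression. */
    static const size_t kPayloadOffset = offsetof(struct packet, payload);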
diff --git a/22.0.1/clang-include/stdint.h b/22.0.1/clang-include/stdint.h
new file mode 100644
index 0000000..2b1bc09
--- /dev/null
+++ b/22.0.1/clang-include/stdint.h
@@ -0,0 +1,707 @@
+/*===---- stdint.h - Standard header for sized integer types --------------===*\
+ *
+ * Copyright (c) 2009 Chris Lattner
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+\*===----------------------------------------------------------------------===*/
+
+#ifndef __CLANG_STDINT_H
+#define __CLANG_STDINT_H
+
+/* If we're hosted, fall back to the system's stdint.h, which might have
+ * additional definitions.
+ */
+#if __STDC_HOSTED__ && __has_include_next(<stdint.h>)
+
+// C99 7.18.3 Limits of other integer types
+//
+//  Footnote 219, 220: C++ implementations should define these macros only when
+//  __STDC_LIMIT_MACROS is defined before <stdint.h> is included.
+//
+//  Footnote 222: C++ implementations should define these macros only when
+//  __STDC_CONSTANT_MACROS is defined before <stdint.h> is included.
+//
+// C++11 [cstdint.syn]p2:
+//
+//  The macros defined by <cstdint> are provided unconditionally. In particular,
+//  the symbols __STDC_LIMIT_MACROS and __STDC_CONSTANT_MACROS (mentioned in
+//  footnotes 219, 220, and 222 in the C standard) play no role in C++.
+//
+// C11 removed the problematic footnotes.
+//
+// Work around this inconsistency by always defining those macros in C++ mode,
+// so that a C library implementation which follows the C99 standard can be
+// used in C++.
+# ifdef __cplusplus
+#  if !defined(__STDC_LIMIT_MACROS)
+#   define __STDC_LIMIT_MACROS
+#   define __STDC_LIMIT_MACROS_DEFINED_BY_CLANG
+#  endif
+#  if !defined(__STDC_CONSTANT_MACROS)
+#   define __STDC_CONSTANT_MACROS
+#   define __STDC_CONSTANT_MACROS_DEFINED_BY_CLANG
+#  endif
+# endif
+
+# include_next <stdint.h>
+
+# ifdef __STDC_LIMIT_MACROS_DEFINED_BY_CLANG
+#  undef __STDC_LIMIT_MACROS
+#  undef __STDC_LIMIT_MACROS_DEFINED_BY_CLANG
+# endif
+# ifdef __STDC_CONSTANT_MACROS_DEFINED_BY_CLANG
+#  undef __STDC_CONSTANT_MACROS
+#  undef __STDC_CONSTANT_MACROS_DEFINED_BY_CLANG
+# endif
+
+#else
+
+/* C99 7.18.1.1 Exact-width integer types.
+ * C99 7.18.1.2 Minimum-width integer types.
+ * C99 7.18.1.3 Fastest minimum-width integer types.
+ *
+ * The standard requires that exact-width types be defined for 8-, 16-, 32-,
+ * and 64-bit types if they are implemented. Other exact-width types are
+ * optional. This implementation defines an exact-width type for every integer
+ * width that is represented in the standard integer types.
+ *
+ * The standard also requires minimum-width types be defined for 8-, 16-, 32-,
+ * and 64-bit widths regardless of whether there are corresponding exact-width
+ * types. 
+ *
+ * To accommodate targets that are missing types that are exactly 8, 16, 32, or
+ * 64 bits wide, this implementation takes an approach of cascading
+ * redefinitions, redefining __int_leastN_t to successively smaller exact-width
+ * types. It is therefore important that the types are defined in order of
+ * descending widths.
+ *
+ * We currently assume that the minimum-width types and the fastest
+ * minimum-width types are the same. This is allowed by the standard, but is
+ * suboptimal.
+ *
+ * In violation of the standard, some targets do not implement a type that is
+ * wide enough to represent all of the required widths (8-, 16-, 32-, 64-bit).  
+ * To accommodate these targets, a required minimum-width type is only
+ * defined if there exists an exact-width type of equal or greater width.
+ */
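+
+/* Editorial illustration (a hedged sketch, not part of the upstream header):
+ * on a hypothetical target that provides __INT32_TYPE__ but no 8- or 16-bit
+ * exact-width type, the cascade below leaves __int_least16_t and
+ * __int_least8_t defined as int32_t, so the required minimum-width types
+ * still exist:
+ *
+ *   int_least16_t a;   // at least 16 bits; 32 bits wide on such a target
+ *   uint_fast8_t  b;   // same underlying 32-bit type
+ */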
+
+#ifdef __INT64_TYPE__
+# ifndef __int8_t_defined /* glibc sys/types.h also defines int64_t*/
+typedef signed __INT64_TYPE__ int64_t;
+# endif /* __int8_t_defined */
+typedef unsigned __INT64_TYPE__ uint64_t;
+# define __int_least64_t int64_t
+# define __uint_least64_t uint64_t
+# define __int_least32_t int64_t
+# define __uint_least32_t uint64_t
+# define __int_least16_t int64_t
+# define __uint_least16_t uint64_t
+# define __int_least8_t int64_t
+# define __uint_least8_t uint64_t
+#endif /* __INT64_TYPE__ */
+
+#ifdef __int_least64_t
+typedef __int_least64_t int_least64_t;
+typedef __uint_least64_t uint_least64_t;
+typedef __int_least64_t int_fast64_t;
+typedef __uint_least64_t uint_fast64_t;
+#endif /* __int_least64_t */
+
+#ifdef __INT56_TYPE__
+typedef signed __INT56_TYPE__ int56_t;
+typedef unsigned __INT56_TYPE__ uint56_t;
+typedef int56_t int_least56_t;
+typedef uint56_t uint_least56_t;
+typedef int56_t int_fast56_t;
+typedef uint56_t uint_fast56_t;
+# define __int_least32_t int56_t
+# define __uint_least32_t uint56_t
+# define __int_least16_t int56_t
+# define __uint_least16_t uint56_t
+# define __int_least8_t int56_t
+# define __uint_least8_t uint56_t
+#endif /* __INT56_TYPE__ */
+
+
+#ifdef __INT48_TYPE__
+typedef signed __INT48_TYPE__ int48_t;
+typedef unsigned __INT48_TYPE__ uint48_t;
+typedef int48_t int_least48_t;
+typedef uint48_t uint_least48_t;
+typedef int48_t int_fast48_t;
+typedef uint48_t uint_fast48_t;
+# define __int_least32_t int48_t
+# define __uint_least32_t uint48_t
+# define __int_least16_t int48_t
+# define __uint_least16_t uint48_t
+# define __int_least8_t int48_t
+# define __uint_least8_t uint48_t
+#endif /* __INT48_TYPE__ */
+
+
+#ifdef __INT40_TYPE__
+typedef signed __INT40_TYPE__ int40_t;
+typedef unsigned __INT40_TYPE__ uint40_t;
+typedef int40_t int_least40_t;
+typedef uint40_t uint_least40_t;
+typedef int40_t int_fast40_t;
+typedef uint40_t uint_fast40_t;
+# define __int_least32_t int40_t
+# define __uint_least32_t uint40_t
+# define __int_least16_t int40_t
+# define __uint_least16_t uint40_t
+# define __int_least8_t int40_t
+# define __uint_least8_t uint40_t
+#endif /* __INT40_TYPE__ */
+
+
+#ifdef __INT32_TYPE__
+
+# ifndef __int8_t_defined /* glibc sys/types.h also defines int32_t*/
+typedef signed __INT32_TYPE__ int32_t;
+# endif /* __int8_t_defined */
+
+# ifndef __uint32_t_defined  /* more glibc compatibility */
+# define __uint32_t_defined
+typedef unsigned __INT32_TYPE__ uint32_t;
+# endif /* __uint32_t_defined */
+
+# define __int_least32_t int32_t
+# define __uint_least32_t uint32_t
+# define __int_least16_t int32_t
+# define __uint_least16_t uint32_t
+# define __int_least8_t int32_t
+# define __uint_least8_t uint32_t
+#endif /* __INT32_TYPE__ */
+
+#ifdef __int_least32_t
+typedef __int_least32_t int_least32_t;
+typedef __uint_least32_t uint_least32_t;
+typedef __int_least32_t int_fast32_t;
+typedef __uint_least32_t uint_fast32_t;
+#endif /* __int_least32_t */
+
+#ifdef __INT24_TYPE__
+typedef signed __INT24_TYPE__ int24_t;
+typedef unsigned __INT24_TYPE__ uint24_t;
+typedef int24_t int_least24_t;
+typedef uint24_t uint_least24_t;
+typedef int24_t int_fast24_t;
+typedef uint24_t uint_fast24_t;
+# define __int_least16_t int24_t
+# define __uint_least16_t uint24_t
+# define __int_least8_t int24_t
+# define __uint_least8_t uint24_t
+#endif /* __INT24_TYPE__ */
+
+#ifdef __INT16_TYPE__
+#ifndef __int8_t_defined /* glibc sys/types.h also defines int16_t*/
+typedef signed __INT16_TYPE__ int16_t;
+#endif /* __int8_t_defined */
+typedef unsigned __INT16_TYPE__ uint16_t;
+# define __int_least16_t int16_t
+# define __uint_least16_t uint16_t
+# define __int_least8_t int16_t
+# define __uint_least8_t uint16_t
+#endif /* __INT16_TYPE__ */
+
+#ifdef __int_least16_t
+typedef __int_least16_t int_least16_t;
+typedef __uint_least16_t uint_least16_t;
+typedef __int_least16_t int_fast16_t;
+typedef __uint_least16_t uint_fast16_t;
+#endif /* __int_least16_t */
+
+
+#ifdef __INT8_TYPE__
+#ifndef __int8_t_defined  /* glibc sys/types.h also defines int8_t*/
+typedef signed __INT8_TYPE__ int8_t;
+#endif /* __int8_t_defined */
+typedef unsigned __INT8_TYPE__ uint8_t;
+# define __int_least8_t int8_t
+# define __uint_least8_t uint8_t
+#endif /* __INT8_TYPE__ */
+
+#ifdef __int_least8_t
+typedef __int_least8_t int_least8_t;
+typedef __uint_least8_t uint_least8_t;
+typedef __int_least8_t int_fast8_t;
+typedef __uint_least8_t uint_fast8_t;
+#endif /* __int_least8_t */
+
+/* prevent glibc sys/types.h from defining conflicting types */
+#ifndef __int8_t_defined  
+# define __int8_t_defined
+#endif /* __int8_t_defined */
+
+/* C99 7.18.1.4 Integer types capable of holding object pointers.
+ */
+#define __stdint_join3(a,b,c) a ## b ## c
+
+#define  __intn_t(n) __stdint_join3( int, n, _t)
+#define __uintn_t(n) __stdint_join3(uint, n, _t)
+
+#ifndef _INTPTR_T
+#ifndef __intptr_t_defined
+typedef  __intn_t(__INTPTR_WIDTH__)  intptr_t;
+#define __intptr_t_defined
+#define _INTPTR_T
+#endif
+#endif
+
+#ifndef _UINTPTR_T
+typedef __uintn_t(__INTPTR_WIDTH__) uintptr_t;
+#define _UINTPTR_T
+#endif
+
+/* C99 7.18.1.5 Greatest-width integer types.
+ */
+typedef __INTMAX_TYPE__  intmax_t;
+typedef __UINTMAX_TYPE__ uintmax_t;
+
+/* C99 7.18.4 Macros for minimum-width integer constants.
+ *
+ * The standard requires that integer constant macros be defined for all the
+ * minimum-width types defined above. As 8-, 16-, 32-, and 64-bit minimum-width
+ * types are required, the corresponding integer constant macros are defined 
+ * here. This implementation also defines minimum-width types for every other
+ * integer width that the target implements, so corresponding macros are 
+ * defined below, too.
+ *
+ * These macros are defined using the same successive-shrinking approach as
+ * the type definitions above. It is likewise important that macros are defined
+ * in order of descending width.
+ *
+ * Note that C++ should not check __STDC_CONSTANT_MACROS here, contrary to the
+ * claims of the C standard (see C++ 18.3.1p2, [cstdint.syn]).
+ */
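+
+/* Editorial illustration (a hedged sketch, not part of the upstream header):
+ * the helpers below paste a value together with a target-provided suffix,
+ * e.g. with a hypothetical suffix LL:
+ *
+ *   __int_c(100, LL)    // -> 100LL
+ *   __uint_c(100, LL)   // -> 100ULL
+ */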
+
+#define __int_c_join(a, b) a ## b
+#define __int_c(v, suffix) __int_c_join(v, suffix)
+#define __uint_c(v, suffix) __int_c_join(v##U, suffix)
+
+
+#ifdef __INT64_TYPE__
+# ifdef __INT64_C_SUFFIX__
+#  define __int64_c_suffix __INT64_C_SUFFIX__
+#  define __int32_c_suffix __INT64_C_SUFFIX__
+#  define __int16_c_suffix __INT64_C_SUFFIX__
+#  define  __int8_c_suffix __INT64_C_SUFFIX__
+# else
+#  undef __int64_c_suffix
+#  undef __int32_c_suffix
+#  undef __int16_c_suffix
+#  undef  __int8_c_suffix
+# endif /* __INT64_C_SUFFIX__ */
+#endif /* __INT64_TYPE__ */
+
+#ifdef __int_least64_t
+# ifdef __int64_c_suffix
+#  define INT64_C(v) __int_c(v, __int64_c_suffix)
+#  define UINT64_C(v) __uint_c(v, __int64_c_suffix)
+# else
+#  define INT64_C(v) v
+#  define UINT64_C(v) v ## U
+# endif /* __int64_c_suffix */
+#endif /* __int_least64_t */
+
+
+#ifdef __INT56_TYPE__
+# ifdef __INT56_C_SUFFIX__
+#  define INT56_C(v) __int_c(v, __INT56_C_SUFFIX__)
+#  define UINT56_C(v) __uint_c(v, __INT56_C_SUFFIX__)
+#  define __int32_c_suffix __INT56_C_SUFFIX__
+#  define __int16_c_suffix __INT56_C_SUFFIX__
+#  define __int8_c_suffix  __INT56_C_SUFFIX__
+# else
+#  define INT56_C(v) v
+#  define UINT56_C(v) v ## U
+#  undef __int32_c_suffix
+#  undef __int16_c_suffix
+#  undef  __int8_c_suffix
+# endif /* __INT56_C_SUFFIX__ */
+#endif /* __INT56_TYPE__ */
+
+
+#ifdef __INT48_TYPE__
+# ifdef __INT48_C_SUFFIX__
+#  define INT48_C(v) __int_c(v, __INT48_C_SUFFIX__)
+#  define UINT48_C(v) __uint_c(v, __INT48_C_SUFFIX__)
+#  define __int32_c_suffix __INT48_C_SUFFIX__
+#  define __int16_c_suffix __INT48_C_SUFFIX__
+#  define __int8_c_suffix  __INT48_C_SUFFIX__
+# else
+#  define INT48_C(v) v
+#  define UINT48_C(v) v ## U
+#  undef __int32_c_suffix
+#  undef __int16_c_suffix
+#  undef  __int8_c_suffix
+# endif /* __INT48_C_SUFFIX__ */
+#endif /* __INT48_TYPE__ */
+
+
+#ifdef __INT40_TYPE__
+# ifdef __INT40_C_SUFFIX__
+#  define INT40_C(v) __int_c(v, __INT40_C_SUFFIX__)
+#  define UINT40_C(v) __uint_c(v, __INT40_C_SUFFIX__)
+#  define __int32_c_suffix __INT40_C_SUFFIX__
+#  define __int16_c_suffix __INT40_C_SUFFIX__
+#  define __int8_c_suffix  __INT40_C_SUFFIX__
+# else
+#  define INT40_C(v) v
+#  define UINT40_C(v) v ## U
+#  undef __int32_c_suffix
+#  undef __int16_c_suffix
+#  undef  __int8_c_suffix
+# endif /* __INT40_C_SUFFIX__ */
+#endif /* __INT40_TYPE__ */
+
+
+#ifdef __INT32_TYPE__
+# ifdef __INT32_C_SUFFIX__
+#  define __int32_c_suffix __INT32_C_SUFFIX__
+#  define __int16_c_suffix __INT32_C_SUFFIX__
+#  define __int8_c_suffix  __INT32_C_SUFFIX__
+#else
+#  undef __int32_c_suffix
+#  undef __int16_c_suffix
+#  undef  __int8_c_suffix
+# endif /* __INT32_C_SUFFIX__ */
+#endif /* __INT32_TYPE__ */
+
+#ifdef __int_least32_t
+# ifdef __int32_c_suffix
+#  define INT32_C(v) __int_c(v, __int32_c_suffix)
+#  define UINT32_C(v) __uint_c(v, __int32_c_suffix)
+# else
+#  define INT32_C(v) v
+#  define UINT32_C(v) v ## U
+# endif /* __int32_c_suffix */
+#endif /* __int_least32_t */
+
+
+#ifdef __INT24_TYPE__
+# ifdef __INT24_C_SUFFIX__
+#  define INT24_C(v) __int_c(v, __INT24_C_SUFFIX__)
+#  define UINT24_C(v) __uint_c(v, __INT24_C_SUFFIX__)
+#  define __int16_c_suffix __INT24_C_SUFFIX__
+#  define __int8_c_suffix  __INT24_C_SUFFIX__
+# else
+#  define INT24_C(v) v
+#  define UINT24_C(v) v ## U
+#  undef __int16_c_suffix
+#  undef  __int8_c_suffix
+# endif /* __INT24_C_SUFFIX__ */
+#endif /* __INT24_TYPE__ */
+
+
+#ifdef __INT16_TYPE__
+# ifdef __INT16_C_SUFFIX__
+#  define __int16_c_suffix __INT16_C_SUFFIX__
+#  define __int8_c_suffix  __INT16_C_SUFFIX__
+#else
+#  undef __int16_c_suffix
+#  undef  __int8_c_suffix
+# endif /* __INT16_C_SUFFIX__ */
+#endif /* __INT16_TYPE__ */
+
+#ifdef __int_least16_t
+# ifdef __int16_c_suffix
+#  define INT16_C(v) __int_c(v, __int16_c_suffix)
+#  define UINT16_C(v) __uint_c(v, __int16_c_suffix)
+# else
+#  define INT16_C(v) v
+#  define UINT16_C(v) v ## U
+# endif /* __int16_c_suffix */
+#endif /* __int_least16_t */
+
+
+#ifdef __INT8_TYPE__
+# ifdef __INT8_C_SUFFIX__
+#  define __int8_c_suffix __INT8_C_SUFFIX__
+#else
+#  undef  __int8_c_suffix
+# endif /* __INT8_C_SUFFIX__ */
+#endif /* __INT8_TYPE__ */
+
+#ifdef __int_least8_t
+# ifdef __int8_c_suffix
+#  define INT8_C(v) __int_c(v, __int8_c_suffix)
+#  define UINT8_C(v) __uint_c(v, __int8_c_suffix)
+# else
+#  define INT8_C(v) v
+#  define UINT8_C(v) v ## U
+# endif /* __int8_c_suffix */
+#endif /* __int_least8_t */
+
+
+/* C99 7.18.2.1 Limits of exact-width integer types. 
+ * C99 7.18.2.2 Limits of minimum-width integer types.
+ * C99 7.18.2.3 Limits of fastest minimum-width integer types.
+ *
+ * The presence of limit macros is completely optional in C99.  This
+ * implementation defines limits for all of the types (exact- and
+ * minimum-width) that it defines above, using the limits of the minimum-width
+ * type for any types that do not have exact-width representations.
+ *
+ * As in the type definitions, this section takes an approach of
+ * successive-shrinking to determine which limits to use for the standard (8,
+ * 16, 32, 64) bit widths when they don't have exact representations. It is
+ * therefore important that the definitions be kept in order of descending
+ * widths.
+ *
+ * Note that C++ should not check __STDC_LIMIT_MACROS here, contrary to the
+ * claims of the C standard (see C++ 18.3.1p2, [cstdint.syn]).
+ */
+
+#ifdef __INT64_TYPE__
+# define INT64_MAX           INT64_C( 9223372036854775807)
+# define INT64_MIN         (-INT64_C( 9223372036854775807)-1)
+# define UINT64_MAX         UINT64_C(18446744073709551615)
+# define __INT_LEAST64_MIN   INT64_MIN
+# define __INT_LEAST64_MAX   INT64_MAX
+# define __UINT_LEAST64_MAX UINT64_MAX
+# define __INT_LEAST32_MIN   INT64_MIN
+# define __INT_LEAST32_MAX   INT64_MAX
+# define __UINT_LEAST32_MAX UINT64_MAX
+# define __INT_LEAST16_MIN   INT64_MIN
+# define __INT_LEAST16_MAX   INT64_MAX
+# define __UINT_LEAST16_MAX UINT64_MAX
+# define __INT_LEAST8_MIN    INT64_MIN
+# define __INT_LEAST8_MAX    INT64_MAX
+# define __UINT_LEAST8_MAX  UINT64_MAX
+#endif /* __INT64_TYPE__ */
+
+#ifdef __INT_LEAST64_MIN
+# define INT_LEAST64_MIN   __INT_LEAST64_MIN
+# define INT_LEAST64_MAX   __INT_LEAST64_MAX
+# define UINT_LEAST64_MAX __UINT_LEAST64_MAX
+# define INT_FAST64_MIN    __INT_LEAST64_MIN
+# define INT_FAST64_MAX    __INT_LEAST64_MAX
+# define UINT_FAST64_MAX  __UINT_LEAST64_MAX
+#endif /* __INT_LEAST64_MIN */
+
+
+#ifdef __INT56_TYPE__
+# define INT56_MAX           INT56_C(36028797018963967)
+# define INT56_MIN         (-INT56_C(36028797018963967)-1)
+# define UINT56_MAX         UINT56_C(72057594037927935)
+# define INT_LEAST56_MIN     INT56_MIN
+# define INT_LEAST56_MAX     INT56_MAX
+# define UINT_LEAST56_MAX   UINT56_MAX
+# define INT_FAST56_MIN      INT56_MIN
+# define INT_FAST56_MAX      INT56_MAX
+# define UINT_FAST56_MAX    UINT56_MAX
+# define __INT_LEAST32_MIN   INT56_MIN
+# define __INT_LEAST32_MAX   INT56_MAX
+# define __UINT_LEAST32_MAX UINT56_MAX
+# define __INT_LEAST16_MIN   INT56_MIN
+# define __INT_LEAST16_MAX   INT56_MAX
+# define __UINT_LEAST16_MAX UINT56_MAX
+# define __INT_LEAST8_MIN    INT56_MIN
+# define __INT_LEAST8_MAX    INT56_MAX
+# define __UINT_LEAST8_MAX  UINT56_MAX
+#endif /* __INT56_TYPE__ */
+
+
+#ifdef __INT48_TYPE__
+# define INT48_MAX           INT48_C(140737488355327)
+# define INT48_MIN         (-INT48_C(140737488355327)-1)
+# define UINT48_MAX         UINT48_C(281474976710655)
+# define INT_LEAST48_MIN     INT48_MIN
+# define INT_LEAST48_MAX     INT48_MAX
+# define UINT_LEAST48_MAX   UINT48_MAX
+# define INT_FAST48_MIN      INT48_MIN
+# define INT_FAST48_MAX      INT48_MAX
+# define UINT_FAST48_MAX    UINT48_MAX
+# define __INT_LEAST32_MIN   INT48_MIN
+# define __INT_LEAST32_MAX   INT48_MAX
+# define __UINT_LEAST32_MAX UINT48_MAX
+# define __INT_LEAST16_MIN   INT48_MIN
+# define __INT_LEAST16_MAX   INT48_MAX
+# define __UINT_LEAST16_MAX UINT48_MAX
+# define __INT_LEAST8_MIN    INT48_MIN
+# define __INT_LEAST8_MAX    INT48_MAX
+# define __UINT_LEAST8_MAX  UINT48_MAX
+#endif /* __INT48_TYPE__ */
+
+
+#ifdef __INT40_TYPE__
+# define INT40_MAX           INT40_C(549755813887)
+# define INT40_MIN         (-INT40_C(549755813887)-1)
+# define UINT40_MAX         UINT40_C(1099511627775)
+# define INT_LEAST40_MIN     INT40_MIN
+# define INT_LEAST40_MAX     INT40_MAX
+# define UINT_LEAST40_MAX   UINT40_MAX
+# define INT_FAST40_MIN      INT40_MIN
+# define INT_FAST40_MAX      INT40_MAX
+# define UINT_FAST40_MAX    UINT40_MAX
+# define __INT_LEAST32_MIN   INT40_MIN
+# define __INT_LEAST32_MAX   INT40_MAX
+# define __UINT_LEAST32_MAX UINT40_MAX
+# define __INT_LEAST16_MIN   INT40_MIN
+# define __INT_LEAST16_MAX   INT40_MAX
+# define __UINT_LEAST16_MAX UINT40_MAX
+# define __INT_LEAST8_MIN    INT40_MIN
+# define __INT_LEAST8_MAX    INT40_MAX
+# define __UINT_LEAST8_MAX  UINT40_MAX
+#endif /* __INT40_TYPE__ */
+
+
+#ifdef __INT32_TYPE__
+# define INT32_MAX           INT32_C(2147483647)
+# define INT32_MIN         (-INT32_C(2147483647)-1)
+# define UINT32_MAX         UINT32_C(4294967295)
+# define __INT_LEAST32_MIN   INT32_MIN
+# define __INT_LEAST32_MAX   INT32_MAX
+# define __UINT_LEAST32_MAX UINT32_MAX
+# define __INT_LEAST16_MIN   INT32_MIN
+# define __INT_LEAST16_MAX   INT32_MAX
+# define __UINT_LEAST16_MAX UINT32_MAX
+# define __INT_LEAST8_MIN    INT32_MIN
+# define __INT_LEAST8_MAX    INT32_MAX
+# define __UINT_LEAST8_MAX  UINT32_MAX
+#endif /* __INT32_TYPE__ */
+
+#ifdef __INT_LEAST32_MIN
+# define INT_LEAST32_MIN   __INT_LEAST32_MIN
+# define INT_LEAST32_MAX   __INT_LEAST32_MAX
+# define UINT_LEAST32_MAX __UINT_LEAST32_MAX
+# define INT_FAST32_MIN    __INT_LEAST32_MIN
+# define INT_FAST32_MAX    __INT_LEAST32_MAX
+# define UINT_FAST32_MAX  __UINT_LEAST32_MAX
+#endif /* __INT_LEAST32_MIN */
+
+
+#ifdef __INT24_TYPE__
+# define INT24_MAX           INT24_C(8388607)
+# define INT24_MIN         (-INT24_C(8388607)-1)
+# define UINT24_MAX         UINT24_C(16777215)
+# define INT_LEAST24_MIN     INT24_MIN
+# define INT_LEAST24_MAX     INT24_MAX
+# define UINT_LEAST24_MAX   UINT24_MAX
+# define INT_FAST24_MIN      INT24_MIN
+# define INT_FAST24_MAX      INT24_MAX
+# define UINT_FAST24_MAX    UINT24_MAX
+# define __INT_LEAST16_MIN   INT24_MIN
+# define __INT_LEAST16_MAX   INT24_MAX
+# define __UINT_LEAST16_MAX UINT24_MAX
+# define __INT_LEAST8_MIN    INT24_MIN
+# define __INT_LEAST8_MAX    INT24_MAX
+# define __UINT_LEAST8_MAX  UINT24_MAX
+#endif /* __INT24_TYPE__ */
+
+
+#ifdef __INT16_TYPE__
+#define INT16_MAX            INT16_C(32767)
+#define INT16_MIN          (-INT16_C(32767)-1)
+#define UINT16_MAX          UINT16_C(65535)
+# define __INT_LEAST16_MIN   INT16_MIN
+# define __INT_LEAST16_MAX   INT16_MAX
+# define __UINT_LEAST16_MAX UINT16_MAX
+# define __INT_LEAST8_MIN    INT16_MIN
+# define __INT_LEAST8_MAX    INT16_MAX
+# define __UINT_LEAST8_MAX  UINT16_MAX
+#endif /* __INT16_TYPE__ */
+
+#ifdef __INT_LEAST16_MIN
+# define INT_LEAST16_MIN   __INT_LEAST16_MIN
+# define INT_LEAST16_MAX   __INT_LEAST16_MAX
+# define UINT_LEAST16_MAX __UINT_LEAST16_MAX
+# define INT_FAST16_MIN    __INT_LEAST16_MIN
+# define INT_FAST16_MAX    __INT_LEAST16_MAX
+# define UINT_FAST16_MAX  __UINT_LEAST16_MAX
+#endif /* __INT_LEAST16_MIN */
+
+
+#ifdef __INT8_TYPE__
+# define INT8_MAX            INT8_C(127)
+# define INT8_MIN          (-INT8_C(127)-1)
+# define UINT8_MAX          UINT8_C(255)
+# define __INT_LEAST8_MIN    INT8_MIN
+# define __INT_LEAST8_MAX    INT8_MAX
+# define __UINT_LEAST8_MAX  UINT8_MAX
+#endif /* __INT8_TYPE__ */
+
+#ifdef __INT_LEAST8_MIN
+# define INT_LEAST8_MIN   __INT_LEAST8_MIN
+# define INT_LEAST8_MAX   __INT_LEAST8_MAX
+# define UINT_LEAST8_MAX __UINT_LEAST8_MAX
+# define INT_FAST8_MIN    __INT_LEAST8_MIN
+# define INT_FAST8_MAX    __INT_LEAST8_MAX
+# define UINT_FAST8_MAX  __UINT_LEAST8_MAX
+#endif /* __INT_LEAST8_MIN */
+
+/* Some utility macros */
+#define  __INTN_MIN(n)  __stdint_join3( INT, n, _MIN)
+#define  __INTN_MAX(n)  __stdint_join3( INT, n, _MAX)
+#define __UINTN_MAX(n)  __stdint_join3(UINT, n, _MAX)
+#define  __INTN_C(n, v) __stdint_join3( INT, n, _C(v))
+#define __UINTN_C(n, v) __stdint_join3(UINT, n, _C(v))
+
+/* C99 7.18.2.4 Limits of integer types capable of holding object pointers. */
+/* C99 7.18.3 Limits of other integer types. */
+
+#define  INTPTR_MIN  __INTN_MIN(__INTPTR_WIDTH__)
+#define  INTPTR_MAX  __INTN_MAX(__INTPTR_WIDTH__)
+#define UINTPTR_MAX __UINTN_MAX(__INTPTR_WIDTH__)
+#define PTRDIFF_MIN  __INTN_MIN(__PTRDIFF_WIDTH__)
+#define PTRDIFF_MAX  __INTN_MAX(__PTRDIFF_WIDTH__)
+#define    SIZE_MAX __UINTN_MAX(__SIZE_WIDTH__)
+
+/* ISO9899:2011 7.20 (C11 Annex K): Define RSIZE_MAX if __STDC_WANT_LIB_EXT1__
+ * is enabled. */
+#if defined(__STDC_WANT_LIB_EXT1__) && __STDC_WANT_LIB_EXT1__ >= 1
+#define   RSIZE_MAX            (SIZE_MAX >> 1)
+#endif
+
+/* C99 7.18.2.5 Limits of greatest-width integer types. */
+#define INTMAX_MIN   __INTN_MIN(__INTMAX_WIDTH__)
+#define INTMAX_MAX   __INTN_MAX(__INTMAX_WIDTH__)
+#define UINTMAX_MAX __UINTN_MAX(__INTMAX_WIDTH__)
+
+/* C99 7.18.3 Limits of other integer types. */
+#define SIG_ATOMIC_MIN __INTN_MIN(__SIG_ATOMIC_WIDTH__)
+#define SIG_ATOMIC_MAX __INTN_MAX(__SIG_ATOMIC_WIDTH__)
+#ifdef __WINT_UNSIGNED__
+# define WINT_MIN       __UINTN_C(__WINT_WIDTH__, 0)
+# define WINT_MAX       __UINTN_MAX(__WINT_WIDTH__)
+#else
+# define WINT_MIN       __INTN_MIN(__WINT_WIDTH__)
+# define WINT_MAX       __INTN_MAX(__WINT_WIDTH__)
+#endif
+
+#ifndef WCHAR_MAX
+# define WCHAR_MAX __WCHAR_MAX__
+#endif
+#ifndef WCHAR_MIN
+# if __WCHAR_MAX__ == __INTN_MAX(__WCHAR_WIDTH__)
+#  define WCHAR_MIN __INTN_MIN(__WCHAR_WIDTH__)
+# else
+#  define WCHAR_MIN __UINTN_C(__WCHAR_WIDTH__, 0)
+# endif
+#endif
+
+/* 7.18.4.2 Macros for greatest-width integer constants. */
+#define INTMAX_C(v)   __INTN_C(__INTMAX_WIDTH__, v)
+#define UINTMAX_C(v) __UINTN_C(__INTMAX_WIDTH__, v)
+
+#endif /* __STDC_HOSTED__ */
+#endif /* __CLANG_STDINT_H */
diff --git a/22.0.1/clang-include/stdnoreturn.h b/22.0.1/clang-include/stdnoreturn.h
new file mode 100644
index 0000000..a7a301d
--- /dev/null
+++ b/22.0.1/clang-include/stdnoreturn.h
@@ -0,0 +1,30 @@
+/*===---- stdnoreturn.h - Standard header for noreturn macro ---------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __STDNORETURN_H
+#define __STDNORETURN_H
+
+#define noreturn _Noreturn
+#define __noreturn_is_defined 1
+
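+/* Editorial illustration (a hedged sketch, not part of the upstream header):
+ * with this header included, a function that never returns can be declared
+ * through the macro:
+ *
+ *   noreturn void fatal_error(const char *msg);   // expands to _Noreturn
+ */
+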
+#endif /* __STDNORETURN_H */
diff --git a/22.0.1/clang-include/tbmintrin.h b/22.0.1/clang-include/tbmintrin.h
new file mode 100644
index 0000000..f95e34f
--- /dev/null
+++ b/22.0.1/clang-include/tbmintrin.h
@@ -0,0 +1,158 @@
+/*===---- tbmintrin.h - TBM intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __TBM__
+#error "TBM instruction set is not enabled"
+#endif
+
+#ifndef __X86INTRIN_H
+#error "Never use <tbmintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __TBMINTRIN_H
+#define __TBMINTRIN_H
+
+#define __bextri_u32(a, b) (__builtin_ia32_bextri_u32((a), (b)))
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+__blcfill_u32(unsigned int a)
+{
+  return a & (a + 1);
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+__blci_u32(unsigned int a)
+{
+  return a | ~(a + 1);
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+__blcic_u32(unsigned int a)
+{
+  return ~a & (a + 1);
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+__blcmsk_u32(unsigned int a)
+{
+  return a ^ (a + 1);
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+__blcs_u32(unsigned int a)
+{
+  return a | (a + 1);
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+__blsfill_u32(unsigned int a)
+{
+  return a | (a - 1);
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+__blsic_u32(unsigned int a)
+{
+  return ~a | (a - 1);
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+__t1mskc_u32(unsigned int a)
+{
+  return ~a | (a + 1);
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+__tzmsk_u32(unsigned int a)
+{
+  return ~a & (a - 1);
+}
+
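+/* Editorial illustration (a hedged sketch, not part of the upstream header):
+ * these helpers are plain bit tricks; for example, with a = 0x18 (binary
+ * 11000):
+ *
+ *   __blsfill_u32(0x18) == 0x1F   // a | (a - 1): fill the trailing zeros
+ *   __tzmsk_u32(0x18)   == 0x07   // ~a & (a - 1): mask of the trailing zeros
+ *   __blcic_u32(0x18)   == 0x01   // ~a & (a + 1): isolate lowest clear bit
+ */
+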
+#ifdef __x86_64__
+#define __bextri_u64(a, b) (__builtin_ia32_bextri_u64((a), (int)(b)))
+
+static __inline__ unsigned long long __attribute__((__always_inline__,
+                                                    __nodebug__))
+__blcfill_u64(unsigned long long a)
+{
+  return a & (a + 1);
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__,
+                                                    __nodebug__))
+__blci_u64(unsigned long long a)
+{
+  return a | ~(a + 1);
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__,
+                                                    __nodebug__))
+__blcic_u64(unsigned long long a)
+{
+  return ~a & (a + 1);
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__,
+                                                    __nodebug__))
+__blcmsk_u64(unsigned long long a)
+{
+  return a ^ (a + 1);
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__,
+                                                    __nodebug__))
+__blcs_u64(unsigned long long a)
+{
+  return a | (a + 1);
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__,
+                                                    __nodebug__))
+__blsfill_u64(unsigned long long a)
+{
+  return a | (a - 1);
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__,
+                                                    __nodebug__))
+__blsic_u64(unsigned long long a)
+{
+  return ~a | (a - 1);
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__,
+                                                    __nodebug__))
+__t1mskc_u64(unsigned long long a)
+{
+  return ~a | (a + 1);
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__,
+                                                    __nodebug__))
+__tzmsk_u64(unsigned long long a)
+{
+  return ~a & (a - 1);
+}
+#endif
+
+#endif /* __TBMINTRIN_H */
diff --git a/22.0.1/clang-include/tgmath.h b/22.0.1/clang-include/tgmath.h
new file mode 100644
index 0000000..a48e267
--- /dev/null
+++ b/22.0.1/clang-include/tgmath.h
@@ -0,0 +1,1374 @@
+/*===---- tgmath.h - Standard header for type generic math ----------------===*\
+ *
+ * Copyright (c) 2009 Howard Hinnant
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+\*===----------------------------------------------------------------------===*/
+
+#ifndef __TGMATH_H
+#define __TGMATH_H
+
+/* C99 7.22 Type-generic math <tgmath.h>. */
+#include <math.h>
+
+/* C++ handles type genericity with overloading in math.h. */
+#ifndef __cplusplus
+#include <complex.h>
+
+#define _TG_ATTRSp __attribute__((__overloadable__))
+#define _TG_ATTRS __attribute__((__overloadable__, __always_inline__))
+
+// promotion
+
+typedef void _Argument_type_is_not_arithmetic;
+static _Argument_type_is_not_arithmetic __tg_promote(...)
+  __attribute__((__unavailable__,__overloadable__));
+static double               _TG_ATTRSp __tg_promote(int);
+static double               _TG_ATTRSp __tg_promote(unsigned int);
+static double               _TG_ATTRSp __tg_promote(long);
+static double               _TG_ATTRSp __tg_promote(unsigned long);
+static double               _TG_ATTRSp __tg_promote(long long);
+static double               _TG_ATTRSp __tg_promote(unsigned long long);
+static float                _TG_ATTRSp __tg_promote(float);
+static double               _TG_ATTRSp __tg_promote(double);
+static long double          _TG_ATTRSp __tg_promote(long double);
+static float _Complex       _TG_ATTRSp __tg_promote(float _Complex);
+static double _Complex      _TG_ATTRSp __tg_promote(double _Complex);
+static long double _Complex _TG_ATTRSp __tg_promote(long double _Complex);
+
+#define __tg_promote1(__x)           (__typeof__(__tg_promote(__x)))
+#define __tg_promote2(__x, __y)      (__typeof__(__tg_promote(__x) + \
+                                                 __tg_promote(__y)))
+#define __tg_promote3(__x, __y, __z) (__typeof__(__tg_promote(__x) + \
+                                                 __tg_promote(__y) + \
+                                                 __tg_promote(__z)))
+
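+/* Editorial illustration (a hedged sketch, not part of the upstream header):
+ * the overload set above only selects a promoted result type; the per-function
+ * macros that follow use it to dispatch on the argument type, e.g.
+ *
+ *   float f = sin(1.0f);              // resolves to sinf
+ *   double _Complex z = sin(1.0*I);   // resolves to csin
+ */
+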
+// acos
+
+static float
+    _TG_ATTRS
+    __tg_acos(float __x) {return acosf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_acos(double __x) {return acos(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_acos(long double __x) {return acosl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_acos(float _Complex __x) {return cacosf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_acos(double _Complex __x) {return cacos(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_acos(long double _Complex __x) {return cacosl(__x);}
+
+#undef acos
+#define acos(__x) __tg_acos(__tg_promote1((__x))(__x))
+
+// asin
+
+static float
+    _TG_ATTRS
+    __tg_asin(float __x) {return asinf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_asin(double __x) {return asin(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_asin(long double __x) {return asinl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_asin(float _Complex __x) {return casinf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_asin(double _Complex __x) {return casin(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_asin(long double _Complex __x) {return casinl(__x);}
+
+#undef asin
+#define asin(__x) __tg_asin(__tg_promote1((__x))(__x))
+
+// atan
+
+static float
+    _TG_ATTRS
+    __tg_atan(float __x) {return atanf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_atan(double __x) {return atan(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_atan(long double __x) {return atanl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_atan(float _Complex __x) {return catanf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_atan(double _Complex __x) {return catan(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_atan(long double _Complex __x) {return catanl(__x);}
+
+#undef atan
+#define atan(__x) __tg_atan(__tg_promote1((__x))(__x))
+
+// acosh
+
+static float
+    _TG_ATTRS
+    __tg_acosh(float __x) {return acoshf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_acosh(double __x) {return acosh(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_acosh(long double __x) {return acoshl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_acosh(float _Complex __x) {return cacoshf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_acosh(double _Complex __x) {return cacosh(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_acosh(long double _Complex __x) {return cacoshl(__x);}
+
+#undef acosh
+#define acosh(__x) __tg_acosh(__tg_promote1((__x))(__x))
+
+// asinh
+
+static float
+    _TG_ATTRS
+    __tg_asinh(float __x) {return asinhf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_asinh(double __x) {return asinh(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_asinh(long double __x) {return asinhl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_asinh(float _Complex __x) {return casinhf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_asinh(double _Complex __x) {return casinh(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_asinh(long double _Complex __x) {return casinhl(__x);}
+
+#undef asinh
+#define asinh(__x) __tg_asinh(__tg_promote1((__x))(__x))
+
+// atanh
+
+static float
+    _TG_ATTRS
+    __tg_atanh(float __x) {return atanhf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_atanh(double __x) {return atanh(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_atanh(long double __x) {return atanhl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_atanh(float _Complex __x) {return catanhf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_atanh(double _Complex __x) {return catanh(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_atanh(long double _Complex __x) {return catanhl(__x);}
+
+#undef atanh
+#define atanh(__x) __tg_atanh(__tg_promote1((__x))(__x))
+
+// cos
+
+static float
+    _TG_ATTRS
+    __tg_cos(float __x) {return cosf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_cos(double __x) {return cos(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_cos(long double __x) {return cosl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_cos(float _Complex __x) {return ccosf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_cos(double _Complex __x) {return ccos(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_cos(long double _Complex __x) {return ccosl(__x);}
+
+#undef cos
+#define cos(__x) __tg_cos(__tg_promote1((__x))(__x))
+
+// sin
+
+static float
+    _TG_ATTRS
+    __tg_sin(float __x) {return sinf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_sin(double __x) {return sin(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_sin(long double __x) {return sinl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_sin(float _Complex __x) {return csinf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_sin(double _Complex __x) {return csin(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_sin(long double _Complex __x) {return csinl(__x);}
+
+#undef sin
+#define sin(__x) __tg_sin(__tg_promote1((__x))(__x))
+
+// tan
+
+static float
+    _TG_ATTRS
+    __tg_tan(float __x) {return tanf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_tan(double __x) {return tan(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_tan(long double __x) {return tanl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_tan(float _Complex __x) {return ctanf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_tan(double _Complex __x) {return ctan(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_tan(long double _Complex __x) {return ctanl(__x);}
+
+#undef tan
+#define tan(__x) __tg_tan(__tg_promote1((__x))(__x))
+
+// cosh
+
+static float
+    _TG_ATTRS
+    __tg_cosh(float __x) {return coshf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_cosh(double __x) {return cosh(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_cosh(long double __x) {return coshl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_cosh(float _Complex __x) {return ccoshf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_cosh(double _Complex __x) {return ccosh(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_cosh(long double _Complex __x) {return ccoshl(__x);}
+
+#undef cosh
+#define cosh(__x) __tg_cosh(__tg_promote1((__x))(__x))
+
+// sinh
+
+static float
+    _TG_ATTRS
+    __tg_sinh(float __x) {return sinhf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_sinh(double __x) {return sinh(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_sinh(long double __x) {return sinhl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_sinh(float _Complex __x) {return csinhf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_sinh(double _Complex __x) {return csinh(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_sinh(long double _Complex __x) {return csinhl(__x);}
+
+#undef sinh
+#define sinh(__x) __tg_sinh(__tg_promote1((__x))(__x))
+
+// tanh
+
+static float
+    _TG_ATTRS
+    __tg_tanh(float __x) {return tanhf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_tanh(double __x) {return tanh(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_tanh(long double __x) {return tanhl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_tanh(float _Complex __x) {return ctanhf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_tanh(double _Complex __x) {return ctanh(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_tanh(long double _Complex __x) {return ctanhl(__x);}
+
+#undef tanh
+#define tanh(__x) __tg_tanh(__tg_promote1((__x))(__x))
+
+// exp
+
+static float
+    _TG_ATTRS
+    __tg_exp(float __x) {return expf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_exp(double __x) {return exp(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_exp(long double __x) {return expl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_exp(float _Complex __x) {return cexpf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_exp(double _Complex __x) {return cexp(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_exp(long double _Complex __x) {return cexpl(__x);}
+
+#undef exp
+#define exp(__x) __tg_exp(__tg_promote1((__x))(__x))
+
+// log
+
+static float
+    _TG_ATTRS
+    __tg_log(float __x) {return logf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_log(double __x) {return log(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_log(long double __x) {return logl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_log(float _Complex __x) {return clogf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_log(double _Complex __x) {return clog(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_log(long double _Complex __x) {return clogl(__x);}
+
+#undef log
+#define log(__x) __tg_log(__tg_promote1((__x))(__x))
+
+// pow
+
+static float
+    _TG_ATTRS
+    __tg_pow(float __x, float __y) {return powf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_pow(double __x, double __y) {return pow(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_pow(long double __x, long double __y) {return powl(__x, __y);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_pow(float _Complex __x, float _Complex __y) {return cpowf(__x, __y);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_pow(double _Complex __x, double _Complex __y) {return cpow(__x, __y);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_pow(long double _Complex __x, long double _Complex __y) 
+    {return cpowl(__x, __y);}
+
+#undef pow
+#define pow(__x, __y) __tg_pow(__tg_promote2((__x), (__y))(__x), \
+                               __tg_promote2((__x), (__y))(__y))
+
+// sqrt
+
+static float
+    _TG_ATTRS
+    __tg_sqrt(float __x) {return sqrtf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_sqrt(double __x) {return sqrt(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_sqrt(long double __x) {return sqrtl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_sqrt(float _Complex __x) {return csqrtf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_sqrt(double _Complex __x) {return csqrt(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_sqrt(long double _Complex __x) {return csqrtl(__x);}
+
+#undef sqrt
+#define sqrt(__x) __tg_sqrt(__tg_promote1((__x))(__x))
+
+// fabs
+
+static float
+    _TG_ATTRS
+    __tg_fabs(float __x) {return fabsf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_fabs(double __x) {return fabs(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_fabs(long double __x) {return fabsl(__x);}
+
+static float
+    _TG_ATTRS
+    __tg_fabs(float _Complex __x) {return cabsf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_fabs(double _Complex __x) {return cabs(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_fabs(long double _Complex __x) {return cabsl(__x);}
+
+#undef fabs
+#define fabs(__x) __tg_fabs(__tg_promote1((__x))(__x))
+
+// atan2
+
+static float
+    _TG_ATTRS
+    __tg_atan2(float __x, float __y) {return atan2f(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_atan2(double __x, double __y) {return atan2(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_atan2(long double __x, long double __y) {return atan2l(__x, __y);}
+
+#undef atan2
+#define atan2(__x, __y) __tg_atan2(__tg_promote2((__x), (__y))(__x), \
+                                   __tg_promote2((__x), (__y))(__y))
+
+// cbrt
+
+static float
+    _TG_ATTRS
+    __tg_cbrt(float __x) {return cbrtf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_cbrt(double __x) {return cbrt(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_cbrt(long double __x) {return cbrtl(__x);}
+
+#undef cbrt
+#define cbrt(__x) __tg_cbrt(__tg_promote1((__x))(__x))
+
+// ceil
+
+static float
+    _TG_ATTRS
+    __tg_ceil(float __x) {return ceilf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_ceil(double __x) {return ceil(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_ceil(long double __x) {return ceill(__x);}
+
+#undef ceil
+#define ceil(__x) __tg_ceil(__tg_promote1((__x))(__x))
+
+// copysign
+
+static float
+    _TG_ATTRS
+    __tg_copysign(float __x, float __y) {return copysignf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_copysign(double __x, double __y) {return copysign(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_copysign(long double __x, long double __y) {return copysignl(__x, __y);}
+
+#undef copysign
+#define copysign(__x, __y) __tg_copysign(__tg_promote2((__x), (__y))(__x), \
+                                         __tg_promote2((__x), (__y))(__y))
+
+// erf
+
+static float
+    _TG_ATTRS
+    __tg_erf(float __x) {return erff(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_erf(double __x) {return erf(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_erf(long double __x) {return erfl(__x);}
+
+#undef erf
+#define erf(__x) __tg_erf(__tg_promote1((__x))(__x))
+
+// erfc
+
+static float
+    _TG_ATTRS
+    __tg_erfc(float __x) {return erfcf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_erfc(double __x) {return erfc(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_erfc(long double __x) {return erfcl(__x);}
+
+#undef erfc
+#define erfc(__x) __tg_erfc(__tg_promote1((__x))(__x))
+
+// exp2
+
+static float
+    _TG_ATTRS
+    __tg_exp2(float __x) {return exp2f(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_exp2(double __x) {return exp2(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_exp2(long double __x) {return exp2l(__x);}
+
+#undef exp2
+#define exp2(__x) __tg_exp2(__tg_promote1((__x))(__x))
+
+// expm1
+
+static float
+    _TG_ATTRS
+    __tg_expm1(float __x) {return expm1f(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_expm1(double __x) {return expm1(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_expm1(long double __x) {return expm1l(__x);}
+
+#undef expm1
+#define expm1(__x) __tg_expm1(__tg_promote1((__x))(__x))
+
+// fdim
+
+static float
+    _TG_ATTRS
+    __tg_fdim(float __x, float __y) {return fdimf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_fdim(double __x, double __y) {return fdim(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_fdim(long double __x, long double __y) {return fdiml(__x, __y);}
+
+#undef fdim
+#define fdim(__x, __y) __tg_fdim(__tg_promote2((__x), (__y))(__x), \
+                                 __tg_promote2((__x), (__y))(__y))
+
+// floor
+
+static float
+    _TG_ATTRS
+    __tg_floor(float __x) {return floorf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_floor(double __x) {return floor(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_floor(long double __x) {return floorl(__x);}
+
+#undef floor
+#define floor(__x) __tg_floor(__tg_promote1((__x))(__x))
+
+// fma
+
+static float
+    _TG_ATTRS
+    __tg_fma(float __x, float __y, float __z)
+    {return fmaf(__x, __y, __z);}
+
+static double
+    _TG_ATTRS
+    __tg_fma(double __x, double __y, double __z)
+    {return fma(__x, __y, __z);}
+
+static long double
+    _TG_ATTRS
+    __tg_fma(long double __x,long double __y, long double __z)
+    {return fmal(__x, __y, __z);}
+
+#undef fma
+#define fma(__x, __y, __z)                                \
+        __tg_fma(__tg_promote3((__x), (__y), (__z))(__x), \
+                 __tg_promote3((__x), (__y), (__z))(__y), \
+                 __tg_promote3((__x), (__y), (__z))(__z))
+
+// fmax
+
+static float
+    _TG_ATTRS
+    __tg_fmax(float __x, float __y) {return fmaxf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_fmax(double __x, double __y) {return fmax(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_fmax(long double __x, long double __y) {return fmaxl(__x, __y);}
+
+#undef fmax
+#define fmax(__x, __y) __tg_fmax(__tg_promote2((__x), (__y))(__x), \
+                                 __tg_promote2((__x), (__y))(__y))
+
+// fmin
+
+static float
+    _TG_ATTRS
+    __tg_fmin(float __x, float __y) {return fminf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_fmin(double __x, double __y) {return fmin(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_fmin(long double __x, long double __y) {return fminl(__x, __y);}
+
+#undef fmin
+#define fmin(__x, __y) __tg_fmin(__tg_promote2((__x), (__y))(__x), \
+                                 __tg_promote2((__x), (__y))(__y))
+
+// fmod
+
+static float
+    _TG_ATTRS
+    __tg_fmod(float __x, float __y) {return fmodf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_fmod(double __x, double __y) {return fmod(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_fmod(long double __x, long double __y) {return fmodl(__x, __y);}
+
+#undef fmod
+#define fmod(__x, __y) __tg_fmod(__tg_promote2((__x), (__y))(__x), \
+                                 __tg_promote2((__x), (__y))(__y))
+
+// frexp
+
+static float
+    _TG_ATTRS
+    __tg_frexp(float __x, int* __y) {return frexpf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_frexp(double __x, int* __y) {return frexp(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_frexp(long double __x, int* __y) {return frexpl(__x, __y);}
+
+#undef frexp
+#define frexp(__x, __y) __tg_frexp(__tg_promote1((__x))(__x), __y)
+
+// hypot
+
+static float
+    _TG_ATTRS
+    __tg_hypot(float __x, float __y) {return hypotf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_hypot(double __x, double __y) {return hypot(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_hypot(long double __x, long double __y) {return hypotl(__x, __y);}
+
+#undef hypot
+#define hypot(__x, __y) __tg_hypot(__tg_promote2((__x), (__y))(__x), \
+                                   __tg_promote2((__x), (__y))(__y))
+
+// ilogb
+
+static int
+    _TG_ATTRS
+    __tg_ilogb(float __x) {return ilogbf(__x);}
+
+static int
+    _TG_ATTRS
+    __tg_ilogb(double __x) {return ilogb(__x);}
+
+static int
+    _TG_ATTRS
+    __tg_ilogb(long double __x) {return ilogbl(__x);}
+
+#undef ilogb
+#define ilogb(__x) __tg_ilogb(__tg_promote1((__x))(__x))
+
+// ldexp
+
+static float
+    _TG_ATTRS
+    __tg_ldexp(float __x, int __y) {return ldexpf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_ldexp(double __x, int __y) {return ldexp(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_ldexp(long double __x, int __y) {return ldexpl(__x, __y);}
+
+#undef ldexp
+#define ldexp(__x, __y) __tg_ldexp(__tg_promote1((__x))(__x), __y)
+
+// lgamma
+
+static float
+    _TG_ATTRS
+    __tg_lgamma(float __x) {return lgammaf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_lgamma(double __x) {return lgamma(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_lgamma(long double __x) {return lgammal(__x);}
+
+#undef lgamma
+#define lgamma(__x) __tg_lgamma(__tg_promote1((__x))(__x))
+
+// llrint
+
+static long long
+    _TG_ATTRS
+    __tg_llrint(float __x) {return llrintf(__x);}
+
+static long long
+    _TG_ATTRS
+    __tg_llrint(double __x) {return llrint(__x);}
+
+static long long
+    _TG_ATTRS
+    __tg_llrint(long double __x) {return llrintl(__x);}
+
+#undef llrint
+#define llrint(__x) __tg_llrint(__tg_promote1((__x))(__x))
+
+// llround
+
+static long long
+    _TG_ATTRS
+    __tg_llround(float __x) {return llroundf(__x);}
+
+static long long
+    _TG_ATTRS
+    __tg_llround(double __x) {return llround(__x);}
+
+static long long
+    _TG_ATTRS
+    __tg_llround(long double __x) {return llroundl(__x);}
+
+#undef llround
+#define llround(__x) __tg_llround(__tg_promote1((__x))(__x))
+
+// log10
+
+static float
+    _TG_ATTRS
+    __tg_log10(float __x) {return log10f(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_log10(double __x) {return log10(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_log10(long double __x) {return log10l(__x);}
+
+#undef log10
+#define log10(__x) __tg_log10(__tg_promote1((__x))(__x))
+
+// log1p
+
+static float
+    _TG_ATTRS
+    __tg_log1p(float __x) {return log1pf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_log1p(double __x) {return log1p(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_log1p(long double __x) {return log1pl(__x);}
+
+#undef log1p
+#define log1p(__x) __tg_log1p(__tg_promote1((__x))(__x))
+
+// log2
+
+static float
+    _TG_ATTRS
+    __tg_log2(float __x) {return log2f(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_log2(double __x) {return log2(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_log2(long double __x) {return log2l(__x);}
+
+#undef log2
+#define log2(__x) __tg_log2(__tg_promote1((__x))(__x))
+
+// logb
+
+static float
+    _TG_ATTRS
+    __tg_logb(float __x) {return logbf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_logb(double __x) {return logb(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_logb(long double __x) {return logbl(__x);}
+
+#undef logb
+#define logb(__x) __tg_logb(__tg_promote1((__x))(__x))
+
+// lrint
+
+static long
+    _TG_ATTRS
+    __tg_lrint(float __x) {return lrintf(__x);}
+
+static long
+    _TG_ATTRS
+    __tg_lrint(double __x) {return lrint(__x);}
+
+static long
+    _TG_ATTRS
+    __tg_lrint(long double __x) {return lrintl(__x);}
+
+#undef lrint
+#define lrint(__x) __tg_lrint(__tg_promote1((__x))(__x))
+
+// lround
+
+static long
+    _TG_ATTRS
+    __tg_lround(float __x) {return lroundf(__x);}
+
+static long
+    _TG_ATTRS
+    __tg_lround(double __x) {return lround(__x);}
+
+static long
+    _TG_ATTRS
+    __tg_lround(long double __x) {return lroundl(__x);}
+
+#undef lround
+#define lround(__x) __tg_lround(__tg_promote1((__x))(__x))
+
+// nearbyint
+
+static float
+    _TG_ATTRS
+    __tg_nearbyint(float __x) {return nearbyintf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_nearbyint(double __x) {return nearbyint(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_nearbyint(long double __x) {return nearbyintl(__x);}
+
+#undef nearbyint
+#define nearbyint(__x) __tg_nearbyint(__tg_promote1((__x))(__x))
+
+// nextafter
+
+static float
+    _TG_ATTRS
+    __tg_nextafter(float __x, float __y) {return nextafterf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_nextafter(double __x, double __y) {return nextafter(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_nextafter(long double __x, long double __y) {return nextafterl(__x, __y);}
+
+#undef nextafter
+#define nextafter(__x, __y) __tg_nextafter(__tg_promote2((__x), (__y))(__x), \
+                                           __tg_promote2((__x), (__y))(__y))
+
+// nexttoward
+
+static float
+    _TG_ATTRS
+    __tg_nexttoward(float __x, long double __y) {return nexttowardf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_nexttoward(double __x, long double __y) {return nexttoward(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_nexttoward(long double __x, long double __y) {return nexttowardl(__x, __y);}
+
+#undef nexttoward
+#define nexttoward(__x, __y) __tg_nexttoward(__tg_promote1((__x))(__x), (__y))
+
+// remainder
+
+static float
+    _TG_ATTRS
+    __tg_remainder(float __x, float __y) {return remainderf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_remainder(double __x, double __y) {return remainder(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_remainder(long double __x, long double __y) {return remainderl(__x, __y);}
+
+#undef remainder
+#define remainder(__x, __y) __tg_remainder(__tg_promote2((__x), (__y))(__x), \
+                                           __tg_promote2((__x), (__y))(__y))
+
+// remquo
+
+static float
+    _TG_ATTRS
+    __tg_remquo(float __x, float __y, int* __z)
+    {return remquof(__x, __y, __z);}
+
+static double
+    _TG_ATTRS
+    __tg_remquo(double __x, double __y, int* __z)
+    {return remquo(__x, __y, __z);}
+
+static long double
+    _TG_ATTRS
+    __tg_remquo(long double __x,long double __y, int* __z)
+    {return remquol(__x, __y, __z);}
+
+#undef remquo
+#define remquo(__x, __y, __z)                         \
+        __tg_remquo(__tg_promote2((__x), (__y))(__x), \
+                    __tg_promote2((__x), (__y))(__y), \
+                    (__z))
+
+// rint
+
+static float
+    _TG_ATTRS
+    __tg_rint(float __x) {return rintf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_rint(double __x) {return rint(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_rint(long double __x) {return rintl(__x);}
+
+#undef rint
+#define rint(__x) __tg_rint(__tg_promote1((__x))(__x))
+
+// round
+
+static float
+    _TG_ATTRS
+    __tg_round(float __x) {return roundf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_round(double __x) {return round(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_round(long double __x) {return roundl(__x);}
+
+#undef round
+#define round(__x) __tg_round(__tg_promote1((__x))(__x))
+
+// scalbn
+
+static float
+    _TG_ATTRS
+    __tg_scalbn(float __x, int __y) {return scalbnf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_scalbn(double __x, int __y) {return scalbn(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_scalbn(long double __x, int __y) {return scalbnl(__x, __y);}
+
+#undef scalbn
+#define scalbn(__x, __y) __tg_scalbn(__tg_promote1((__x))(__x), __y)
+
+// scalbln
+
+static float
+    _TG_ATTRS
+    __tg_scalbln(float __x, long __y) {return scalblnf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_scalbln(double __x, long __y) {return scalbln(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_scalbln(long double __x, long __y) {return scalblnl(__x, __y);}
+
+#undef scalbln
+#define scalbln(__x, __y) __tg_scalbln(__tg_promote1((__x))(__x), __y)
+
+// tgamma
+
+static float
+    _TG_ATTRS
+    __tg_tgamma(float __x) {return tgammaf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_tgamma(double __x) {return tgamma(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_tgamma(long double __x) {return tgammal(__x);}
+
+#undef tgamma
+#define tgamma(__x) __tg_tgamma(__tg_promote1((__x))(__x))
+
+// trunc
+
+static float
+    _TG_ATTRS
+    __tg_trunc(float __x) {return truncf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_trunc(double __x) {return trunc(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_trunc(long double __x) {return truncl(__x);}
+
+#undef trunc
+#define trunc(__x) __tg_trunc(__tg_promote1((__x))(__x))
+
+// carg
+
+static float
+    _TG_ATTRS
+    __tg_carg(float __x) {return atan2f(0.F, __x);}
+
+static double
+    _TG_ATTRS
+    __tg_carg(double __x) {return atan2(0., __x);}
+
+static long double
+    _TG_ATTRS
+    __tg_carg(long double __x) {return atan2l(0.L, __x);}
+
+static float
+    _TG_ATTRS
+    __tg_carg(float _Complex __x) {return cargf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_carg(double _Complex __x) {return carg(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_carg(long double _Complex __x) {return cargl(__x);}
+
+#undef carg
+#define carg(__x) __tg_carg(__tg_promote1((__x))(__x))
+
+// cimag
+
+static float
+    _TG_ATTRS
+    __tg_cimag(float __x) {return 0;}
+
+static double
+    _TG_ATTRS
+    __tg_cimag(double __x) {return 0;}
+
+static long double
+    _TG_ATTRS
+    __tg_cimag(long double __x) {return 0;}
+
+static float
+    _TG_ATTRS
+    __tg_cimag(float _Complex __x) {return cimagf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_cimag(double _Complex __x) {return cimag(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_cimag(long double _Complex __x) {return cimagl(__x);}
+
+#undef cimag
+#define cimag(__x) __tg_cimag(__tg_promote1((__x))(__x))
+
+// conj
+
+static float _Complex
+    _TG_ATTRS
+    __tg_conj(float __x) {return __x;}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_conj(double __x) {return __x;}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_conj(long double __x) {return __x;}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_conj(float _Complex __x) {return conjf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_conj(double _Complex __x) {return conj(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_conj(long double _Complex __x) {return conjl(__x);}
+
+#undef conj
+#define conj(__x) __tg_conj(__tg_promote1((__x))(__x))
+
+// cproj
+
+static float _Complex
+    _TG_ATTRS
+    __tg_cproj(float __x) {return cprojf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_cproj(double __x) {return cproj(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_cproj(long double __x) {return cprojl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_cproj(float _Complex __x) {return cprojf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_cproj(double _Complex __x) {return cproj(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_cproj(long double _Complex __x) {return cprojl(__x);}
+
+#undef cproj
+#define cproj(__x) __tg_cproj(__tg_promote1((__x))(__x))
+
+// creal
+
+static float
+    _TG_ATTRS
+    __tg_creal(float __x) {return __x;}
+
+static double
+    _TG_ATTRS
+    __tg_creal(double __x) {return __x;}
+
+static long double
+    _TG_ATTRS
+    __tg_creal(long double __x) {return __x;}
+
+static float
+    _TG_ATTRS
+    __tg_creal(float _Complex __x) {return crealf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_creal(double _Complex __x) {return creal(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_creal(long double _Complex __x) {return creall(__x);}
+
+#undef creal
+#define creal(__x) __tg_creal(__tg_promote1((__x))(__x))
+
+#undef _TG_ATTRSp
+#undef _TG_ATTRS
+
+#endif /* __cplusplus */
+#endif /* __TGMATH_H */
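
Illustrative only, not part of the patch: a minimal C sketch of the type-generic dispatch that the tgmath.h macros above provide (the overload definitions shown implement the C++ side; in C the same macro names select the f/none/l-suffixed functions). The main() scaffolding, printf output, and build flags are assumptions for the example.

    #include <complex.h>
    #include <tgmath.h>
    #include <stdio.h>

    int main(void) {
        float f = 2.5f;
        double d = 2.5;
        long double ld = 2.5L;
        double _Complex z = 3.0 + 4.0 * I;

        /* Each call expands to the suffixed function matching its argument type. */
        printf("%f\n",  (double)round(f));      /* dispatches to roundf     */
        printf("%f\n",  remainder(d, 0.75));    /* dispatches to remainder  */
        printf("%Lf\n", trunc(ld));             /* dispatches to truncl     */
        printf("%f\n",  creal(z));              /* dispatches to creal      */
        printf("%f\n",  carg(z));               /* dispatches to carg       */
        return 0;
    }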
diff --git a/22.0.1/clang-include/tmmintrin.h b/22.0.1/clang-include/tmmintrin.h
new file mode 100644
index 0000000..4238f5b
--- /dev/null
+++ b/22.0.1/clang-include/tmmintrin.h
@@ -0,0 +1,225 @@
+/*===---- tmmintrin.h - SSSE3 intrinsics -----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+ 
+#ifndef __TMMINTRIN_H
+#define __TMMINTRIN_H
+
+#ifndef __SSSE3__
+#error "SSSE3 instruction set not enabled"
+#else
+
+#include <pmmintrin.h>
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_abs_pi8(__m64 __a)
+{
+    return (__m64)__builtin_ia32_pabsb((__v8qi)__a);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_abs_epi8(__m128i __a)
+{
+    return (__m128i)__builtin_ia32_pabsb128((__v16qi)__a);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_abs_pi16(__m64 __a)
+{
+    return (__m64)__builtin_ia32_pabsw((__v4hi)__a);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_abs_epi16(__m128i __a)
+{
+    return (__m128i)__builtin_ia32_pabsw128((__v8hi)__a);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_abs_pi32(__m64 __a)
+{
+    return (__m64)__builtin_ia32_pabsd((__v2si)__a);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_abs_epi32(__m128i __a)
+{
+    return (__m128i)__builtin_ia32_pabsd128((__v4si)__a);
+}
+
+#define _mm_alignr_epi8(a, b, n) __extension__ ({ \
+  __m128i __a = (a); \
+  __m128i __b = (b); \
+  (__m128i)__builtin_ia32_palignr128((__v16qi)__a, (__v16qi)__b, (n)); })
+
+#define _mm_alignr_pi8(a, b, n) __extension__ ({ \
+  __m64 __a = (a); \
+  __m64 __b = (b); \
+  (__m64)__builtin_ia32_palignr((__v8qi)__a, (__v8qi)__b, (n)); })
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_hadd_epi16(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_phaddw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_hadd_epi32(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_phaddd128((__v4si)__a, (__v4si)__b);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_hadd_pi16(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_phaddw((__v4hi)__a, (__v4hi)__b);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_hadd_pi32(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_phaddd((__v2si)__a, (__v2si)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_hadds_epi16(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_phaddsw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_hadds_pi16(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_phaddsw((__v4hi)__a, (__v4hi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_hsub_epi16(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_phsubw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_hsub_epi32(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_phsubd128((__v4si)__a, (__v4si)__b);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_hsub_pi16(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_phsubw((__v4hi)__a, (__v4hi)__b);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_hsub_pi32(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_phsubd((__v2si)__a, (__v2si)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_hsubs_epi16(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_phsubsw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_hsubs_pi16(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_phsubsw((__v4hi)__a, (__v4hi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_maddubs_epi16(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_maddubs_pi16(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_pmaddubsw((__v8qi)__a, (__v8qi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_mulhrs_epi16(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_mulhrs_pi16(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_pmulhrsw((__v4hi)__a, (__v4hi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_shuffle_epi8(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_pshufb128((__v16qi)__a, (__v16qi)__b);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_shuffle_pi8(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_pshufb((__v8qi)__a, (__v8qi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sign_epi8(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_psignb128((__v16qi)__a, (__v16qi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sign_epi16(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_psignw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sign_epi32(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_psignd128((__v4si)__a, (__v4si)__b);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_sign_pi8(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_psignb((__v8qi)__a, (__v8qi)__b);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_sign_pi16(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_psignw((__v4hi)__a, (__v4hi)__b);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_sign_pi32(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_psignd((__v2si)__a, (__v2si)__b);
+}
+
+#endif /* __SSSE3__ */
+
+#endif /* __TMMINTRIN_H */
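
Illustrative only, not part of the patch: a short C sketch exercising a couple of the SSSE3 intrinsics declared above. The set/store helpers come from emmintrin.h, which tmmintrin.h pulls in via pmmintrin.h; the main() scaffolding and the build flags (e.g. clang -mssse3) are assumptions for the example.

    #include <tmmintrin.h>
    #include <stdio.h>

    int main(void) {
        __m128i a = _mm_set_epi16(-1, 2, -3, 4, -5, 6, -7, 8);
        __m128i b = _mm_set_epi16( 1, 1,  1, 1,  1, 1,  1, 1);

        __m128i abs_a = _mm_abs_epi16(a);      /* element-wise absolute value   */
        __m128i hadd  = _mm_hadd_epi16(a, b);  /* horizontal pairwise addition  */

        short out[8];
        _mm_storeu_si128((__m128i *)out, abs_a);
        printf("|a[0]| = %d\n", out[0]);

        _mm_storeu_si128((__m128i *)out, hadd);
        printf("hadd[0] = %d\n", out[0]);
        return 0;
    }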
diff --git a/22.0.1/clang-include/unwind.h b/22.0.1/clang-include/unwind.h
new file mode 100644
index 0000000..685c1df
--- /dev/null
+++ b/22.0.1/clang-include/unwind.h
@@ -0,0 +1,280 @@
+/*===---- unwind.h - Stack unwinding ----------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/* See "Data Definitions for libgcc_s" in the Linux Standard Base.*/
+
+#ifndef __CLANG_UNWIND_H
+#define __CLANG_UNWIND_H
+
+#if __has_include_next(<unwind.h>)
+/* Darwin (from 11.x on) and libunwind provide an unwind.h. If that's available,
+ * use it. libunwind wraps some of its definitions in #ifdef _GNU_SOURCE,
+ * so define that around the include.*/
+# ifndef _GNU_SOURCE
+#  define _SHOULD_UNDEFINE_GNU_SOURCE
+#  define _GNU_SOURCE
+# endif
+// libunwind's unwind.h reflects the current visibility.  However, Mozilla
+// builds with -fvisibility=hidden and relies on gcc's unwind.h to reset the
+// visibility to default and export its contents.  gcc also allows users to
+// override its override by #defining HIDE_EXPORTS (but note, this only obeys
+// the user's -fvisibility setting; it doesn't hide any exports on its own).  We
+// imitate gcc's header here:
+# ifdef HIDE_EXPORTS
+#  include_next <unwind.h>
+# else
+#  pragma GCC visibility push(default)
+#  include_next <unwind.h>
+#  pragma GCC visibility pop
+# endif
+# ifdef _SHOULD_UNDEFINE_GNU_SOURCE
+#  undef _GNU_SOURCE
+#  undef _SHOULD_UNDEFINE_GNU_SOURCE
+# endif
+#else
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* It is a bit strange for a header to play with the visibility of the
+   symbols it declares, but this matches gcc's behavior and some programs
+   depend on it */
+#ifndef HIDE_EXPORTS
+#pragma GCC visibility push(default)
+#endif
+
+typedef uintptr_t _Unwind_Word;
+typedef intptr_t _Unwind_Sword;
+typedef uintptr_t _Unwind_Ptr;
+typedef uintptr_t _Unwind_Internal_Ptr;
+typedef uint64_t _Unwind_Exception_Class;
+
+typedef intptr_t _sleb128_t;
+typedef uintptr_t _uleb128_t;
+
+struct _Unwind_Context;
+struct _Unwind_Exception;
+typedef enum {
+  _URC_NO_REASON = 0,
+  _URC_FOREIGN_EXCEPTION_CAUGHT = 1,
+
+  _URC_FATAL_PHASE2_ERROR = 2,
+  _URC_FATAL_PHASE1_ERROR = 3,
+  _URC_NORMAL_STOP = 4,
+
+  _URC_END_OF_STACK = 5,
+  _URC_HANDLER_FOUND = 6,
+  _URC_INSTALL_CONTEXT = 7,
+  _URC_CONTINUE_UNWIND = 8
+} _Unwind_Reason_Code;
+
+typedef enum {
+  _UA_SEARCH_PHASE = 1,
+  _UA_CLEANUP_PHASE = 2,
+
+  _UA_HANDLER_FRAME = 4,
+  _UA_FORCE_UNWIND = 8,
+  _UA_END_OF_STACK = 16 /* gcc extension to C++ ABI */
+} _Unwind_Action;
+
+typedef void (*_Unwind_Exception_Cleanup_Fn)(_Unwind_Reason_Code,
+                                             struct _Unwind_Exception *);
+
+struct _Unwind_Exception {
+  _Unwind_Exception_Class exception_class;
+  _Unwind_Exception_Cleanup_Fn exception_cleanup;
+  _Unwind_Word private_1;
+  _Unwind_Word private_2;
+  /* The Itanium ABI requires that _Unwind_Exception objects are "double-word
+   * aligned".  GCC has interpreted this to mean "use the maximum useful
+   * alignment for the target"; so do we. */
+} __attribute__((__aligned__));
+
+typedef _Unwind_Reason_Code (*_Unwind_Stop_Fn)(int, _Unwind_Action,
+                                               _Unwind_Exception_Class,
+                                               struct _Unwind_Exception *,
+                                               struct _Unwind_Context *,
+                                               void *);
+
+typedef _Unwind_Reason_Code (*_Unwind_Personality_Fn)(
+    int, _Unwind_Action, _Unwind_Exception_Class, struct _Unwind_Exception *,
+    struct _Unwind_Context *);
+typedef _Unwind_Personality_Fn __personality_routine;
+
+typedef _Unwind_Reason_Code (*_Unwind_Trace_Fn)(struct _Unwind_Context *,
+                                                void *);
+
+#if defined(__arm__) && !defined(__APPLE__)
+
+typedef enum {
+  _UVRSC_CORE = 0,        /* integer register */
+  _UVRSC_VFP = 1,         /* vfp */
+  _UVRSC_WMMXD = 3,       /* Intel WMMX data register */
+  _UVRSC_WMMXC = 4        /* Intel WMMX control register */
+} _Unwind_VRS_RegClass;
+
+typedef enum {
+  _UVRSD_UINT32 = 0,
+  _UVRSD_VFPX = 1,
+  _UVRSD_UINT64 = 3,
+  _UVRSD_FLOAT = 4,
+  _UVRSD_DOUBLE = 5
+} _Unwind_VRS_DataRepresentation;
+
+typedef enum {
+  _UVRSR_OK = 0,
+  _UVRSR_NOT_IMPLEMENTED = 1,
+  _UVRSR_FAILED = 2
+} _Unwind_VRS_Result;
+
+_Unwind_VRS_Result _Unwind_VRS_Get(struct _Unwind_Context *__context,
+  _Unwind_VRS_RegClass __regclass,
+  uint32_t __regno,
+  _Unwind_VRS_DataRepresentation __representation,
+  void *__valuep);
+
+_Unwind_VRS_Result _Unwind_VRS_Set(struct _Unwind_Context *__context,
+  _Unwind_VRS_RegClass __regclass,
+  uint32_t __regno,
+  _Unwind_VRS_DataRepresentation __representation,
+  void *__valuep);
+
+static __inline__
+_Unwind_Word _Unwind_GetGR(struct _Unwind_Context *__context, int __index) {
+  _Unwind_Word __value;
+  _Unwind_VRS_Get(__context, _UVRSC_CORE, __index, _UVRSD_UINT32, &__value);
+  return __value;
+}
+
+static __inline__
+void _Unwind_SetGR(struct _Unwind_Context *__context, int __index,
+                   _Unwind_Word __value) {
+  _Unwind_VRS_Set(__context, _UVRSC_CORE, __index, _UVRSD_UINT32, &__value);
+}
+
+static __inline__
+_Unwind_Word _Unwind_GetIP(struct _Unwind_Context *__context) {
+  _Unwind_Word __ip = _Unwind_GetGR(__context, 15);
+  return __ip & ~(_Unwind_Word)(0x1); /* Remove thumb mode bit. */
+}
+
+static __inline__
+void _Unwind_SetIP(struct _Unwind_Context *__context, _Unwind_Word __value) {
+  _Unwind_Word __thumb_mode_bit = _Unwind_GetGR(__context, 15) & 0x1;
+  _Unwind_SetGR(__context, 15, __value | __thumb_mode_bit);
+}
+#else
+_Unwind_Word _Unwind_GetGR(struct _Unwind_Context *, int);
+void _Unwind_SetGR(struct _Unwind_Context *, int, _Unwind_Word);
+
+_Unwind_Word _Unwind_GetIP(struct _Unwind_Context *);
+void _Unwind_SetIP(struct _Unwind_Context *, _Unwind_Word);
+#endif
+
+
+_Unwind_Word _Unwind_GetIPInfo(struct _Unwind_Context *, int *);
+
+_Unwind_Word _Unwind_GetCFA(struct _Unwind_Context *);
+
+void *_Unwind_GetLanguageSpecificData(struct _Unwind_Context *);
+
+_Unwind_Ptr _Unwind_GetRegionStart(struct _Unwind_Context *);
+
+/* DWARF EH functions; currently not available on Darwin/ARM */
+#if !defined(__APPLE__) || !defined(__arm__)
+
+_Unwind_Reason_Code _Unwind_RaiseException(struct _Unwind_Exception *);
+_Unwind_Reason_Code _Unwind_ForcedUnwind(struct _Unwind_Exception *,
+                                         _Unwind_Stop_Fn, void *);
+void _Unwind_DeleteException(struct _Unwind_Exception *);
+void _Unwind_Resume(struct _Unwind_Exception *);
+_Unwind_Reason_Code _Unwind_Resume_or_Rethrow(struct _Unwind_Exception *);
+
+#endif
+
+_Unwind_Reason_Code _Unwind_Backtrace(_Unwind_Trace_Fn, void *);
+
+/* setjmp(3)/longjmp(3) stuff */
+typedef struct SjLj_Function_Context *_Unwind_FunctionContext_t;
+
+void _Unwind_SjLj_Register(_Unwind_FunctionContext_t);
+void _Unwind_SjLj_Unregister(_Unwind_FunctionContext_t);
+_Unwind_Reason_Code _Unwind_SjLj_RaiseException(struct _Unwind_Exception *);
+_Unwind_Reason_Code _Unwind_SjLj_ForcedUnwind(struct _Unwind_Exception *,
+                                              _Unwind_Stop_Fn, void *);
+void _Unwind_SjLj_Resume(struct _Unwind_Exception *);
+_Unwind_Reason_Code _Unwind_SjLj_Resume_or_Rethrow(struct _Unwind_Exception *);
+
+void *_Unwind_FindEnclosingFunction(void *);
+
+#ifdef __APPLE__
+
+_Unwind_Ptr _Unwind_GetDataRelBase(struct _Unwind_Context *)
+    __attribute__((unavailable));
+_Unwind_Ptr _Unwind_GetTextRelBase(struct _Unwind_Context *)
+    __attribute__((unavailable));
+
+/* Darwin-specific functions */
+void __register_frame(const void *);
+void __deregister_frame(const void *);
+
+struct dwarf_eh_bases {
+  uintptr_t tbase;
+  uintptr_t dbase;
+  uintptr_t func;
+};
+void *_Unwind_Find_FDE(const void *, struct dwarf_eh_bases *);
+
+void __register_frame_info_bases(const void *, void *, void *, void *)
+  __attribute__((unavailable));
+void __register_frame_info(const void *, void *) __attribute__((unavailable));
+void __register_frame_info_table_bases(const void *, void*, void *, void *)
+  __attribute__((unavailable));
+void __register_frame_info_table(const void *, void *)
+  __attribute__((unavailable));
+void __register_frame_table(const void *) __attribute__((unavailable));
+void __deregister_frame_info(const void *) __attribute__((unavailable));
+void __deregister_frame_info_bases(const void *)__attribute__((unavailable));
+
+#else
+
+_Unwind_Ptr _Unwind_GetDataRelBase(struct _Unwind_Context *);
+_Unwind_Ptr _Unwind_GetTextRelBase(struct _Unwind_Context *);
+
+#endif
+
+
+#ifndef HIDE_EXPORTS
+#pragma GCC visibility pop
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
+#endif /* __CLANG_UNWIND_H */
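
Illustrative only, not part of the patch: a minimal backtrace using the _Unwind_Backtrace / _Unwind_GetIP API declared above. The callback, main() scaffolding, and printf output are assumptions for the example; some targets may also need -funwind-tables for complete unwinding.

    #include <unwind.h>
    #include <stdio.h>

    static _Unwind_Reason_Code trace_cb(struct _Unwind_Context *ctx, void *arg) {
        int *depth = (int *)arg;
        /* Print the instruction pointer of each frame as we unwind. */
        printf("#%d  ip=%p\n", (*depth)++, (void *)_Unwind_GetIP(ctx));
        return _URC_NO_REASON;  /* keep unwinding */
    }

    static void show_backtrace(void) {
        int depth = 0;
        _Unwind_Backtrace(trace_cb, &depth);
    }

    int main(void) {
        show_backtrace();
        return 0;
    }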
diff --git a/22.0.1/clang-include/varargs.h b/22.0.1/clang-include/varargs.h
new file mode 100644
index 0000000..b5477d0
--- /dev/null
+++ b/22.0.1/clang-include/varargs.h
@@ -0,0 +1,26 @@
+/*===---- varargs.h - Variable argument handling -------------------------------------===
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+* THE SOFTWARE.
+*
+*===-----------------------------------------------------------------------===
+*/
+#ifndef __VARARGS_H
+#define __VARARGS_H
+  #error "Please use <stdarg.h> instead of <varargs.h>"
+#endif
diff --git a/22.0.1/clang-include/wmmintrin.h b/22.0.1/clang-include/wmmintrin.h
new file mode 100644
index 0000000..369e3c2
--- /dev/null
+++ b/22.0.1/clang-include/wmmintrin.h
@@ -0,0 +1,42 @@
+/*===---- wmmintrin.h - AES intrinsics ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef _WMMINTRIN_H
+#define _WMMINTRIN_H
+
+#include <emmintrin.h>
+
+#if !defined (__AES__) && !defined (__PCLMUL__)
+# error "AES/PCLMUL instructions not enabled"
+#else
+
+#ifdef __AES__
+#include <__wmmintrin_aes.h>
+#endif /* __AES__ */
+
+#ifdef __PCLMUL__
+#include <__wmmintrin_pclmul.h>
+#endif /* __PCLMUL__ */
+
+#endif /* __AES__ || __PCLMUL__ */
+#endif /* _WMMINTRIN_H */
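
Illustrative only, not part of the patch: one AES-NI round via the intrinsics that wmmintrin.h re-exports from __wmmintrin_aes.h when __AES__ is defined. The key/state values, main() scaffolding, and build flags (e.g. clang -maes) are assumptions for the example.

    #include <wmmintrin.h>
    #include <stdio.h>

    int main(void) {
        __m128i state = _mm_set1_epi32(0x01234567);
        __m128i key   = _mm_set1_epi32(0x89abcdef);
        __m128i out   = _mm_aesenc_si128(state, key);  /* one AES encryption round */
        printf("lane0 = 0x%08x\n", (unsigned)_mm_cvtsi128_si32(out));
        return 0;
    }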
diff --git a/22.0.1/clang-include/x86intrin.h b/22.0.1/clang-include/x86intrin.h
new file mode 100644
index 0000000..21a43da
--- /dev/null
+++ b/22.0.1/clang-include/x86intrin.h
@@ -0,0 +1,81 @@
+/*===---- x86intrin.h - X86 intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __X86INTRIN_H
+#define __X86INTRIN_H
+
+#include <ia32intrin.h>
+
+#include <immintrin.h>
+
+#ifdef __3dNOW__
+#include <mm3dnow.h>
+#endif
+
+#ifdef __BMI__
+#include <bmiintrin.h>
+#endif
+
+#ifdef __BMI2__
+#include <bmi2intrin.h>
+#endif
+
+#ifdef __LZCNT__
+#include <lzcntintrin.h>
+#endif
+
+#ifdef __POPCNT__
+#include <popcntintrin.h>
+#endif
+
+#ifdef __RDSEED__
+#include <rdseedintrin.h>
+#endif
+
+#ifdef __PRFCHW__
+#include <prfchwintrin.h>
+#endif
+
+#ifdef __SSE4A__
+#include <ammintrin.h>
+#endif
+
+#ifdef __FMA4__
+#include <fma4intrin.h>
+#endif
+
+#ifdef __XOP__
+#include <xopintrin.h>
+#endif
+
+#ifdef __TBM__
+#include <tbmintrin.h>
+#endif
+
+#ifdef __F16C__
+#include <f16cintrin.h>
+#endif
+
+/* FIXME: LWP */
+
+#endif /* __X86INTRIN_H */
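
Illustrative only, not part of the patch: x86intrin.h includes each optional sub-header only when the corresponding target feature is enabled, so user code guards optional-ISA use on the same macros. The main() scaffolding and build flags (e.g. clang -mpopcnt -mbmi) are assumptions for the example.

    #include <x86intrin.h>
    #include <stdio.h>

    int main(void) {
        unsigned v = 0xF0F0u;
    #ifdef __POPCNT__
        printf("popcount = %u\n", (unsigned)_mm_popcnt_u32(v));  /* popcntintrin.h */
    #endif
    #ifdef __BMI__
        printf("tzcnt    = %u\n", (unsigned)__tzcnt_u32(v));     /* bmiintrin.h    */
    #endif
        return 0;
    }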
diff --git a/22.0.1/clang-include/xmmintrin.h b/22.0.1/clang-include/xmmintrin.h
new file mode 100644
index 0000000..c9befcb
--- /dev/null
+++ b/22.0.1/clang-include/xmmintrin.h
@@ -0,0 +1,1003 @@
+/*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+ 
+#ifndef __XMMINTRIN_H
+#define __XMMINTRIN_H
+ 
+#ifndef __SSE__
+#error "SSE instruction set not enabled"
+#else
+
+#include <mmintrin.h>
+
+typedef int __v4si __attribute__((__vector_size__(16)));
+typedef float __v4sf __attribute__((__vector_size__(16)));
+typedef float __m128 __attribute__((__vector_size__(16)));
+
+/* This header should only be included in a hosted environment as it depends on
+ * a standard library to provide allocation routines. */
+#if __STDC_HOSTED__
+#include <mm_malloc.h>
+#endif
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_add_ss(__m128 __a, __m128 __b)
+{
+  __a[0] += __b[0];
+  return __a;
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_add_ps(__m128 __a, __m128 __b)
+{
+  return __a + __b;
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_sub_ss(__m128 __a, __m128 __b)
+{
+  __a[0] -= __b[0];
+  return __a;
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_sub_ps(__m128 __a, __m128 __b)
+{
+  return __a - __b;
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_mul_ss(__m128 __a, __m128 __b)
+{
+  __a[0] *= __b[0];
+  return __a;
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_mul_ps(__m128 __a, __m128 __b)
+{
+  return __a * __b;
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_div_ss(__m128 __a, __m128 __b)
+{
+  __a[0] /= __b[0];
+  return __a;
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_div_ps(__m128 __a, __m128 __b)
+{
+  return __a / __b;
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_sqrt_ss(__m128 __a)
+{
+  __m128 __c = __builtin_ia32_sqrtss(__a);
+  return (__m128) { __c[0], __a[1], __a[2], __a[3] };
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_sqrt_ps(__m128 __a)
+{
+  return __builtin_ia32_sqrtps(__a);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_rcp_ss(__m128 __a)
+{
+  __m128 __c = __builtin_ia32_rcpss(__a);
+  return (__m128) { __c[0], __a[1], __a[2], __a[3] };
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_rcp_ps(__m128 __a)
+{
+  return __builtin_ia32_rcpps(__a);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_rsqrt_ss(__m128 __a)
+{
+  __m128 __c = __builtin_ia32_rsqrtss(__a);
+  return (__m128) { __c[0], __a[1], __a[2], __a[3] };
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_rsqrt_ps(__m128 __a)
+{
+  return __builtin_ia32_rsqrtps(__a);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_min_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_minss(__a, __b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_min_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_minps(__a, __b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_max_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_maxss(__a, __b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_max_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_maxps(__a, __b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_and_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)((__v4si)__a & (__v4si)__b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_andnot_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)(~(__v4si)__a & (__v4si)__b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_or_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)((__v4si)__a | (__v4si)__b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_xor_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)((__v4si)__a ^ (__v4si)__b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpeq_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpss(__a, __b, 0);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpeq_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpps(__a, __b, 0);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmplt_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpss(__a, __b, 1);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmplt_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpps(__a, __b, 1);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmple_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpss(__a, __b, 2);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmple_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpps(__a, __b, 2);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpgt_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_shufflevector(__a,
+                                         __builtin_ia32_cmpss(__b, __a, 1),
+                                         4, 1, 2, 3);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpgt_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpps(__b, __a, 1);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpge_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_shufflevector(__a,
+                                         __builtin_ia32_cmpss(__b, __a, 2),
+                                         4, 1, 2, 3);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpge_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpps(__b, __a, 2);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpneq_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpss(__a, __b, 4);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpneq_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpps(__a, __b, 4);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpnlt_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpss(__a, __b, 5);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpnlt_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpps(__a, __b, 5);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpnle_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpss(__a, __b, 6);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpnle_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpps(__a, __b, 6);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpngt_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_shufflevector(__a,
+                                         __builtin_ia32_cmpss(__b, __a, 5),
+                                         4, 1, 2, 3);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpngt_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpps(__b, __a, 5);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpnge_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_shufflevector(__a,
+                                         __builtin_ia32_cmpss(__b, __a, 6),
+                                         4, 1, 2, 3);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpnge_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpps(__b, __a, 6);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpord_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpss(__a, __b, 7);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpord_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpps(__a, __b, 7);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpunord_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpss(__a, __b, 3);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpunord_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpps(__a, __b, 3);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_comieq_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_comieq(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_comilt_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_comilt(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_comile_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_comile(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_comigt_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_comigt(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_comige_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_comige(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_comineq_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_comineq(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_ucomieq_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_ucomieq(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_ucomilt_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_ucomilt(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_ucomile_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_ucomile(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_ucomigt_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_ucomigt(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_ucomige_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_ucomige(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_ucomineq_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_ucomineq(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_cvtss_si32(__m128 __a)
+{
+  return __builtin_ia32_cvtss2si(__a);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_cvt_ss2si(__m128 __a)
+{
+  return _mm_cvtss_si32(__a);
+}
+
+#ifdef __x86_64__
+
+static __inline__ long long __attribute__((__always_inline__, __nodebug__))
+_mm_cvtss_si64(__m128 __a)
+{
+  return __builtin_ia32_cvtss2si64(__a);
+}
+
+#endif
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtps_pi32(__m128 __a)
+{
+  return (__m64)__builtin_ia32_cvtps2pi(__a);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_cvt_ps2pi(__m128 __a)
+{
+  return _mm_cvtps_pi32(__a);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_cvttss_si32(__m128 __a)
+{
+  return __a[0];
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_cvtt_ss2si(__m128 __a)
+{
+  return _mm_cvttss_si32(__a);
+}
+
+static __inline__ long long __attribute__((__always_inline__, __nodebug__))
+_mm_cvttss_si64(__m128 __a)
+{
+  return __a[0];
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_cvttps_pi32(__m128 __a)
+{
+  return (__m64)__builtin_ia32_cvttps2pi(__a);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtt_ps2pi(__m128 __a)
+{
+  return _mm_cvttps_pi32(__a);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtsi32_ss(__m128 __a, int __b)
+{
+  __a[0] = __b;
+  return __a;
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cvt_si2ss(__m128 __a, int __b)
+{
+  return _mm_cvtsi32_ss(__a, __b);
+}
+
+#ifdef __x86_64__
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtsi64_ss(__m128 __a, long long __b)
+{
+  __a[0] = __b;
+  return __a;
+}
+
+#endif
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtpi32_ps(__m128 __a, __m64 __b)
+{
+  return __builtin_ia32_cvtpi2ps(__a, (__v2si)__b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cvt_pi2ps(__m128 __a, __m64 __b)
+{
+  return _mm_cvtpi32_ps(__a, __b);
+}
+
+static __inline__ float __attribute__((__always_inline__, __nodebug__))
+_mm_cvtss_f32(__m128 __a)
+{
+  return __a[0];
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_loadh_pi(__m128 __a, const __m64 *__p)
+{
+  typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
+  struct __mm_loadh_pi_struct {
+    __mm_loadh_pi_v2f32 __u;
+  } __attribute__((__packed__, __may_alias__));
+  __mm_loadh_pi_v2f32 __b = ((struct __mm_loadh_pi_struct*)__p)->__u;
+  __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
+  return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_loadl_pi(__m128 __a, const __m64 *__p)
+{
+  typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
+  struct __mm_loadl_pi_struct {
+    __mm_loadl_pi_v2f32 __u;
+  } __attribute__((__packed__, __may_alias__));
+  __mm_loadl_pi_v2f32 __b = ((struct __mm_loadl_pi_struct*)__p)->__u;
+  __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
+  return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_load_ss(const float *__p)
+{
+  struct __mm_load_ss_struct {
+    float __u;
+  } __attribute__((__packed__, __may_alias__));
+  float __u = ((struct __mm_load_ss_struct*)__p)->__u;
+  return (__m128){ __u, 0, 0, 0 };
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_load1_ps(const float *__p)
+{
+  struct __mm_load1_ps_struct {
+    float __u;
+  } __attribute__((__packed__, __may_alias__));
+  float __u = ((struct __mm_load1_ps_struct*)__p)->__u;
+  return (__m128){ __u, __u, __u, __u };
+}
+
+#define _mm_load_ps1(p) _mm_load1_ps(p)
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_load_ps(const float *__p)
+{
+  return *(__m128*)__p;
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_loadu_ps(const float *__p)
+{
+  struct __loadu_ps {
+    __m128 __v;
+  } __attribute__((__packed__, __may_alias__));
+  return ((struct __loadu_ps*)__p)->__v;
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_loadr_ps(const float *__p)
+{
+  __m128 __a = _mm_load_ps(__p);
+  return __builtin_shufflevector(__a, __a, 3, 2, 1, 0);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_set_ss(float __w)
+{
+  return (__m128){ __w, 0, 0, 0 };
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_set1_ps(float __w)
+{
+  return (__m128){ __w, __w, __w, __w };
+}
+
+/* Microsoft specific. */
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_set_ps1(float __w)
+{
+    return _mm_set1_ps(__w);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_set_ps(float __z, float __y, float __x, float __w)
+{
+  return (__m128){ __w, __x, __y, __z };
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_setr_ps(float __z, float __y, float __x, float __w)
+{
+  return (__m128){ __z, __y, __x, __w };
+}
+
+static __inline__ __m128 __attribute__((__always_inline__))
+_mm_setzero_ps(void)
+{
+  return (__m128){ 0, 0, 0, 0 };
+}
+
+static __inline__ void __attribute__((__always_inline__))
+_mm_storeh_pi(__m64 *__p, __m128 __a)
+{
+  __builtin_ia32_storehps((__v2si *)__p, __a);
+}
+
+static __inline__ void __attribute__((__always_inline__))
+_mm_storel_pi(__m64 *__p, __m128 __a)
+{
+  __builtin_ia32_storelps((__v2si *)__p, __a);
+}
+
+static __inline__ void __attribute__((__always_inline__))
+_mm_store_ss(float *__p, __m128 __a)
+{
+  struct __mm_store_ss_struct {
+    float __u;
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __mm_store_ss_struct*)__p)->__u = __a[0];
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_storeu_ps(float *__p, __m128 __a)
+{
+  __builtin_ia32_storeups(__p, __a);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_store1_ps(float *__p, __m128 __a)
+{
+  __a = __builtin_shufflevector(__a, __a, 0, 0, 0, 0);
+  _mm_storeu_ps(__p, __a);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_store_ps1(float *__p, __m128 __a)
+{
+    return _mm_store1_ps(__p, __a);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_store_ps(float *__p, __m128 __a)
+{
+  *(__m128 *)__p = __a;
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_storer_ps(float *__p, __m128 __a)
+{
+  __a = __builtin_shufflevector(__a, __a, 3, 2, 1, 0);
+  _mm_store_ps(__p, __a);
+}
+
+#define _MM_HINT_T0 3
+#define _MM_HINT_T1 2
+#define _MM_HINT_T2 1
+#define _MM_HINT_NTA 0
+
+#ifndef _MSC_VER
+/* FIXME: We have to #define this because "sel" must be a constant integer, and
+   Sema doesn't do any form of constant propagation yet. */
+
+#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), 0, (sel)))
+#endif
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_stream_pi(__m64 *__p, __m64 __a)
+{
+  __builtin_ia32_movntq(__p, __a);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_stream_ps(float *__p, __m128 __a)
+{
+  __builtin_ia32_movntps(__p, __a);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_sfence(void)
+{
+  __builtin_ia32_sfence();
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_extract_pi16(__m64 __a, int __n)
+{
+  __v4hi __b = (__v4hi)__a;
+  return (unsigned short)__b[__n & 3];
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_insert_pi16(__m64 __a, int __d, int __n)
+{
+   __v4hi __b = (__v4hi)__a;
+   __b[__n & 3] = __d;
+   return (__m64)__b;
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_max_pi16(__m64 __a, __m64 __b)
+{
+  return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_max_pu8(__m64 __a, __m64 __b)
+{
+  return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_min_pi16(__m64 __a, __m64 __b)
+{
+  return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_min_pu8(__m64 __a, __m64 __b)
+{
+  return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_movemask_pi8(__m64 __a)
+{
+  return __builtin_ia32_pmovmskb((__v8qi)__a);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_mulhi_pu16(__m64 __a, __m64 __b)
+{
+  return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b);
+}
+
+#define _mm_shuffle_pi16(a, n) __extension__ ({ \
+  __m64 __a = (a); \
+  (__m64)__builtin_ia32_pshufw((__v4hi)__a, (n)); })
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
+{
+  __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_avg_pu8(__m64 __a, __m64 __b)
+{
+  return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_avg_pu16(__m64 __a, __m64 __b)
+{
+  return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_sad_pu8(__m64 __a, __m64 __b)
+{
+  return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b);
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+_mm_getcsr(void)
+{
+  return __builtin_ia32_stmxcsr();
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_setcsr(unsigned int __i)
+{
+  __builtin_ia32_ldmxcsr(__i);
+}
+
+#define _mm_shuffle_ps(a, b, mask) __extension__ ({ \
+  __m128 __a = (a); \
+  __m128 __b = (b); \
+  (__m128)__builtin_shufflevector((__v4sf)__a, (__v4sf)__b, \
+                                  (mask) & 0x3, ((mask) & 0xc) >> 2, \
+                                  (((mask) & 0x30) >> 4) + 4, \
+                                  (((mask) & 0xc0) >> 6) + 4); })
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_unpackhi_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_shufflevector(__a, __b, 2, 6, 3, 7);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_unpacklo_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_shufflevector(__a, __b, 0, 4, 1, 5);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_move_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_shufflevector(__a, __b, 4, 1, 2, 3);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_movehl_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_shufflevector(__a, __b, 6, 7, 2, 3);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_movelh_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_shufflevector(__a, __b, 0, 1, 4, 5);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtpi16_ps(__m64 __a)
+{
+  __m64 __b, __c;
+  __m128 __r;
+
+  __b = _mm_setzero_si64();
+  __b = _mm_cmpgt_pi16(__b, __a);
+  __c = _mm_unpackhi_pi16(__a, __b);
+  __r = _mm_setzero_ps();
+  __r = _mm_cvtpi32_ps(__r, __c);
+  __r = _mm_movelh_ps(__r, __r);
+  __c = _mm_unpacklo_pi16(__a, __b);
+  __r = _mm_cvtpi32_ps(__r, __c);
+
+  return __r;
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtpu16_ps(__m64 __a)
+{
+  __m64 __b, __c;
+  __m128 __r;
+
+  __b = _mm_setzero_si64();
+  __c = _mm_unpackhi_pi16(__a, __b);
+  __r = _mm_setzero_ps();
+  __r = _mm_cvtpi32_ps(__r, __c);
+  __r = _mm_movelh_ps(__r, __r);
+  __c = _mm_unpacklo_pi16(__a, __b);
+  __r = _mm_cvtpi32_ps(__r, __c);
+
+  return __r;
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtpi8_ps(__m64 __a)
+{
+  __m64 __b;
+  
+  __b = _mm_setzero_si64();
+  __b = _mm_cmpgt_pi8(__b, __a);
+  __b = _mm_unpacklo_pi8(__a, __b);
+
+  return _mm_cvtpi16_ps(__b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtpu8_ps(__m64 __a)
+{
+  __m64 __b;
+  
+  __b = _mm_setzero_si64();
+  __b = _mm_unpacklo_pi8(__a, __b);
+
+  return _mm_cvtpi16_ps(__b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
+{
+  __m128 __c;
+  
+  __c = _mm_setzero_ps();
+  __c = _mm_cvtpi32_ps(__c, __b);
+  __c = _mm_movelh_ps(__c, __c);
+
+  return _mm_cvtpi32_ps(__c, __a);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtps_pi16(__m128 __a)
+{
+  __m64 __b, __c;
+  
+  __b = _mm_cvtps_pi32(__a);
+  __a = _mm_movehl_ps(__a, __a);
+  __c = _mm_cvtps_pi32(__a);
+  
+  return _mm_packs_pi32(__b, __c);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtps_pi8(__m128 __a)
+{
+  __m64 __b, __c;
+  
+  __b = _mm_cvtps_pi16(__a);
+  __c = _mm_setzero_si64();
+  
+  return _mm_packs_pi16(__b, __c);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_movemask_ps(__m128 __a)
+{
+  return __builtin_ia32_movmskps(__a);
+}
+
+#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
+
+#define _MM_EXCEPT_INVALID    (0x0001)
+#define _MM_EXCEPT_DENORM     (0x0002)
+#define _MM_EXCEPT_DIV_ZERO   (0x0004)
+#define _MM_EXCEPT_OVERFLOW   (0x0008)
+#define _MM_EXCEPT_UNDERFLOW  (0x0010)
+#define _MM_EXCEPT_INEXACT    (0x0020)
+#define _MM_EXCEPT_MASK       (0x003f)
+
+#define _MM_MASK_INVALID      (0x0080)
+#define _MM_MASK_DENORM       (0x0100)
+#define _MM_MASK_DIV_ZERO     (0x0200)
+#define _MM_MASK_OVERFLOW     (0x0400)
+#define _MM_MASK_UNDERFLOW    (0x0800)
+#define _MM_MASK_INEXACT      (0x1000)
+#define _MM_MASK_MASK         (0x1f80)
+
+#define _MM_ROUND_NEAREST     (0x0000)
+#define _MM_ROUND_DOWN        (0x2000)
+#define _MM_ROUND_UP          (0x4000)
+#define _MM_ROUND_TOWARD_ZERO (0x6000)
+#define _MM_ROUND_MASK        (0x6000)
+
+#define _MM_FLUSH_ZERO_MASK   (0x8000)
+#define _MM_FLUSH_ZERO_ON     (0x8000)
+#define _MM_FLUSH_ZERO_OFF    (0x0000)
+
+#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK)
+#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
+#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
+#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK)
+
+#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))
+#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))
+#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x)))
+#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))
+
+#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
+do { \
+  __m128 tmp3, tmp2, tmp1, tmp0; \
+  tmp0 = _mm_unpacklo_ps((row0), (row1)); \
+  tmp2 = _mm_unpacklo_ps((row2), (row3)); \
+  tmp1 = _mm_unpackhi_ps((row0), (row1)); \
+  tmp3 = _mm_unpackhi_ps((row2), (row3)); \
+  (row0) = _mm_movelh_ps(tmp0, tmp2); \
+  (row1) = _mm_movehl_ps(tmp2, tmp0); \
+  (row2) = _mm_movelh_ps(tmp1, tmp3); \
+  (row3) = _mm_movehl_ps(tmp3, tmp1); \
+} while (0)
+
+/* Aliases for compatibility. */
+#define _m_pextrw _mm_extract_pi16
+#define _m_pinsrw _mm_insert_pi16
+#define _m_pmaxsw _mm_max_pi16
+#define _m_pmaxub _mm_max_pu8
+#define _m_pminsw _mm_min_pi16
+#define _m_pminub _mm_min_pu8
+#define _m_pmovmskb _mm_movemask_pi8
+#define _m_pmulhuw _mm_mulhi_pu16
+#define _m_pshufw _mm_shuffle_pi16
+#define _m_maskmovq _mm_maskmove_si64
+#define _m_pavgb _mm_avg_pu8
+#define _m_pavgw _mm_avg_pu16
+#define _m_psadbw _mm_sad_pu8
+#define _m_ _mm_
+
+/* Ugly hack for backwards-compatibility (compatible with gcc) */
+#ifdef __SSE2__
+#include <emmintrin.h>
+#endif
+
+#endif /* __SSE__ */
+
+#endif /* __XMMINTRIN_H */
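
Illustrative only, not part of the patch: basic use of the SSE intrinsics defined above, including the _MM_SHUFFLE helper. The main() scaffolding and build flags (e.g. clang -msse) are assumptions for the example.

    #include <xmmintrin.h>
    #include <stdio.h>

    int main(void) {
        __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);  /* lanes 0..3 = 1,2,3,4 */
        __m128 b = _mm_set1_ps(0.5f);

        __m128 sum = _mm_add_ps(a, b);           /* element-wise add          */
        __m128 rev = _mm_shuffle_ps(sum, sum,    /* reverse the four lanes    */
                                    _MM_SHUFFLE(0, 1, 2, 3));

        float out[4];
        _mm_storeu_ps(out, rev);
        printf("%f %f %f %f\n", out[0], out[1], out[2], out[3]);
        return 0;
    }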
diff --git a/22.0.1/clang-include/xopintrin.h b/22.0.1/clang-include/xopintrin.h
new file mode 100644
index 0000000..cc94ca0
--- /dev/null
+++ b/22.0.1/clang-include/xopintrin.h
@@ -0,0 +1,804 @@
+/*===---- xopintrin.h - XOP intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __X86INTRIN_H
+#error "Never use <xopintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __XOPINTRIN_H
+#define __XOPINTRIN_H
+
+#ifndef __XOP__
+# error "XOP instruction set is not enabled"
+#else
+
+#include <fma4intrin.h>
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_maccs_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacssww((__v8hi)__A, (__v8hi)__B, (__v8hi)__C);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_macc_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacsww((__v8hi)__A, (__v8hi)__B, (__v8hi)__C);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_maccsd_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacsswd((__v8hi)__A, (__v8hi)__B, (__v4si)__C);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_maccd_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacswd((__v8hi)__A, (__v8hi)__B, (__v4si)__C);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_maccs_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacssdd((__v4si)__A, (__v4si)__B, (__v4si)__C);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_macc_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacsdd((__v4si)__A, (__v4si)__B, (__v4si)__C);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_maccslo_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacssdql((__v4si)__A, (__v4si)__B, (__v2di)__C);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_macclo_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacsdql((__v4si)__A, (__v4si)__B, (__v2di)__C);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_maccshi_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacssdqh((__v4si)__A, (__v4si)__B, (__v2di)__C);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_macchi_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacsdqh((__v4si)__A, (__v4si)__B, (__v2di)__C);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_maddsd_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmadcsswd((__v8hi)__A, (__v8hi)__B, (__v4si)__C);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_maddd_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmadcswd((__v8hi)__A, (__v8hi)__B, (__v4si)__C);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_haddw_epi8(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphaddbw((__v16qi)__A);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_haddd_epi8(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphaddbd((__v16qi)__A);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_haddq_epi8(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphaddbq((__v16qi)__A);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_haddd_epi16(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphaddwd((__v8hi)__A);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_haddq_epi16(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphaddwq((__v8hi)__A);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_haddq_epi32(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphadddq((__v4si)__A);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_haddw_epu8(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphaddubw((__v16qi)__A);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_haddd_epu8(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphaddubd((__v16qi)__A);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_haddq_epu8(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphaddubq((__v16qi)__A);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_haddd_epu16(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphadduwd((__v8hi)__A);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_haddq_epu16(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphadduwq((__v8hi)__A);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_haddq_epu32(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphaddudq((__v4si)__A);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_hsubw_epi8(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphsubbw((__v16qi)__A);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_hsubd_epi16(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphsubwd((__v8hi)__A);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_hsubq_epi32(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphsubdq((__v4si)__A);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cmov_si128(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpcmov(__A, __B, __C);
+}
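+/* Editorial usage sketch (not part of the original header): _mm_cmov_si128 is
+ * a bitwise select; each result bit is taken from __A where the corresponding
+ * bit of __C is set and from __B otherwise. For __m128i values a, b, and mask:
+ *
+ *   __m128i r = _mm_cmov_si128(a, b, mask);   // (a & mask) | (b & ~mask)
+ */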
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cmov_si256(__m256i __A, __m256i __B, __m256i __C)
+{
+  return (__m256i)__builtin_ia32_vpcmov_256(__A, __B, __C);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_perm_epi8(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpperm((__v16qi)__A, (__v16qi)__B, (__v16qi)__C);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_rot_epi8(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vprotb((__v16qi)__A, (__v16qi)__B);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_rot_epi16(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vprotw((__v8hi)__A, (__v8hi)__B);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_rot_epi32(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vprotd((__v4si)__A, (__v4si)__B);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_rot_epi64(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vprotq((__v2di)__A, (__v2di)__B);
+}
+
+#define _mm_roti_epi8(A, N) __extension__ ({ \
+  __m128i __A = (A); \
+  (__m128i)__builtin_ia32_vprotbi((__v16qi)__A, (N)); })
+
+#define _mm_roti_epi16(A, N) __extension__ ({ \
+  __m128i __A = (A); \
+  (__m128i)__builtin_ia32_vprotwi((__v8hi)__A, (N)); })
+
+#define _mm_roti_epi32(A, N) __extension__ ({ \
+  __m128i __A = (A); \
+  (__m128i)__builtin_ia32_vprotdi((__v4si)__A, (N)); })
+
+#define _mm_roti_epi64(A, N) __extension__ ({ \
+  __m128i __A = (A); \
+  (__m128i)__builtin_ia32_vprotqi((__v2di)__A, (N)); })
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_shl_epi8(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpshlb((__v16qi)__A, (__v16qi)__B);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_shl_epi16(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpshlw((__v8hi)__A, (__v8hi)__B);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_shl_epi32(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpshld((__v4si)__A, (__v4si)__B);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_shl_epi64(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpshlq((__v2di)__A, (__v2di)__B);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sha_epi8(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpshab((__v16qi)__A, (__v16qi)__B);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sha_epi16(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpshaw((__v8hi)__A, (__v8hi)__B);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sha_epi32(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpshad((__v4si)__A, (__v4si)__B);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sha_epi64(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpshaq((__v2di)__A, (__v2di)__B);
+}
+
+#define _mm_com_epu8(A, B, N) __extension__ ({ \
+  __m128i __A = (A); \
+  __m128i __B = (B); \
+  (__m128i)__builtin_ia32_vpcomub((__v16qi)__A, (__v16qi)__B, (N)); })
+
+#define _mm_com_epu16(A, B, N) __extension__ ({ \
+  __m128i __A = (A); \
+  __m128i __B = (B); \
+  (__m128i)__builtin_ia32_vpcomuw((__v8hi)__A, (__v8hi)__B, (N)); })
+
+#define _mm_com_epu32(A, B, N) __extension__ ({ \
+  __m128i __A = (A); \
+  __m128i __B = (B); \
+  (__m128i)__builtin_ia32_vpcomud((__v4si)__A, (__v4si)__B, (N)); })
+
+#define _mm_com_epu64(A, B, N) __extension__ ({ \
+  __m128i __A = (A); \
+  __m128i __B = (B); \
+  (__m128i)__builtin_ia32_vpcomuq((__v2di)__A, (__v2di)__B, (N)); })
+
+#define _mm_com_epi8(A, B, N) __extension__ ({ \
+  __m128i __A = (A); \
+  __m128i __B = (B); \
+  (__m128i)__builtin_ia32_vpcomb((__v16qi)__A, (__v16qi)__B, (N)); })
+
+#define _mm_com_epi16(A, B, N) __extension__ ({ \
+  __m128i __A = (A); \
+  __m128i __B = (B); \
+  (__m128i)__builtin_ia32_vpcomw((__v8hi)__A, (__v8hi)__B, (N)); })
+
+#define _mm_com_epi32(A, B, N) __extension__ ({ \
+  __m128i __A = (A); \
+  __m128i __B = (B); \
+  (__m128i)__builtin_ia32_vpcomd((__v4si)__A, (__v4si)__B, (N)); })
+
+#define _mm_com_epi64(A, B, N) __extension__ ({ \
+  __m128i __A = (A); \
+  __m128i __B = (B); \
+  (__m128i)__builtin_ia32_vpcomq((__v2di)__A, (__v2di)__B, (N)); })
+
+#define _MM_PCOMCTRL_LT    0
+#define _MM_PCOMCTRL_LE    1
+#define _MM_PCOMCTRL_GT    2
+#define _MM_PCOMCTRL_GE    3
+#define _MM_PCOMCTRL_EQ    4
+#define _MM_PCOMCTRL_NEQ   5
+#define _MM_PCOMCTRL_FALSE 6
+#define _MM_PCOMCTRL_TRUE  7
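+/* Editorial usage sketch (not part of the original header): the _mm_com_*
+ * macros take one of the _MM_PCOMCTRL_* predicates above, and the named
+ * wrappers below simply fix that predicate. For __m128i values a and b,
+ * these two are equivalent:
+ *
+ *   __m128i lt1 = _mm_com_epu8(a, b, _MM_PCOMCTRL_LT);
+ *   __m128i lt2 = _mm_comlt_epu8(a, b);
+ */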
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comlt_epu8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_LT);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comle_epu8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_LE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comgt_epu8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_GT);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comge_epu8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_GE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comeq_epu8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_EQ);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comneq_epu8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_NEQ);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comfalse_epu8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_FALSE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comtrue_epu8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_TRUE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comlt_epu16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_LT);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comle_epu16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_LE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comgt_epu16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_GT);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comge_epu16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_GE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comeq_epu16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_EQ);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comneq_epu16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_NEQ);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comfalse_epu16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_FALSE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comtrue_epu16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_TRUE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comlt_epu32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_LT);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comle_epu32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_LE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comgt_epu32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_GT);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comge_epu32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_GE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comeq_epu32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_EQ);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comneq_epu32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_NEQ);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comfalse_epu32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_FALSE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comtrue_epu32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_TRUE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comlt_epu64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_LT);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comle_epu64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_LE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comgt_epu64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_GT);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comge_epu64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_GE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comeq_epu64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_EQ);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comneq_epu64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_NEQ);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comfalse_epu64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_FALSE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comtrue_epu64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_TRUE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comlt_epi8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_LT);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comle_epi8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_LE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comgt_epi8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_GT);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comge_epi8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_GE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comeq_epi8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_EQ);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comneq_epi8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_NEQ);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comfalse_epi8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_FALSE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comtrue_epi8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_TRUE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comlt_epi16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_LT);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comle_epi16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_LE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comgt_epi16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_GT);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comge_epi16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_GE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comeq_epi16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_EQ);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comneq_epi16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_NEQ);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comfalse_epi16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_FALSE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comtrue_epi16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_TRUE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comlt_epi32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_LT);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comle_epi32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_LE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comgt_epi32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_GT);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comge_epi32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_GE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comeq_epi32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_EQ);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comneq_epi32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_NEQ);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comfalse_epi32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_FALSE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comtrue_epi32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_TRUE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comlt_epi64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_LT);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comle_epi64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_LE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comgt_epi64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_GT);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comge_epi64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_GE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comeq_epi64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_EQ);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comneq_epi64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_NEQ);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comfalse_epi64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_FALSE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comtrue_epi64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_TRUE);
+}
+
+#define _mm_permute2_pd(X, Y, C, I) __extension__ ({ \
+  __m128d __X = (X); \
+  __m128d __Y = (Y); \
+  __m128i __C = (C); \
+  (__m128d)__builtin_ia32_vpermil2pd((__v2df)__X, (__v2df)__Y, \
+                                     (__v2di)__C, (I)); })
+
+#define _mm256_permute2_pd(X, Y, C, I) __extension__ ({ \
+  __m256d __X = (X); \
+  __m256d __Y = (Y); \
+  __m256i __C = (C); \
+  (__m256d)__builtin_ia32_vpermil2pd256((__v4df)__X, (__v4df)__Y, \
+                                        (__v4di)__C, (I)); })
+
+#define _mm_permute2_ps(X, Y, C, I) __extension__ ({ \
+  __m128 __X = (X); \
+  __m128 __Y = (Y); \
+  __m128i __C = (C); \
+  (__m128)__builtin_ia32_vpermil2ps((__v4sf)__X, (__v4sf)__Y, \
+                                    (__v4si)__C, (I)); })
+
+#define _mm256_permute2_ps(X, Y, C, I) __extension__ ({ \
+  __m256 __X = (X); \
+  __m256 __Y = (Y); \
+  __m256i __C = (C); \
+  (__m256)__builtin_ia32_vpermil2ps256((__v8sf)__X, (__v8sf)__Y, \
+                                       (__v8si)__C, (I)); })
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_frcz_ss(__m128 __A)
+{
+  return (__m128)__builtin_ia32_vfrczss((__v4sf)__A);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_frcz_sd(__m128d __A)
+{
+  return (__m128d)__builtin_ia32_vfrczsd((__v2df)__A);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_frcz_ps(__m128 __A)
+{
+  return (__m128)__builtin_ia32_vfrczps((__v4sf)__A);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_frcz_pd(__m128d __A)
+{
+  return (__m128d)__builtin_ia32_vfrczpd((__v2df)__A);
+}
+
+static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_frcz_ps(__m256 __A)
+{
+  return (__m256)__builtin_ia32_vfrczps256((__v8sf)__A);
+}
+
+static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_frcz_pd(__m256d __A)
+{
+  return (__m256d)__builtin_ia32_vfrczpd256((__v4df)__A);
+}
+
+#endif /* __XOP__ */
+
+#endif /* __XOPINTRIN_H */
diff --git a/22.0.1/include/rs_allocation.rsh b/22.0.1/include/rs_allocation.rsh
new file mode 100644
index 0000000..03bd1ca
--- /dev/null
+++ b/22.0.1/include/rs_allocation.rsh
@@ -0,0 +1,430 @@
+/*
+ * Copyright (C) 2011 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/** @file rs_allocation.rsh
+ *  \brief Allocation routines
+ *
+ *
+ */
+
+#ifndef __RS_ALLOCATION_RSH__
+#define __RS_ALLOCATION_RSH__
+
+/**
+ * Returns the Allocation for a given pointer.  The pointer should point within
+ * a valid allocation.  The results are undefined if the pointer is not from a
+ * valid allocation.
+ *
+ * This function is deprecated and will be removed from the SDK in a future
+ * release.
+ */
+extern rs_allocation __attribute__((overloadable))
+    rsGetAllocation(const void *);
+
+/**
+ * Query the dimension of an allocation.
+ *
+ * @return uint32_t The X dimension of the allocation.
+ */
+extern uint32_t __attribute__((overloadable))
+    rsAllocationGetDimX(rs_allocation);
+
+/**
+ * Query the dimension of an allocation.
+ *
+ * @return uint32_t The Y dimension of the allocation.
+ */
+extern uint32_t __attribute__((overloadable))
+    rsAllocationGetDimY(rs_allocation);
+
+/**
+ * Query the dimension of an allocation.
+ *
+ * @return uint32_t The Z dimension of the allocation.
+ */
+extern uint32_t __attribute__((overloadable))
+    rsAllocationGetDimZ(rs_allocation);
+
+/**
+ * Query an allocation for the presence of more than one LOD.
+ *
+ * @return uint32_t Returns 1 if more than one LOD is present, 0 otherwise.
+ */
+extern uint32_t __attribute__((overloadable))
+    rsAllocationGetDimLOD(rs_allocation);
+
+/**
+ * Query an allocation for the presence of more than one face.
+ *
+ * @return uint32_t Returns 1 if more than one face is present, 0 otherwise.
+ */
+extern uint32_t __attribute__((overloadable))
+    rsAllocationGetDimFaces(rs_allocation);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+
+/**
+ * Copy part of an allocation from another allocation.
+ *
+ * @param dstAlloc Allocation to copy data into.
+ * @param dstOff The offset of the first element to be copied in
+ *               the destination allocation.
+ * @param dstMip Mip level in the destination allocation.
+ * @param count The number of elements to be copied.
+ * @param srcAlloc The source data allocation.
+ * @param srcOff The offset of the first element in data to be
+ *               copied in the source allocation.
+ * @param srcMip Mip level in the source allocation.
+ */
+extern void __attribute__((overloadable))
+    rsAllocationCopy1DRange(rs_allocation dstAlloc,
+                            uint32_t dstOff, uint32_t dstMip,
+                            uint32_t count,
+                            rs_allocation srcAlloc,
+                            uint32_t srcOff, uint32_t srcMip);
+
+/**
+ * Copy a rectangular region into the allocation from another
+ * allocation.
+ *
+ * @param dstAlloc allocation to copy data into.
+ * @param dstXoff X offset of the region to update in the
+ *                destination allocation.
+ * @param dstYoff Y offset of the region to update in the
+ *                destination allocation.
+ * @param dstMip Mip level in the destination allocation.
+ * @param dstFace Cubemap face of the destination allocation,
+ *                ignored for allocations that aren't cubemaps.
+ * @param width Width of the incoming region to update.
+ * @param height Height of the incoming region to update.
+ * @param srcAlloc The source data allocation.
+ * @param srcXoff X offset in data of the source allocation.
+ * @param srcYoff Y offset in data of the source allocation.
+ * @param srcMip Mip level in the source allocation.
+ * @param srcFace Cubemap face of the source allocation,
+ *                ignored for allocations that aren't cubemaps.
+ */
+extern void __attribute__((overloadable))
+    rsAllocationCopy2DRange(rs_allocation dstAlloc,
+                            uint32_t dstXoff, uint32_t dstYoff,
+                            uint32_t dstMip,
+                            rs_allocation_cubemap_face dstFace,
+                            uint32_t width, uint32_t height,
+                            rs_allocation srcAlloc,
+                            uint32_t srcXoff, uint32_t srcYoff,
+                            uint32_t srcMip,
+                            rs_allocation_cubemap_face srcFace);
+
+#endif //defined(RS_VERSION) && (RS_VERSION >= 14)
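+/* Editorial usage sketch (not part of the original header), assuming "dst" and
+ * "src" are valid 1D allocations of the same element type: copy 16 elements
+ * starting at element 4 of src into dst starting at element 0, mip level 0:
+ *
+ *   rsAllocationCopy1DRange(dst, 0, 0, 16, src, 4, 0);
+ */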
+
+/**
+ * Extract a single element from an allocation.
+ */
+extern const void * __attribute__((overloadable))
+    rsGetElementAt(rs_allocation a, uint32_t x);
+/**
+ * \overload
+ */
+extern const void * __attribute__((overloadable))
+    rsGetElementAt(rs_allocation a, uint32_t x, uint32_t y);
+/**
+ * \overload
+ */
+extern const void * __attribute__((overloadable))
+    rsGetElementAt(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+    #define GET_ELEMENT_AT(T) \
+    extern T __attribute__((overloadable)) \
+            rsGetElementAt_##T(rs_allocation a, uint32_t x); \
+    extern T __attribute__((overloadable)) \
+            rsGetElementAt_##T(rs_allocation a, uint32_t x, uint32_t y);  \
+    extern T __attribute__((overloadable)) \
+            rsGetElementAt_##T(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#else
+    #define GET_ELEMENT_AT(T) \
+    static inline T __attribute__((overloadable)) \
+            rsGetElementAt_##T(rs_allocation a, uint32_t x) {  \
+        return ((T *)rsGetElementAt(a, x))[0]; \
+    } \
+    static inline T __attribute__((overloadable)) \
+            rsGetElementAt_##T(rs_allocation a, uint32_t x, uint32_t y) {  \
+        return ((T *)rsGetElementAt(a, x, y))[0]; \
+    } \
+    static inline T __attribute__((overloadable)) \
+            rsGetElementAt_##T(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {  \
+        return ((T *)rsGetElementAt(a, x, y, z))[0]; \
+    }
+#endif
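+/* Editorial usage sketch (not part of the original header): the expansions
+ * below provide typed getters such as rsGetElementAt_float. Assuming "alloc"
+ * is a valid 2D allocation of floats and (x, y) is in range:
+ *
+ *   float v = rsGetElementAt_float(alloc, x, y);
+ */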
+
+GET_ELEMENT_AT(char)
+GET_ELEMENT_AT(char2)
+GET_ELEMENT_AT(char3)
+GET_ELEMENT_AT(char4)
+GET_ELEMENT_AT(uchar)
+GET_ELEMENT_AT(uchar2)
+GET_ELEMENT_AT(uchar3)
+GET_ELEMENT_AT(uchar4)
+GET_ELEMENT_AT(short)
+GET_ELEMENT_AT(short2)
+GET_ELEMENT_AT(short3)
+GET_ELEMENT_AT(short4)
+GET_ELEMENT_AT(ushort)
+GET_ELEMENT_AT(ushort2)
+GET_ELEMENT_AT(ushort3)
+GET_ELEMENT_AT(ushort4)
+GET_ELEMENT_AT(int)
+GET_ELEMENT_AT(int2)
+GET_ELEMENT_AT(int3)
+GET_ELEMENT_AT(int4)
+GET_ELEMENT_AT(uint)
+GET_ELEMENT_AT(uint2)
+GET_ELEMENT_AT(uint3)
+GET_ELEMENT_AT(uint4)
+GET_ELEMENT_AT(long)
+GET_ELEMENT_AT(long2)
+GET_ELEMENT_AT(long3)
+GET_ELEMENT_AT(long4)
+GET_ELEMENT_AT(ulong)
+GET_ELEMENT_AT(ulong2)
+GET_ELEMENT_AT(ulong3)
+GET_ELEMENT_AT(ulong4)
+GET_ELEMENT_AT(float)
+GET_ELEMENT_AT(float2)
+GET_ELEMENT_AT(float3)
+GET_ELEMENT_AT(float4)
+GET_ELEMENT_AT(double)
+GET_ELEMENT_AT(double2)
+GET_ELEMENT_AT(double3)
+GET_ELEMENT_AT(double4)
+
+#undef GET_ELEMENT_AT
+
+// Jelly Bean
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+
+/**
+ * Send the contents of the Allocation to the queue.
+ * @param a allocation to work on
+ */
+extern const void __attribute__((overloadable))
+    rsAllocationIoSend(rs_allocation a);
+
+/**
+ * Receive a new set of contents from the queue.
+ * @param a allocation to work on
+ */
+extern const void __attribute__((overloadable))
+    rsAllocationIoReceive(rs_allocation a);
+
+
+/**
+ * Get the element object describing the allocation's layout
+ * @param a allocation to get data from
+ * @return element describing allocation layout
+ */
+extern rs_element __attribute__((overloadable))
+    rsAllocationGetElement(rs_allocation a);
+
+/**
+ * Fetch allocation in a way described by the sampler
+ * @param a 1D allocation to sample from
+ * @param s sampler state
+ * @param location to sample from
+ */
+extern const float4 __attribute__((overloadable))
+    rsSample(rs_allocation a, rs_sampler s, float location);
+/**
+ * Fetch allocation in a way described by the sampler
+ * @param a 1D allocation to sample from
+ * @param s sampler state
+ * @param location to sample from
+ * @param lod mip level to sample from, for fractional values
+ *            mip levels will be interpolated if
+ *            RS_SAMPLER_LINEAR_MIP_LINEAR is used
+ */
+extern const float4 __attribute__((overloadable))
+    rsSample(rs_allocation a, rs_sampler s, float location, float lod);
+
+/**
+ * Fetch allocation in a way described by the sampler
+ * @param a 2D allocation to sample from
+ * @param s sampler state
+ * @param location to sample from
+ */
+extern const float4 __attribute__((overloadable))
+    rsSample(rs_allocation a, rs_sampler s, float2 location);
+
+/**
+ * Fetch allocation in a way described by the sampler
+ * @param a 2D allocation to sample from
+ * @param s sampler state
+ * @param location to sample from
+ * @param lod mip level to sample from, for fractional values
+ *            mip levels will be interpolated if
+ *            RS_SAMPLER_LINEAR_MIP_LINEAR is used
+ */
+extern const float4 __attribute__((overloadable))
+    rsSample(rs_allocation a, rs_sampler s, float2 location, float lod);
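+/* Editorial usage sketch (not part of the original header), assuming "tex" is
+ * a 2D allocation and "samp" a valid rs_sampler: sample at the centre of the
+ * allocation using normalized coordinates:
+ *
+ *   float2 loc = {0.5f, 0.5f};
+ *   float4 c = rsSample(tex, samp, loc);
+ */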
+
+#endif // (defined(RS_VERSION) && (RS_VERSION >= 16))
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+
+/**
+ * Set single element of an allocation.
+ */
+extern void __attribute__((overloadable))
+    rsSetElementAt(rs_allocation a, void* ptr, uint32_t x);
+
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable))
+    rsSetElementAt(rs_allocation a, void* ptr, uint32_t x, uint32_t y);
+
+#define SET_ELEMENT_AT(T)                                               \
+    extern void __attribute__((overloadable))                           \
+    rsSetElementAt_##T(rs_allocation a, T val, uint32_t x);             \
+    extern void __attribute__((overloadable))                           \
+    rsSetElementAt_##T(rs_allocation a, T val, uint32_t x, uint32_t y); \
+    extern void __attribute__((overloadable))                           \
+    rsSetElementAt_##T(rs_allocation a, T val, uint32_t x, uint32_t y, uint32_t z);
+
+
+SET_ELEMENT_AT(char)
+SET_ELEMENT_AT(char2)
+SET_ELEMENT_AT(char3)
+SET_ELEMENT_AT(char4)
+SET_ELEMENT_AT(uchar)
+SET_ELEMENT_AT(uchar2)
+SET_ELEMENT_AT(uchar3)
+SET_ELEMENT_AT(uchar4)
+SET_ELEMENT_AT(short)
+SET_ELEMENT_AT(short2)
+SET_ELEMENT_AT(short3)
+SET_ELEMENT_AT(short4)
+SET_ELEMENT_AT(ushort)
+SET_ELEMENT_AT(ushort2)
+SET_ELEMENT_AT(ushort3)
+SET_ELEMENT_AT(ushort4)
+SET_ELEMENT_AT(int)
+SET_ELEMENT_AT(int2)
+SET_ELEMENT_AT(int3)
+SET_ELEMENT_AT(int4)
+SET_ELEMENT_AT(uint)
+SET_ELEMENT_AT(uint2)
+SET_ELEMENT_AT(uint3)
+SET_ELEMENT_AT(uint4)
+SET_ELEMENT_AT(long)
+SET_ELEMENT_AT(long2)
+SET_ELEMENT_AT(long3)
+SET_ELEMENT_AT(long4)
+SET_ELEMENT_AT(ulong)
+SET_ELEMENT_AT(ulong2)
+SET_ELEMENT_AT(ulong3)
+SET_ELEMENT_AT(ulong4)
+SET_ELEMENT_AT(float)
+SET_ELEMENT_AT(float2)
+SET_ELEMENT_AT(float3)
+SET_ELEMENT_AT(float4)
+SET_ELEMENT_AT(double)
+SET_ELEMENT_AT(double2)
+SET_ELEMENT_AT(double3)
+SET_ELEMENT_AT(double4)
+
+#undef SET_ELEMENT_AT
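+/* Editorial usage sketch (not part of the original header): the typed setters
+ * generated above mirror the getters. Assuming "alloc" is a valid 2D
+ * allocation of float4 elements and (x, y) is in range:
+ *
+ *   float4 p = {1.0f, 0.5f, 0.25f, 1.0f};
+ *   rsSetElementAt_float4(alloc, p, x, y);
+ */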
+
+
+/**
+ * Extract a single element from an allocation.
+ */
+extern const uchar __attribute__((overloadable))
+    rsGetElementAtYuv_uchar_Y(rs_allocation a, uint32_t x, uint32_t y);
+
+/**
+ * Extract a single element from an allocation.
+ *
+ * Coordinates are in the dimensions of the Y plane
+ */
+extern const uchar __attribute__((overloadable))
+    rsGetElementAtYuv_uchar_U(rs_allocation a, uint32_t x, uint32_t y);
+
+/**
+ * Extract a single element from an allocation.
+ *
+ * Coordinates are in the dimensions of the Y plane
+ */
+extern const uchar __attribute__((overloadable))
+    rsGetElementAtYuv_uchar_V(rs_allocation a, uint32_t x, uint32_t y);
+
+#endif // (defined(RS_VERSION) && (RS_VERSION >= 18))
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+
+#define VOP(T)                                                                   \
+    extern T __attribute__((overloadable))                                       \
+    rsAllocationVLoadX_##T(rs_allocation a, uint32_t x);                         \
+    extern T __attribute__((overloadable))                                       \
+    rsAllocationVLoadX_##T(rs_allocation a, uint32_t x, uint32_t y);             \
+    extern T __attribute__((overloadable))                                       \
+    rsAllocationVLoadX_##T(rs_allocation a, uint32_t x, uint32_t y, uint32_t z); \
+    extern void __attribute__((overloadable))                                    \
+    rsAllocationVStoreX_##T(rs_allocation a, T val, uint32_t x);                 \
+    extern void __attribute__((overloadable))                                    \
+    rsAllocationVStoreX_##T(rs_allocation a, T val, uint32_t x, uint32_t y);     \
+    extern void __attribute__((overloadable))                                    \
+    rsAllocationVStoreX_##T(rs_allocation a, T val, uint32_t x, uint32_t y, uint32_t z);
+
+VOP(char2)
+VOP(char3)
+VOP(char4)
+VOP(uchar2)
+VOP(uchar3)
+VOP(uchar4)
+VOP(short2)
+VOP(short3)
+VOP(short4)
+VOP(ushort2)
+VOP(ushort3)
+VOP(ushort4)
+VOP(int2)
+VOP(int3)
+VOP(int4)
+VOP(uint2)
+VOP(uint3)
+VOP(uint4)
+VOP(long2)
+VOP(long3)
+VOP(long4)
+VOP(ulong2)
+VOP(ulong3)
+VOP(ulong4)
+VOP(float2)
+VOP(float3)
+VOP(float4)
+VOP(double2)
+VOP(double3)
+VOP(double4)
+
+#undef VOP
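+/* Editorial usage sketch (not part of the original header): the
+ * rsAllocationVLoadX_* and rsAllocationVStoreX_* functions generated above
+ * load or store a whole vector starting at scalar cell x. Assuming "a" is a
+ * valid allocation of scalar floats:
+ *
+ *   float4 v = rsAllocationVLoadX_float4(a, x);
+ *   rsAllocationVStoreX_float4(a, v * 2.0f, x);
+ */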
+
+#endif //(defined(RS_VERSION) && (RS_VERSION >= 22))
+
+
+#endif
+
diff --git a/22.0.1/include/rs_atomic.rsh b/22.0.1/include/rs_atomic.rsh
new file mode 100644
index 0000000..ba847cf
--- /dev/null
+++ b/22.0.1/include/rs_atomic.rsh
@@ -0,0 +1,261 @@
+/*
+ * Copyright (C) 2011 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/** @file rs_atomic.rsh
+ *  \brief Atomic routines
+ *
+ *
+ */
+
+#ifndef __RS_ATOMIC_RSH__
+#define __RS_ATOMIC_RSH__
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+
+/**
+ * Atomic add one to the value at addr.
+ * Equal to rsAtomicAdd(addr, 1)
+ *
+ * @param addr Address of value to increment
+ *
+ * @return old value
+ */
+extern int32_t __attribute__((overloadable))
+    rsAtomicInc(volatile int32_t* addr);
+
+/**
+ * Atomic subtract one from the value at addr. Equal to rsAtomicSub(addr, 1)
+ *
+ * @param addr Address of value to decrement
+ *
+ * @return old value
+ */
+extern int32_t __attribute__((overloadable))
+    rsAtomicDec(volatile int32_t* addr);
+
+/**
+ * Atomic add a value to the value at addr.  addr[0] += value
+ *
+ * @param addr Address of value to modify
+ * @param value Amount to add to the value at addr
+ *
+ * @return old value
+ */
+extern int32_t __attribute__((overloadable))
+    rsAtomicAdd(volatile int32_t* addr, int32_t value);
+
+/**
+ * Atomic Subtract a value from the value at addr.  addr[0] -= value
+ *
+ * @param addr Address of value to modify
+ * @param value Amount to subtract from the value at addr
+ *
+ * @return old value
+ */
+extern int32_t __attribute__((overloadable))
+    rsAtomicSub(volatile int32_t* addr, int32_t value);
+
+/**
+ * Atomic Bitwise and a value from the value at addr.  addr[0] &= value
+ *
+ * @param addr Address of value to modify
+ * @param value Amount to and with the value at addr
+ *
+ * @return old value
+ */
+extern int32_t __attribute__((overloadable))
+    rsAtomicAnd(volatile int32_t* addr, int32_t value);
+
+/**
+ * Atomic Bitwise or a value from the value at addr.  addr[0] |= value
+ *
+ * @param addr Address of value to modify
+ * @param value Amount to or with the value at addr
+ *
+ * @return old value
+ */
+extern int32_t __attribute__((overloadable))
+    rsAtomicOr(volatile int32_t* addr, int32_t value);
+
+/**
+ * Atomic Bitwise xor a value from the value at addr.  addr[0] ^= value
+ *
+ * @param addr Address of value to modify
+ * @param value Amount to xor with the value at addr
+ *
+ * @return old value
+ */
+extern int32_t __attribute__((overloadable))
+    rsAtomicXor(volatile int32_t* addr, int32_t value);
+
+/**
+ * Atomic Set the value at addr to the min of addr and value
+ * addr[0] = rsMin(addr[0], value)
+ *
+ * @param addr Address of value to modify
+ * @param value comparison value
+ *
+ * @return old value
+ */
+extern uint32_t __attribute__((overloadable))
+    rsAtomicMin(volatile uint32_t* addr, uint32_t value);
+/**
+ * Atomic Set the value at addr to the min of addr and value
+ * addr[0] = rsMin(addr[0], value)
+ *
+ * @param addr Address of value to modify
+ * @param value comparison value
+ *
+ * @return old value
+ */
+extern int32_t __attribute__((overloadable))
+    rsAtomicMin(volatile int32_t* addr, int32_t value);
+
+/**
+ * Atomic Set the value at addr to the max of addr and value
+ * addr[0] = rsMax(addr[0], value)
+ *
+ * @param addr Address of value to modify
+ * @param value comparison value
+ *
+ * @return old value
+ */
+extern uint32_t __attribute__((overloadable))
+    rsAtomicMax(volatile uint32_t* addr, uint32_t value);
+/**
+ * Atomic Set the value at addr to the max of addr and value
+ * addr[0] = rsMax(addr[0], value)
+ *
+ * @param addr Address of value to modify
+ * @param value comparison value
+ *
+ * @return old value
+ */
+extern int32_t __attribute__((overloadable))
+    rsAtomicMax(volatile int32_t* addr, int32_t value);
+
+/**
+ * Compare-and-set operation with a full memory barrier.
+ *
+ * If the value at addr matches compareValue then newValue is written.
+ *
+ * @param addr The address to compare and replace if the compare passes.
+ * @param compareValue The value to test addr[0] against.
+ * @param newValue The value to write if the test passes.
+ *
+ * @return old value
+ */
+extern int32_t __attribute__((overloadable))
+    rsAtomicCas(volatile int32_t* addr, int32_t compareValue, int32_t newValue);
+
+/**
+ * Compare-and-set operation with a full memory barrier.
+ *
+ * If the value at addr matches compareValue then newValue is written.
+ *
+ * @param addr The address to compare and replace if the compare passes.
+ * @param compareValue The value to test addr[0] against.
+ * @param newValue The value to write if the test passes.
+ *
+ * @return old value
+ */
+extern uint32_t __attribute__((overloadable))
+    rsAtomicCas(volatile uint32_t* addr, uint32_t compareValue, uint32_t newValue);
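+/* Editorial usage sketch (not part of the original header): rsAtomicCas is
+ * typically used in a retry loop; for a volatile int32_t *addr, the following
+ * is equivalent to rsAtomicInc(addr):
+ *
+ *   int32_t old;
+ *   do {
+ *       old = *addr;
+ *   } while (rsAtomicCas(addr, old, old + 1) != old);
+ */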
+
+#endif //defined(RS_VERSION) && (RS_VERSION >= 14)
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 20))   // TODO: api 21
+
+/**
+ * Atomic add one to the value at addr.
+ * Equal to rsAtomicAdd(addr, 1)
+ *
+ * @param addr Address of value to increment
+ *
+ * @return old value
+ */
+extern int32_t __attribute__((overloadable))
+    rsAtomicInc(volatile uint32_t* addr);
+
+/**
+ * Atomic subtract one from the value at addr. Equal to rsAtomicSub(addr, 1)
+ *
+ * @param addr Address of value to decrement
+ *
+ * @return old value
+ */
+extern int32_t __attribute__((overloadable))
+    rsAtomicDec(volatile uint32_t* addr);
+
+/**
+ * Atomic add a value to the value at addr.  addr[0] += value
+ *
+ * @param addr Address of value to modify
+ * @param value Amount to add to the value at addr
+ *
+ * @return old value
+ */
+extern int32_t __attribute__((overloadable))
+    rsAtomicAdd(volatile uint32_t* addr, uint32_t value);
+
+/**
+ * Atomic Subtract a value from the value at addr.  addr[0] -= value
+ *
+ * @param addr Address of value to modify
+ * @param value Amount to subtract from the value at addr
+ *
+ * @return old value
+ */
+extern int32_t __attribute__((overloadable))
+    rsAtomicSub(volatile uint32_t* addr, uint32_t value);
+
+/**
+ * Atomic Bitwise and a value from the value at addr.  addr[0] &= value
+ *
+ * @param addr Address of value to modify
+ * @param value Amount to and with the value at addr
+ *
+ * @return old value
+ */
+extern int32_t __attribute__((overloadable))
+    rsAtomicAnd(volatile uint32_t* addr, uint32_t value);
+
+/**
+ * Atomic Bitwise or a value from the value at addr.  addr[0] |= value
+ *
+ * @param addr Address of value to modify
+ * @param value Amount to or with the value at addr
+ *
+ * @return old value
+ */
+extern int32_t __attribute__((overloadable))
+    rsAtomicOr(volatile uint32_t* addr, uint32_t value);
+
+/**
+ * Atomic Bitwise xor a value from the value at addr.  addr[0] ^= value
+ *
+ * @param addr Address of value to modify
+ * @param value Amount to xor with the value at addr
+ *
+ * @return old value
+ */
+extern int32_t __attribute__((overloadable))
+    rsAtomicXor(volatile uint32_t* addr, uint32_t value);
+
+#endif //defined(RS_VERSION) && (RS_VERSION >= 20)
+
+#endif
+
diff --git a/22.0.1/include/rs_core.rsh b/22.0.1/include/rs_core.rsh
new file mode 100644
index 0000000..3489e44
--- /dev/null
+++ b/22.0.1/include/rs_core.rsh
@@ -0,0 +1,191 @@
+/*
+ * Copyright (C) 2011-2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /*! \mainpage notitle
+  *
+  * RenderScript is a high-performance runtime that provides
+  * compute operations at the native level. RenderScript code is compiled on devices
+  * at runtime, which also allows for platform independence.
+  * This reference documentation describes the RenderScript runtime APIs, which you
+  * can utilize to write RenderScript code in C99. The RenderScript compute header
+  * files are automatically included for you.
+  *
+  * To use RenderScript, you need to utilize the RenderScript runtime APIs documented here
+  * as well as the Android framework APIs for RenderScript.
+  * For documentation on the Android framework APIs, see the <a target="_parent" href=
+  * "http://developer.android.com/reference/android/renderscript/package-summary.html">
+  * android.renderscript</a> package reference.
+  * For more information on how to develop with RenderScript and how the runtime and
+  * Android framework APIs interact, see the <a target="_parent" href=
+  * "http://developer.android.com/guide/topics/renderscript/index.html">RenderScript
+  * developer guide</a> and the <a target="_parent" href=
+  * "http://developer.android.com/resources/samples/RenderScript/index.html">
+  * RenderScript samples</a>.
+  */
+
+/** @file rs_core.rsh
+ *  \brief todo-jsams
+ *
+ *  todo-jsams
+ *
+ */
+
+#ifndef __RS_CORE_RSH__
+#define __RS_CORE_RSH__
+
+#define _RS_RUNTIME extern
+
+#define RS_KERNEL __attribute__((kernel))
+
+#include "rs_types.rsh"
+#include "rs_allocation.rsh"
+#include "rs_atomic.rsh"
+#include "rs_core_math.rsh"
+#include "rs_debug.rsh"
+#include "rs_element.rsh"
+#include "rs_math.rsh"
+#include "rs_matrix.rsh"
+#include "rs_object.rsh"
+#include "rs_quaternion.rsh"
+#include "rs_sampler.rsh"
+#include "rs_time.rsh"
+
+/**
+ * Send a message back to the client.  Will not block and returns true
+ * if the message was sendable and false if the fifo was full.
+ * A message ID is required.  Data payload is optional.
+ */
+extern bool __attribute__((overloadable))
+    rsSendToClient(int cmdID);
+/**
+ * \overload
+ */
+extern bool __attribute__((overloadable))
+    rsSendToClient(int cmdID, const void *data, uint len);
+/**
+ * Send a message back to the client, blocking until the message is queued.
+ * A message ID is required.  Data payload is optional.
+ */
+extern void __attribute__((overloadable))
+    rsSendToClientBlocking(int cmdID);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable))
+    rsSendToClientBlocking(int cmdID, const void *data, uint len);
+
+
+/**
+ * Launch order hint for rsForEach calls.  This provides a hint to the system to
+ * determine in which order the root function of the target is called with each
+ * cell of the allocation.
+ *
+ * This is a hint and implementations may not obey the order.
+ */
+enum rs_for_each_strategy {
+    RS_FOR_EACH_STRATEGY_SERIAL = 0,
+    RS_FOR_EACH_STRATEGY_DONT_CARE = 1,
+    RS_FOR_EACH_STRATEGY_DST_LINEAR = 2,
+    RS_FOR_EACH_STRATEGY_TILE_SMALL = 3,
+    RS_FOR_EACH_STRATEGY_TILE_MEDIUM = 4,
+    RS_FOR_EACH_STRATEGY_TILE_LARGE = 5
+};
+
+
+/**
+ * Structure to provide extra information to an rsForEach call.  Primarily used
+ * to restrict the call to a subset of cells in the allocation.
+ */
+typedef struct rs_script_call {
+    enum rs_for_each_strategy strategy;
+    uint32_t xStart;
+    uint32_t xEnd;
+    uint32_t yStart;
+    uint32_t yEnd;
+    uint32_t zStart;
+    uint32_t zEnd;
+    uint32_t arrayStart;
+    uint32_t arrayEnd;
+} rs_script_call_t;
+
+/**
+ * Make a script-to-script call to launch work. At least one of the input or
+ * output allocations is required to be a valid object. The input and output
+ * must be of the same dimensions.
+ * API 10-13
+ *
+ * @param script The target script to call
+ * @param input The allocation to source data from
+ * @param output The allocation to write data into
+ * @param usrData The user-defined params to pass to the root script.  May be
+ *                NULL.
+ * @param sc Extra control information used to select a sub-region of the
+ *           allocation to be processed or suggest a walking strategy.  May be
+ *           NULL.
+ *
+ */
+#if !defined(RS_VERSION) || (RS_VERSION < 14)
+extern void __attribute__((overloadable))
+    rsForEach(rs_script script, rs_allocation input,
+              rs_allocation output, const void * usrData,
+              const rs_script_call_t *sc);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable))
+    rsForEach(rs_script script, rs_allocation input,
+              rs_allocation output, const void * usrData);
+#else
+
+/**
+ * Make a script-to-script call to launch work. At least one of the input or
+ * output allocations is required to be a valid object. The input and output
+ * must be of the same dimensions.
+ * API 14+
+ *
+ * @param script The target script to call
+ * @param input The allocation to source data from
+ * @param output The allocation to write data into
+ * @param usrData The user-defined params to pass to the root script.  May be
+ *                NULL.
+ * @param usrDataLen The size of the usrData structure.  This will be used to
+ *                   perform a shallow copy of the data if necessary.
+ * @param sc Extra control information used to select a sub-region of the
+ *           allocation to be processed or suggest a walking strategy.  May be
+ *           NULL.
+ *
+ */
+extern void __attribute__((overloadable))
+    rsForEach(rs_script script, rs_allocation input, rs_allocation output,
+              const void * usrData, size_t usrDataLen, const rs_script_call_t *);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable))
+    rsForEach(rs_script script, rs_allocation input, rs_allocation output,
+              const void * usrData, size_t usrDataLen);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable))
+    rsForEach(rs_script script, rs_allocation input, rs_allocation output);
+#endif
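+/* Editorial usage sketch (not part of the original header), for the API 14+
+ * overload above: launch over a 64x64 sub-region by filling in an
+ * rs_script_call_t (zero-initialized fields default to 0 and
+ * RS_FOR_EACH_STRATEGY_SERIAL). Assuming "script", "in", and "out" are valid:
+ *
+ *   rs_script_call_t sc = {0};
+ *   sc.xStart = 0;  sc.xEnd = 64;
+ *   sc.yStart = 0;  sc.yEnd = 64;
+ *   rsForEach(script, in, out, NULL, 0, &sc);
+ */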
+
+
+
+#undef _RS_RUNTIME
+
+#endif
diff --git a/22.0.1/include/rs_core_math.rsh b/22.0.1/include/rs_core_math.rsh
new file mode 100644
index 0000000..287a1b9
--- /dev/null
+++ b/22.0.1/include/rs_core_math.rsh
@@ -0,0 +1,9888 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Don't edit this file!  It is auto-generated by frameworks/rs/api/gen_runtime.
+
+#ifndef __rs_core_math_rsh__
+#define __rs_core_math_rsh__
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the absolute value of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uchar __attribute__((const, overloadable))abs(char value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the absolute value of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uchar2 __attribute__((const, overloadable))abs(char2 value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the absolute value of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uchar3 __attribute__((const, overloadable))abs(char3 value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the absolute value of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uchar4 __attribute__((const, overloadable))abs(char4 value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the absolute value of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern ushort __attribute__((const, overloadable))abs(short value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the absolute value of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern ushort2 __attribute__((const, overloadable))abs(short2 value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the absolute value of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern ushort3 __attribute__((const, overloadable))abs(short3 value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the absolute value of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern ushort4 __attribute__((const, overloadable))abs(short4 value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the absolute value of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uint __attribute__((const, overloadable))abs(int value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the absolute value of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uint2 __attribute__((const, overloadable))abs(int2 value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the absolute value of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uint3 __attribute__((const, overloadable))abs(int3 value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the absolute value of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uint4 __attribute__((const, overloadable))abs(int4 value);
+#endif
+
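+/* Illustrative sketch (editorial addition, not produced by gen_runtime):
+ * abs() on a signed integer argument returns the matching unsigned type,
+ * so the result can represent abs() of the most negative input.
+ *
+ *   char4  c = {-1, 2, -3, 4};
+ *   uchar4 u = abs(c);            // u == {1, 2, 3, 4}
+ */
+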
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse cosine.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))acos(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse cosine.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))acos(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse cosine.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))acos(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse cosine.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))acos(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse hyperbolic cosine.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))acosh(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse hyperbolic cosine.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))acosh(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse hyperbolic cosine.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))acosh(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse hyperbolic cosine.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))acosh(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse cosine divided by PI.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))acospi(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse cosine divided by PI.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))acospi(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse cosine divided by PI.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))acospi(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse cosine divided by PI.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))acospi(float4 v);
+#endif
+
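+/* Illustrative sketch (editorial addition, not produced by gen_runtime):
+ * acospi(v) is the inverse cosine scaled by 1 / PI, i.e. acos(v) / PI.
+ *
+ *   float a = acospi(1.f);        // 0.f
+ *   float b = acospi(0.f);        // 0.5f
+ *   float c = acospi(-1.f);       // 1.f
+ */
+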
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse sine.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))asin(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse sine.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))asin(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse sine.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))asin(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse sine.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))asin(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse hyperbolic sine.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))asinh(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse hyperbolic sine.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))asinh(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse hyperbolic sine.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))asinh(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse hyperbolic sine.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))asinh(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse sine divided by PI.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))asinpi(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse sine divided by PI.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))asinpi(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse sine divided by PI.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))asinpi(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse sine divided by PI.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))asinpi(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse tangent.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))atan(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse tangent.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))atan(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse tangent.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))atan(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse tangent.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))atan(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse tangent of y / x.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))atan2(float y, float x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse tangent of y / x.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))atan2(float2 y, float2 x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse tangent of y / x.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))atan2(float3 y, float3 x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse tangent of y / x.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))atan2(float4 y, float4 x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse tangent of y / x, divided by PI.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))atan2pi(float y, float x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse tangent of y / x, divided by PI.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))atan2pi(float2 y, float2 x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse tangent of y / x, divided by PI.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))atan2pi(float3 y, float3 x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse tangent of y / x, divided by PI.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))atan2pi(float4 y, float4 x);
+#endif
+
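+/* Illustrative sketch (editorial addition, not produced by gen_runtime):
+ * atan2() takes y first and x second; atan2pi() returns the same angle
+ * divided by PI.
+ *
+ *   float a = atan2(1.f, 1.f);    // PI / 4, about 0.7854f
+ *   float b = atan2pi(1.f, 1.f);  // 0.25f
+ */
+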
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse hyperbolic tangent.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))atanh(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse hyperbolic tangent.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))atanh(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse hyperbolic tangent.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))atanh(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse hyperbolic tangent.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))atanh(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse tangent divided by PI.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))atanpi(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse tangent divided by PI.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))atanpi(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse tangent divided by PI.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))atanpi(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the inverse tangent divided by PI.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))atanpi(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the cube root.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))cbrt(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the cube root.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))cbrt(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the cube root.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))cbrt(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the cube root.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))cbrt(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the smallest integer not less than a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))ceil(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the smallest integer not less than a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))ceil(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the smallest integer not less than a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))ceil(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the smallest integer not less than a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))ceil(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))clamp(float value, float min_value, float max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))clamp(float2 value, float2 min_value, float2 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))clamp(float3 value, float3 min_value, float3 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))clamp(float4 value, float4 min_value, float4 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))clamp(float2 value, float min_value, float max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))clamp(float3 value, float min_value, float max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))clamp(float4 value, float min_value, float max_value);
+#endif
+
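+/* Illustrative sketch (editorial addition, not produced by gen_runtime):
+ * the float clamp() overloads above accept either matching vector bounds
+ * or scalar bounds that are applied to every component.
+ *
+ *   float2 v  = {-0.5f, 1.5f};
+ *   float2 r  = clamp(v, 0.f, 1.f);   // r == {0.f, 1.f}
+ *   float2 lo = {0.f, 1.f};
+ *   float2 hi = {1.f, 2.f};
+ *   float2 s  = clamp(v, lo, hi);     // s == {0.f, 1.5f}
+ */
+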
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern char __attribute__((const, overloadable))clamp(char value, char min_value, char max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern char2 __attribute__((const, overloadable))clamp(char2 value, char2 min_value, char2 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern char3 __attribute__((const, overloadable))clamp(char3 value, char3 min_value, char3 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern char4 __attribute__((const, overloadable))clamp(char4 value, char4 min_value, char4 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern uchar __attribute__((const, overloadable))clamp(uchar value, uchar min_value, uchar max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern uchar2 __attribute__((const, overloadable))clamp(uchar2 value, uchar2 min_value, uchar2 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern uchar3 __attribute__((const, overloadable))clamp(uchar3 value, uchar3 min_value, uchar3 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern uchar4 __attribute__((const, overloadable))clamp(uchar4 value, uchar4 min_value, uchar4 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern short __attribute__((const, overloadable))clamp(short value, short min_value, short max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern short2 __attribute__((const, overloadable))clamp(short2 value, short2 min_value, short2 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern short3 __attribute__((const, overloadable))clamp(short3 value, short3 min_value, short3 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern short4 __attribute__((const, overloadable))clamp(short4 value, short4 min_value, short4 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern ushort __attribute__((const, overloadable))clamp(ushort value, ushort min_value, ushort max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern ushort2 __attribute__((const, overloadable))clamp(ushort2 value, ushort2 min_value, ushort2 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern ushort3 __attribute__((const, overloadable))clamp(ushort3 value, ushort3 min_value, ushort3 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern ushort4 __attribute__((const, overloadable))clamp(ushort4 value, ushort4 min_value, ushort4 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern int __attribute__((const, overloadable))clamp(int value, int min_value, int max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern int2 __attribute__((const, overloadable))clamp(int2 value, int2 min_value, int2 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern int3 __attribute__((const, overloadable))clamp(int3 value, int3 min_value, int3 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern int4 __attribute__((const, overloadable))clamp(int4 value, int4 min_value, int4 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern uint __attribute__((const, overloadable))clamp(uint value, uint min_value, uint max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern uint2 __attribute__((const, overloadable))clamp(uint2 value, uint2 min_value, uint2 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern uint3 __attribute__((const, overloadable))clamp(uint3 value, uint3 min_value, uint3 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern uint4 __attribute__((const, overloadable))clamp(uint4 value, uint4 min_value, uint4 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern long __attribute__((const, overloadable))clamp(long value, long min_value, long max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern long2 __attribute__((const, overloadable))clamp(long2 value, long2 min_value, long2 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern long3 __attribute__((const, overloadable))clamp(long3 value, long3 min_value, long3 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern long4 __attribute__((const, overloadable))clamp(long4 value, long4 min_value, long4 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern ulong __attribute__((const, overloadable))clamp(ulong value, ulong min_value, ulong max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern ulong2 __attribute__((const, overloadable))clamp(ulong2 value, ulong2 min_value, ulong2 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern ulong3 __attribute__((const, overloadable))clamp(ulong3 value, ulong3 min_value, ulong3 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern ulong4 __attribute__((const, overloadable))clamp(ulong4 value, ulong4 min_value, ulong4 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern char2 __attribute__((const, overloadable))clamp(char2 value, char min_value, char max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern char3 __attribute__((const, overloadable))clamp(char3 value, char min_value, char max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern char4 __attribute__((const, overloadable))clamp(char4 value, char min_value, char max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern uchar2 __attribute__((const, overloadable))clamp(uchar2 value, uchar min_value, uchar max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern uchar3 __attribute__((const, overloadable))clamp(uchar3 value, uchar min_value, uchar max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern uchar4 __attribute__((const, overloadable))clamp(uchar4 value, uchar min_value, uchar max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern short2 __attribute__((const, overloadable))clamp(short2 value, short min_value, short max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern short3 __attribute__((const, overloadable))clamp(short3 value, short min_value, short max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern short4 __attribute__((const, overloadable))clamp(short4 value, short min_value, short max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern ushort2 __attribute__((const, overloadable))clamp(ushort2 value, ushort min_value, ushort max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern ushort3 __attribute__((const, overloadable))clamp(ushort3 value, ushort min_value, ushort max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern ushort4 __attribute__((const, overloadable))clamp(ushort4 value, ushort min_value, ushort max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern int2 __attribute__((const, overloadable))clamp(int2 value, int min_value, int max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern int3 __attribute__((const, overloadable))clamp(int3 value, int min_value, int max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern int4 __attribute__((const, overloadable))clamp(int4 value, int min_value, int max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern uint2 __attribute__((const, overloadable))clamp(uint2 value, uint min_value, uint max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern uint3 __attribute__((const, overloadable))clamp(uint3 value, uint min_value, uint max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern uint4 __attribute__((const, overloadable))clamp(uint4 value, uint min_value, uint max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern long2 __attribute__((const, overloadable))clamp(long2 value, long min_value, long max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern long3 __attribute__((const, overloadable))clamp(long3 value, long min_value, long max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern long4 __attribute__((const, overloadable))clamp(long4 value, long min_value, long max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern ulong2 __attribute__((const, overloadable))clamp(ulong2 value, ulong min_value, ulong max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern ulong3 __attribute__((const, overloadable))clamp(ulong3 value, ulong min_value, ulong max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+/*
+ * Clamp a value to a specified high and low bound.
+ *
+ * @param amount value to be clamped.  Supports 1,2,3,4 components
+ * @param min_value Lower bound, must be scalar or matching vector.
+ * @param max_value High bound, must match type of low
+ *
+ * Supported by API versions 19 and newer.
+ */
+extern ulong4 __attribute__((const, overloadable))clamp(ulong4 value, ulong min_value, ulong max_value);
+#endif
+
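+/* Illustrative sketch (editorial addition, not produced by gen_runtime):
+ * the integer clamp() overloads above require RS_VERSION >= 19 and, like
+ * the float forms, accept scalar bounds for vector values.
+ *
+ *   int3 v = {-5, 7, 100};
+ *   int3 r = clamp(v, 0, 10);     // r == {0, 7, 10}
+ */
+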
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the number of leading 0-bits in a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern char __attribute__((const, overloadable))clz(char value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the number of leading 0-bits in a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern char2 __attribute__((const, overloadable))clz(char2 value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the number of leading 0-bits in a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern char3 __attribute__((const, overloadable))clz(char3 value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the number of leading 0-bits in a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern char4 __attribute__((const, overloadable))clz(char4 value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the number of leading 0-bits in a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uchar __attribute__((const, overloadable))clz(uchar value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the number of leading 0-bits in a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uchar2 __attribute__((const, overloadable))clz(uchar2 value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the number of leading 0-bits in a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uchar3 __attribute__((const, overloadable))clz(uchar3 value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the number of leading 0-bits in a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uchar4 __attribute__((const, overloadable))clz(uchar4 value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the number of leading 0-bits in a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern short __attribute__((const, overloadable))clz(short value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the number of leading 0-bits in a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern short2 __attribute__((const, overloadable))clz(short2 value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the number of leading 0-bits in a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern short3 __attribute__((const, overloadable))clz(short3 value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the number of leading 0-bits in a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern short4 __attribute__((const, overloadable))clz(short4 value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the number of leading 0-bits in a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern ushort __attribute__((const, overloadable))clz(ushort value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the number of leading 0-bits in a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern ushort2 __attribute__((const, overloadable))clz(ushort2 value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the number of leading 0-bits in a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern ushort3 __attribute__((const, overloadable))clz(ushort3 value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the number of leading 0-bits in a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern ushort4 __attribute__((const, overloadable))clz(ushort4 value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the number of leading 0-bits in a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern int __attribute__((const, overloadable))clz(int value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the number of leading 0-bits in a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern int2 __attribute__((const, overloadable))clz(int2 value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the number of leading 0-bits in a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern int3 __attribute__((const, overloadable))clz(int3 value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the number of leading 0-bits in a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern int4 __attribute__((const, overloadable))clz(int4 value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the number of leading 0-bits in a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uint __attribute__((const, overloadable))clz(uint value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the number of leading 0-bits in a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uint2 __attribute__((const, overloadable))clz(uint2 value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the number of leading 0-bits in a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uint3 __attribute__((const, overloadable))clz(uint3 value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the number of leading 0-bits in a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uint4 __attribute__((const, overloadable))clz(uint4 value);
+#endif
+
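+/* Illustrative sketch (editorial addition, not produced by gen_runtime):
+ * clz() counts leading zero bits within the width of the argument type,
+ * shown here for 32-bit uint.
+ *
+ *   uint a = clz(1u);             // 31
+ *   uint b = clz(0x80000000u);    // 0
+ */
+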
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from float2 to float2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))convert_float2(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from float3 to float3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))convert_float3(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from float4 to float4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))convert_float4(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from char2 to float2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))convert_float2(char2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from char3 to float3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))convert_float3(char3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from char4 to float4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))convert_float4(char4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uchar2 to float2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))convert_float2(uchar2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uchar3 to float3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))convert_float3(uchar3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uchar4 to float4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))convert_float4(uchar4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from short2 to float2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))convert_float2(short2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from short3 to float3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))convert_float3(short3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from short4 to float4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))convert_float4(short4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from ushort2 to float2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))convert_float2(ushort2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from ushort3 to float3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))convert_float3(ushort3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from ushort4 to float4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))convert_float4(ushort4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from int2 to float2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))convert_float2(int2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from int3 to float3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))convert_float3(int3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from int4 to float4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))convert_float4(int4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uint2 to float2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))convert_float2(uint2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uint3 to float3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))convert_float3(uint3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uint4 to float4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))convert_float4(uint4 v);
+#endif
+
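+/* Illustrative sketch (editorial addition, not produced by gen_runtime):
+ * the convert_* functions perform a plain per-component type conversion;
+ * values are not rescaled or normalized.
+ *
+ *   uchar4 c = {0, 128, 200, 255};
+ *   float4 f = convert_float4(c); // f == {0.f, 128.f, 200.f, 255.f}
+ */
+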
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from float2 to char2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern char2 __attribute__((const, overloadable))convert_char2(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from float3 to char3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern char3 __attribute__((const, overloadable))convert_char3(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from float4 to char4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern char4 __attribute__((const, overloadable))convert_char4(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from char2 to char2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern char2 __attribute__((const, overloadable))convert_char2(char2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from char3 to char3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern char3 __attribute__((const, overloadable))convert_char3(char3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from char4 to char4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern char4 __attribute__((const, overloadable))convert_char4(char4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uchar2 to char2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern char2 __attribute__((const, overloadable))convert_char2(uchar2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uchar3 to char3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern char3 __attribute__((const, overloadable))convert_char3(uchar3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uchar4 to char4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern char4 __attribute__((const, overloadable))convert_char4(uchar4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from short2 to char2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern char2 __attribute__((const, overloadable))convert_char2(short2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from short3 to char3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern char3 __attribute__((const, overloadable))convert_char3(short3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from short4 to char4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern char4 __attribute__((const, overloadable))convert_char4(short4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from ushort2 to char2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern char2 __attribute__((const, overloadable))convert_char2(ushort2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from ushort3 to char3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern char3 __attribute__((const, overloadable))convert_char3(ushort3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from ushort4 to char4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern char4 __attribute__((const, overloadable))convert_char4(ushort4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from int2 to char2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern char2 __attribute__((const, overloadable))convert_char2(int2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from int3 to char3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern char3 __attribute__((const, overloadable))convert_char3(int3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from int4 to char4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern char4 __attribute__((const, overloadable))convert_char4(int4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uint2 to char2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern char2 __attribute__((const, overloadable))convert_char2(uint2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uint3 to char3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern char3 __attribute__((const, overloadable))convert_char3(uint3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uint4 to char4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern char4 __attribute__((const, overloadable))convert_char4(uint4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from float2 to uchar2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uchar2 __attribute__((const, overloadable))convert_uchar2(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from float3 to uchar3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uchar3 __attribute__((const, overloadable))convert_uchar3(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from float4 to uchar4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uchar4 __attribute__((const, overloadable))convert_uchar4(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from char2 to uchar2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uchar2 __attribute__((const, overloadable))convert_uchar2(char2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from char3 to uchar3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uchar3 __attribute__((const, overloadable))convert_uchar3(char3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from char4 to uchar4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uchar4 __attribute__((const, overloadable))convert_uchar4(char4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uchar2 to uchar2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uchar2 __attribute__((const, overloadable))convert_uchar2(uchar2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uchar3 to uchar3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uchar3 __attribute__((const, overloadable))convert_uchar3(uchar3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uchar4 to uchar4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uchar4 __attribute__((const, overloadable))convert_uchar4(uchar4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from short2 to uchar2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uchar2 __attribute__((const, overloadable))convert_uchar2(short2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from short3 to uchar3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uchar3 __attribute__((const, overloadable))convert_uchar3(short3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from short4 to uchar4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uchar4 __attribute__((const, overloadable))convert_uchar4(short4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from ushort2 to uchar2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uchar2 __attribute__((const, overloadable))convert_uchar2(ushort2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from ushort3 to uchar3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uchar3 __attribute__((const, overloadable))convert_uchar3(ushort3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from ushort4 to uchar4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uchar4 __attribute__((const, overloadable))convert_uchar4(ushort4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from int2 to uchar2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uchar2 __attribute__((const, overloadable))convert_uchar2(int2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from int3 to uchar3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uchar3 __attribute__((const, overloadable))convert_uchar3(int3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from int4 to uchar4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uchar4 __attribute__((const, overloadable))convert_uchar4(int4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uint2 to uchar2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uchar2 __attribute__((const, overloadable))convert_uchar2(uint2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uint3 to uchar3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uchar3 __attribute__((const, overloadable))convert_uchar3(uint3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uint4 to uchar4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uchar4 __attribute__((const, overloadable))convert_uchar4(uint4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from float2 to short2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern short2 __attribute__((const, overloadable))convert_short2(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from float3 to short3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern short3 __attribute__((const, overloadable))convert_short3(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from float4 to short4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern short4 __attribute__((const, overloadable))convert_short4(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from char2 to short2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern short2 __attribute__((const, overloadable))convert_short2(char2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from char3 to short3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern short3 __attribute__((const, overloadable))convert_short3(char3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from char4 to short4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern short4 __attribute__((const, overloadable))convert_short4(char4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uchar2 to short2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern short2 __attribute__((const, overloadable))convert_short2(uchar2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uchar3 to short3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern short3 __attribute__((const, overloadable))convert_short3(uchar3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uchar4 to short4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern short4 __attribute__((const, overloadable))convert_short4(uchar4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from short2 to short2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern short2 __attribute__((const, overloadable))convert_short2(short2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from short3 to short3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern short3 __attribute__((const, overloadable))convert_short3(short3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from short4 to short4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern short4 __attribute__((const, overloadable))convert_short4(short4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from ushort2 to short2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern short2 __attribute__((const, overloadable))convert_short2(ushort2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from ushort3 to short3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern short3 __attribute__((const, overloadable))convert_short3(ushort3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from ushort4 to short4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern short4 __attribute__((const, overloadable))convert_short4(ushort4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from int2 to short2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern short2 __attribute__((const, overloadable))convert_short2(int2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from int3 to short3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern short3 __attribute__((const, overloadable))convert_short3(int3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from int4 to short4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern short4 __attribute__((const, overloadable))convert_short4(int4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uint2 to short2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern short2 __attribute__((const, overloadable))convert_short2(uint2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uint3 to short3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern short3 __attribute__((const, overloadable))convert_short3(uint3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uint4 to short4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern short4 __attribute__((const, overloadable))convert_short4(uint4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from float2 to ushort2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern ushort2 __attribute__((const, overloadable))convert_ushort2(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from float3 to ushort3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern ushort3 __attribute__((const, overloadable))convert_ushort3(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from float4 to ushort4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern ushort4 __attribute__((const, overloadable))convert_ushort4(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from char2 to ushort2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern ushort2 __attribute__((const, overloadable))convert_ushort2(char2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from char3 to ushort3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern ushort3 __attribute__((const, overloadable))convert_ushort3(char3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from char4 to ushort4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern ushort4 __attribute__((const, overloadable))convert_ushort4(char4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uchar2 to ushort2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern ushort2 __attribute__((const, overloadable))convert_ushort2(uchar2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uchar3 to ushort3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern ushort3 __attribute__((const, overloadable))convert_ushort3(uchar3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uchar4 to ushort4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern ushort4 __attribute__((const, overloadable))convert_ushort4(uchar4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from short2 to ushort2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern ushort2 __attribute__((const, overloadable))convert_ushort2(short2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from short3 to ushort3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern ushort3 __attribute__((const, overloadable))convert_ushort3(short3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from short4 to ushort4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern ushort4 __attribute__((const, overloadable))convert_ushort4(short4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from ushort2 to ushort2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern ushort2 __attribute__((const, overloadable))convert_ushort2(ushort2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from ushort3 to ushort3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern ushort3 __attribute__((const, overloadable))convert_ushort3(ushort3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from ushort4 to ushort4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern ushort4 __attribute__((const, overloadable))convert_ushort4(ushort4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from int2 to ushort2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern ushort2 __attribute__((const, overloadable))convert_ushort2(int2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from int3 to ushort3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern ushort3 __attribute__((const, overloadable))convert_ushort3(int3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from int4 to ushort4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern ushort4 __attribute__((const, overloadable))convert_ushort4(int4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uint2 to ushort2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern ushort2 __attribute__((const, overloadable))convert_ushort2(uint2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uint3 to ushort3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern ushort3 __attribute__((const, overloadable))convert_ushort3(uint3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uint4 to ushort4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern ushort4 __attribute__((const, overloadable))convert_ushort4(uint4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from float2 to int2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern int2 __attribute__((const, overloadable))convert_int2(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from float3 to int3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern int3 __attribute__((const, overloadable))convert_int3(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from float4 to int4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern int4 __attribute__((const, overloadable))convert_int4(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from char2 to int2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern int2 __attribute__((const, overloadable))convert_int2(char2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from char3 to int3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern int3 __attribute__((const, overloadable))convert_int3(char3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from char4 to int4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern int4 __attribute__((const, overloadable))convert_int4(char4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uchar2 to int2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern int2 __attribute__((const, overloadable))convert_int2(uchar2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uchar3 to int3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern int3 __attribute__((const, overloadable))convert_int3(uchar3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uchar4 to int4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern int4 __attribute__((const, overloadable))convert_int4(uchar4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from short2 to int2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern int2 __attribute__((const, overloadable))convert_int2(short2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from short3 to int3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern int3 __attribute__((const, overloadable))convert_int3(short3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from short4 to int4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern int4 __attribute__((const, overloadable))convert_int4(short4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from ushort2 to int2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern int2 __attribute__((const, overloadable))convert_int2(ushort2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from ushort3 to int3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern int3 __attribute__((const, overloadable))convert_int3(ushort3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from ushort4 to int4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern int4 __attribute__((const, overloadable))convert_int4(ushort4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from int2 to int2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern int2 __attribute__((const, overloadable))convert_int2(int2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from int3 to int3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern int3 __attribute__((const, overloadable))convert_int3(int3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from int4 to int4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern int4 __attribute__((const, overloadable))convert_int4(int4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uint2 to int2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern int2 __attribute__((const, overloadable))convert_int2(uint2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uint3 to int3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern int3 __attribute__((const, overloadable))convert_int3(uint3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uint4 to int4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern int4 __attribute__((const, overloadable))convert_int4(uint4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from float2 to uint2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uint2 __attribute__((const, overloadable))convert_uint2(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from float3 to uint3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uint3 __attribute__((const, overloadable))convert_uint3(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from float4 to uint4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uint4 __attribute__((const, overloadable))convert_uint4(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from char2 to uint2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uint2 __attribute__((const, overloadable))convert_uint2(char2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from char3 to uint3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uint3 __attribute__((const, overloadable))convert_uint3(char3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from char4 to uint4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uint4 __attribute__((const, overloadable))convert_uint4(char4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uchar2 to uint2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uint2 __attribute__((const, overloadable))convert_uint2(uchar2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uchar3 to uint3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uint3 __attribute__((const, overloadable))convert_uint3(uchar3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uchar4 to uint4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uint4 __attribute__((const, overloadable))convert_uint4(uchar4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from short2 to uint2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uint2 __attribute__((const, overloadable))convert_uint2(short2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from short3 to uint3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uint3 __attribute__((const, overloadable))convert_uint3(short3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from short4 to uint4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uint4 __attribute__((const, overloadable))convert_uint4(short4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from ushort2 to uint2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uint2 __attribute__((const, overloadable))convert_uint2(ushort2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from ushort3 to uint3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uint3 __attribute__((const, overloadable))convert_uint3(ushort3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from ushort4 to uint4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uint4 __attribute__((const, overloadable))convert_uint4(ushort4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from int2 to uint2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uint2 __attribute__((const, overloadable))convert_uint2(int2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from int3 to uint3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uint3 __attribute__((const, overloadable))convert_uint3(int3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from int4 to uint4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uint4 __attribute__((const, overloadable))convert_uint4(int4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uint2 to uint2
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uint2 __attribute__((const, overloadable))convert_uint2(uint2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uint3 to uint3
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uint3 __attribute__((const, overloadable))convert_uint3(uint3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Component wise conversion from uint4 to uint4
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern uint4 __attribute__((const, overloadable))convert_uint4(uint4 v);
+#endif
+
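+/*
+ * Usage sketch (editorial illustration, not part of the generated API):
+ * each convert_* overload above converts every component of its input
+ * vector in a single call.  Assuming a hypothetical helper named darken,
+ * a round trip through float for an 8-bit pixel could look like:
+ *
+ *   static uchar4 darken(uchar4 pixel) {
+ *       float4 f = convert_float4(pixel);  // widen each channel to float
+ *       f *= 0.5f;                         // per-component arithmetic
+ *       return convert_uchar4(f);          // narrow back to uchar
+ *   }
+ *
+ * convert_uchar4(float4) is declared above; convert_float4(uchar4) is
+ * declared elsewhere in this header, also at API level 9, so no further
+ * RS_VERSION guard is needed.
+ */
+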
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from double2 to double2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern double2 __attribute__((const, overloadable))convert_double2(double2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from double3 to double3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern double3 __attribute__((const, overloadable))convert_double3(double3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from double4 to double4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern double4 __attribute__((const, overloadable))convert_double4(double4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from long2 to double2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern double2 __attribute__((const, overloadable))convert_double2(long2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from long3 to double3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern double3 __attribute__((const, overloadable))convert_double3(long3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from long4 to double4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern double4 __attribute__((const, overloadable))convert_double4(long4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ulong2 to double2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern double2 __attribute__((const, overloadable))convert_double2(ulong2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ulong3 to double3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern double3 __attribute__((const, overloadable))convert_double3(ulong3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ulong4 to double4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern double4 __attribute__((const, overloadable))convert_double4(ulong4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from double2 to long2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long2 __attribute__((const, overloadable))convert_long2(double2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from double3 to long3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long3 __attribute__((const, overloadable))convert_long3(double3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from double4 to long4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long4 __attribute__((const, overloadable))convert_long4(double4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from long2 to long2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long2 __attribute__((const, overloadable))convert_long2(long2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from long3 to long3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long3 __attribute__((const, overloadable))convert_long3(long3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from long4 to long4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long4 __attribute__((const, overloadable))convert_long4(long4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ulong2 to long2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long2 __attribute__((const, overloadable))convert_long2(ulong2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ulong3 to long3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long3 __attribute__((const, overloadable))convert_long3(ulong3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ulong4 to long4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long4 __attribute__((const, overloadable))convert_long4(ulong4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from double2 to ulong2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong2 __attribute__((const, overloadable))convert_ulong2(double2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from double3 to ulong3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong3 __attribute__((const, overloadable))convert_ulong3(double3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from double4 to ulong4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong4 __attribute__((const, overloadable))convert_ulong4(double4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from long2 to ulong2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong2 __attribute__((const, overloadable))convert_ulong2(long2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from long3 to ulong3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong3 __attribute__((const, overloadable))convert_ulong3(long3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from long4 to ulong4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong4 __attribute__((const, overloadable))convert_ulong4(long4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ulong2 to ulong2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong2 __attribute__((const, overloadable))convert_ulong2(ulong2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ulong3 to ulong3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong3 __attribute__((const, overloadable))convert_ulong3(ulong3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ulong4 to ulong4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong4 __attribute__((const, overloadable))convert_ulong4(ulong4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from double2 to float2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))convert_float2(double2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from double3 to float3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))convert_float3(double3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from double4 to float4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))convert_float4(double4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from long2 to float2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))convert_float2(long2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from long3 to float3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))convert_float3(long3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from long4 to float4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))convert_float4(long4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ulong2 to float2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))convert_float2(ulong2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ulong3 to float3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))convert_float3(ulong3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ulong4 to float4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))convert_float4(ulong4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from double2 to char2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern char2 __attribute__((const, overloadable))convert_char2(double2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from double3 to char3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern char3 __attribute__((const, overloadable))convert_char3(double3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from double4 to char4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern char4 __attribute__((const, overloadable))convert_char4(double4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from long2 to char2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern char2 __attribute__((const, overloadable))convert_char2(long2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from long3 to char3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern char3 __attribute__((const, overloadable))convert_char3(long3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from long4 to char4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern char4 __attribute__((const, overloadable))convert_char4(long4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ulong2 to char2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern char2 __attribute__((const, overloadable))convert_char2(ulong2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ulong3 to char3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern char3 __attribute__((const, overloadable))convert_char3(ulong3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ulong4 to char4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern char4 __attribute__((const, overloadable))convert_char4(ulong4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from double2 to uchar2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern uchar2 __attribute__((const, overloadable))convert_uchar2(double2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from double3 to uchar3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern uchar3 __attribute__((const, overloadable))convert_uchar3(double3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from double4 to uchar4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern uchar4 __attribute__((const, overloadable))convert_uchar4(double4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from long2 to uchar2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern uchar2 __attribute__((const, overloadable))convert_uchar2(long2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from long3 to uchar3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern uchar3 __attribute__((const, overloadable))convert_uchar3(long3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from long4 to uchar4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern uchar4 __attribute__((const, overloadable))convert_uchar4(long4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ulong2 to uchar2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern uchar2 __attribute__((const, overloadable))convert_uchar2(ulong2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ulong3 to uchar3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern uchar3 __attribute__((const, overloadable))convert_uchar3(ulong3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ulong4 to uchar4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern uchar4 __attribute__((const, overloadable))convert_uchar4(ulong4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from double2 to short2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern short2 __attribute__((const, overloadable))convert_short2(double2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from double3 to short3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern short3 __attribute__((const, overloadable))convert_short3(double3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from double4 to short4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern short4 __attribute__((const, overloadable))convert_short4(double4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from long2 to short2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern short2 __attribute__((const, overloadable))convert_short2(long2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from long3 to short3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern short3 __attribute__((const, overloadable))convert_short3(long3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from long4 to short4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern short4 __attribute__((const, overloadable))convert_short4(long4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ulong2 to short2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern short2 __attribute__((const, overloadable))convert_short2(ulong2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ulong3 to short3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern short3 __attribute__((const, overloadable))convert_short3(ulong3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ulong4 to short4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern short4 __attribute__((const, overloadable))convert_short4(ulong4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from double2 to ushort2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ushort2 __attribute__((const, overloadable))convert_ushort2(double2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from double3 to ushort3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ushort3 __attribute__((const, overloadable))convert_ushort3(double3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from double4 to ushort4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ushort4 __attribute__((const, overloadable))convert_ushort4(double4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from long2 to ushort2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ushort2 __attribute__((const, overloadable))convert_ushort2(long2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from long3 to ushort3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ushort3 __attribute__((const, overloadable))convert_ushort3(long3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from long4 to ushort4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ushort4 __attribute__((const, overloadable))convert_ushort4(long4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ulong2 to ushort2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ushort2 __attribute__((const, overloadable))convert_ushort2(ulong2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ulong3 to ushort3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ushort3 __attribute__((const, overloadable))convert_ushort3(ulong3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ulong4 to ushort4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ushort4 __attribute__((const, overloadable))convert_ushort4(ulong4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from double2 to int2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern int2 __attribute__((const, overloadable))convert_int2(double2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from double3 to int3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern int3 __attribute__((const, overloadable))convert_int3(double3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from double4 to int4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern int4 __attribute__((const, overloadable))convert_int4(double4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from long2 to int2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern int2 __attribute__((const, overloadable))convert_int2(long2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from long3 to int3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern int3 __attribute__((const, overloadable))convert_int3(long3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from long4 to int4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern int4 __attribute__((const, overloadable))convert_int4(long4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ulong2 to int2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern int2 __attribute__((const, overloadable))convert_int2(ulong2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ulong3 to int3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern int3 __attribute__((const, overloadable))convert_int3(ulong3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ulong4 to int4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern int4 __attribute__((const, overloadable))convert_int4(ulong4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from double2 to uint2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern uint2 __attribute__((const, overloadable))convert_uint2(double2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from double3 to uint3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern uint3 __attribute__((const, overloadable))convert_uint3(double3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from double4 to uint4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern uint4 __attribute__((const, overloadable))convert_uint4(double4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from long2 to uint2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern uint2 __attribute__((const, overloadable))convert_uint2(long2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from long3 to uint3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern uint3 __attribute__((const, overloadable))convert_uint3(long3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from long4 to uint4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern uint4 __attribute__((const, overloadable))convert_uint4(long4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ulong2 to uint2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern uint2 __attribute__((const, overloadable))convert_uint2(ulong2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ulong3 to uint3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern uint3 __attribute__((const, overloadable))convert_uint3(ulong3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ulong4 to uint4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern uint4 __attribute__((const, overloadable))convert_uint4(ulong4 v);
+#endif
+
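+/*
+ * Usage sketch (editorial illustration, not part of the generated API):
+ * the long, ulong and double conversions in this file require
+ * RS_VERSION >= 21, so callers typically wrap them in the same guard.
+ * With a hypothetical helper named ticks_to_ms:
+ *
+ *   #if (defined(RS_VERSION) && (RS_VERSION >= 21))
+ *   static double2 ticks_to_ms(long2 ticks_ns) {
+ *       double2 ns = convert_double2(ticks_ns);  // widen long -> double
+ *       return ns / 1.0e6;                       // nanoseconds to milliseconds
+ *   }
+ *   #endif
+ */
+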
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from float2 to double2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern double2 __attribute__((const, overloadable))convert_double2(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from float3 to double3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern double3 __attribute__((const, overloadable))convert_double3(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from float4 to double4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern double4 __attribute__((const, overloadable))convert_double4(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from char2 to double2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern double2 __attribute__((const, overloadable))convert_double2(char2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from char3 to double3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern double3 __attribute__((const, overloadable))convert_double3(char3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from char4 to double4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern double4 __attribute__((const, overloadable))convert_double4(char4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from uchar2 to double2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern double2 __attribute__((const, overloadable))convert_double2(uchar2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from uchar3 to double3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern double3 __attribute__((const, overloadable))convert_double3(uchar3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from uchar4 to double4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern double4 __attribute__((const, overloadable))convert_double4(uchar4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from short2 to double2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern double2 __attribute__((const, overloadable))convert_double2(short2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from short3 to double3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern double3 __attribute__((const, overloadable))convert_double3(short3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from short4 to double4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern double4 __attribute__((const, overloadable))convert_double4(short4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ushort2 to double2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern double2 __attribute__((const, overloadable))convert_double2(ushort2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ushort3 to double3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern double3 __attribute__((const, overloadable))convert_double3(ushort3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ushort4 to double4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern double4 __attribute__((const, overloadable))convert_double4(ushort4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from int2 to double2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern double2 __attribute__((const, overloadable))convert_double2(int2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from int3 to double3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern double3 __attribute__((const, overloadable))convert_double3(int3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from int4 to double4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern double4 __attribute__((const, overloadable))convert_double4(int4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from uint2 to double2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern double2 __attribute__((const, overloadable))convert_double2(uint2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from uint3 to double3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern double3 __attribute__((const, overloadable))convert_double3(uint3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from uint4 to double4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern double4 __attribute__((const, overloadable))convert_double4(uint4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from float2 to long2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long2 __attribute__((const, overloadable))convert_long2(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from float3 to long3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long3 __attribute__((const, overloadable))convert_long3(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from float4 to long4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long4 __attribute__((const, overloadable))convert_long4(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from char2 to long2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long2 __attribute__((const, overloadable))convert_long2(char2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from char3 to long3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long3 __attribute__((const, overloadable))convert_long3(char3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from char4 to long4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long4 __attribute__((const, overloadable))convert_long4(char4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from uchar2 to long2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long2 __attribute__((const, overloadable))convert_long2(uchar2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from uchar3 to long3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long3 __attribute__((const, overloadable))convert_long3(uchar3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from uchar4 to long4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long4 __attribute__((const, overloadable))convert_long4(uchar4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from short2 to long2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long2 __attribute__((const, overloadable))convert_long2(short2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from short3 to long3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long3 __attribute__((const, overloadable))convert_long3(short3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from short4 to long4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long4 __attribute__((const, overloadable))convert_long4(short4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ushort2 to long2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long2 __attribute__((const, overloadable))convert_long2(ushort2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ushort3 to long3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long3 __attribute__((const, overloadable))convert_long3(ushort3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ushort4 to long4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long4 __attribute__((const, overloadable))convert_long4(ushort4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from int2 to long2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long2 __attribute__((const, overloadable))convert_long2(int2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from int3 to long3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long3 __attribute__((const, overloadable))convert_long3(int3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from int4 to long4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long4 __attribute__((const, overloadable))convert_long4(int4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from uint2 to long2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long2 __attribute__((const, overloadable))convert_long2(uint2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from uint3 to long3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long3 __attribute__((const, overloadable))convert_long3(uint3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from uint4 to long4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long4 __attribute__((const, overloadable))convert_long4(uint4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from float2 to ulong2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong2 __attribute__((const, overloadable))convert_ulong2(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from float3 to ulong3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong3 __attribute__((const, overloadable))convert_ulong3(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from float4 to ulong4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong4 __attribute__((const, overloadable))convert_ulong4(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from char2 to ulong2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong2 __attribute__((const, overloadable))convert_ulong2(char2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from char3 to ulong3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong3 __attribute__((const, overloadable))convert_ulong3(char3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from char4 to ulong4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong4 __attribute__((const, overloadable))convert_ulong4(char4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from uchar2 to ulong2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong2 __attribute__((const, overloadable))convert_ulong2(uchar2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from uchar3 to ulong3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong3 __attribute__((const, overloadable))convert_ulong3(uchar3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from uchar4 to ulong4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong4 __attribute__((const, overloadable))convert_ulong4(uchar4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from short2 to ulong2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong2 __attribute__((const, overloadable))convert_ulong2(short2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from short3 to ulong3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong3 __attribute__((const, overloadable))convert_ulong3(short3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from short4 to ulong4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong4 __attribute__((const, overloadable))convert_ulong4(short4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ushort2 to ulong2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong2 __attribute__((const, overloadable))convert_ulong2(ushort2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ushort3 to ulong3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong3 __attribute__((const, overloadable))convert_ulong3(ushort3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from ushort4 to ulong4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong4 __attribute__((const, overloadable))convert_ulong4(ushort4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from int2 to ulong2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong2 __attribute__((const, overloadable))convert_ulong2(int2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from int3 to ulong3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong3 __attribute__((const, overloadable))convert_ulong3(int3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from int4 to ulong4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong4 __attribute__((const, overloadable))convert_ulong4(int4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from uint2 to ulong2
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong2 __attribute__((const, overloadable))convert_ulong2(uint2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from uint3 to ulong3
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong3 __attribute__((const, overloadable))convert_ulong3(uint3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Component wise conversion from uint4 to ulong4
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong4 __attribute__((const, overloadable))convert_ulong4(uint4 v);
+#endif
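+
+/*
+ * Editorial note: a minimal, assumed usage sketch for the widening convert_*
+ * overloads declared above.  The kernel name "widen" is illustrative and not
+ * part of this header.
+ *
+ *   // Each uchar lane is converted to the corresponding long lane.
+ *   long4 __attribute__((kernel)) widen(uchar4 in) {
+ *       return convert_long4(in);
+ *   }
+ */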
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Copy the sign bit from y to x.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))copysign(float x, float y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Copy the sign bit from y to x.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))copysign(float2 x, float2 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Copy the sign bit from y to x.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))copysign(float3 x, float3 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Copy the sign bit from y to x.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))copysign(float4 x, float4 y);
+#endif
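+
+/*
+ * Editorial note: an assumed example of copysign; the constants are
+ * illustrative only.
+ *
+ *   float m = copysign(3.0f, -0.5f);   // -3.0f: magnitude of x, sign of y
+ *   float p = copysign(-2.0f, 1.0f);   //  2.0f
+ */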
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the cosine.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))cos(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the cosine.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))cos(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the cosine.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))cos(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the cosine.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))cos(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the hyperbolic cosine.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))cosh(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the hyperbolic cosine.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))cosh(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the hyperbolic cosine.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))cosh(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the hyperbolic cosine.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))cosh(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the cosine of the value * PI.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))cospi(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the cosine of the value * PI.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))cospi(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the cosine of the value * PI.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))cospi(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the cosine of the value * PI.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))cospi(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Compute the cross product of two vectors.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))cross(float3 lhs, float3 rhs);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Compute the cross product of two vectors.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))cross(float4 lhs, float4 rhs);
+#endif
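+
+/*
+ * Editorial note: an assumed sketch of cross() on the unit axes; the vectors
+ * are illustrative only.
+ *
+ *   float3 x = {1.f, 0.f, 0.f};
+ *   float3 y = {0.f, 1.f, 0.f};
+ *   float3 z = cross(x, y);            // {0.f, 0.f, 1.f}
+ */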
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Convert from radians to degrees.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))degrees(float value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Convert from radians to degrees.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))degrees(float2 value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Convert from radians to degrees.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))degrees(float3 value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Convert from radians to degrees.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))degrees(float4 value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Compute the distance between two points.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))distance(float lhs, float rhs);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Compute the distance between two points.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))distance(float2 lhs, float2 rhs);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Compute the distance between two points.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))distance(float3 lhs, float3 rhs);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Compute the distance between two points.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))distance(float4 lhs, float4 rhs);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Compute the dot product of two vectors.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))dot(float lhs, float rhs);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Compute the dot product of two vectors.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))dot(float2 lhs, float2 rhs);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Compute the dot product of two vectors.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))dot(float3 lhs, float3 rhs);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Compute the dot product of two vectors.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))dot(float4 lhs, float4 rhs);
+#endif
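+
+/*
+ * Editorial note: distance, dot and length (length is declared further down
+ * in this header) are closely related; this assumed sketch shows the usual
+ * identities with illustrative vectors.
+ *
+ *   float3 a = {1.f, 2.f, 2.f};
+ *   float3 o = {0.f, 0.f, 0.f};
+ *   float d = distance(a, o);          // 3.f
+ *   float s = dot(a, a);               // 9.f, so length(a) == sqrt(dot(a, a))
+ */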
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the error function.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))erf(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the error function.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))erf(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the error function.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))erf(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the error function.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))erf(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the complementary error function.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))erfc(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the complementary error function.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))erfc(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the complementary error function.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))erfc(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the complementary error function.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))erfc(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return e ^ value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))exp(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return e ^ value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))exp(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return e ^ value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))exp(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return e ^ value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))exp(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return 10 ^ value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))exp10(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return 10 ^ value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))exp10(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return 10 ^ value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))exp10(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return 10 ^ value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))exp10(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return 2 ^ value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))exp2(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return 2 ^ value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))exp2(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return 2 ^ value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))exp2(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return 2 ^ value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))exp2(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return (e ^ value) - 1.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))expm1(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return (e ^ value) - 1.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))expm1(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return (e ^ value) - 1.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))expm1(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return (e ^ value) - 1.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))expm1(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the absolute value of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))fabs(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the absolute value of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))fabs(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the absolute value of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))fabs(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the absolute value of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))fabs(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+/*
+ * Compute the approximate distance between two points.
+ *
+ * Supported by API versions 17 and newer.
+ */
+extern float __attribute__((const, overloadable))fast_distance(float lhs, float rhs);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+/*
+ * Compute the approximate distance between two points.
+ *
+ * Supported by API versions 17 and newer.
+ */
+extern float __attribute__((const, overloadable))fast_distance(float2 lhs, float2 rhs);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+/*
+ * Compute the approximate distance between two points.
+ *
+ * Supported by API versions 17 and newer.
+ */
+extern float __attribute__((const, overloadable))fast_distance(float3 lhs, float3 rhs);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+/*
+ * Compute the approximate distance between two points.
+ *
+ * Supported by API versions 17 and newer.
+ */
+extern float __attribute__((const, overloadable))fast_distance(float4 lhs, float4 rhs);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+/*
+ * Compute the approximate length of a vector.
+ *
+ * Supported by API versions 17 and newer.
+ */
+extern float __attribute__((const, overloadable))fast_length(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+/*
+ * Compute the approximate length of a vector.
+ *
+ * Supported by API versions 17 and newer.
+ */
+extern float __attribute__((const, overloadable))fast_length(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+/*
+ * Compute the approximate length of a vector.
+ *
+ * Supported by API versions 17 and newer.
+ */
+extern float __attribute__((const, overloadable))fast_length(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+/*
+ * Compute the approximate length of a vector.
+ *
+ * Supported by API versions 17 and newer.
+ */
+extern float __attribute__((const, overloadable))fast_length(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+/*
+ * Approximately normalize a vector.
+ *
+ * Supported by API versions 17 and newer.
+ */
+extern float __attribute__((const, overloadable))fast_normalize(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+/*
+ * Approximately normalize a vector.
+ *
+ * Supported by API versions 17 and newer.
+ */
+extern float2 __attribute__((const, overloadable))fast_normalize(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+/*
+ * Approximately normalize a vector.
+ *
+ * Supported by API versions 17 and newer.
+ */
+extern float3 __attribute__((const, overloadable))fast_normalize(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+/*
+ * Approximately normalize a vector.
+ *
+ * Supported by API versions 17 and newer.
+ */
+extern float4 __attribute__((const, overloadable))fast_normalize(float4 v);
+#endif
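+
+/*
+ * Editorial note: the fast_* variants above are documented as approximate;
+ * this assumed sketch contrasts fast_length with the exact length() declared
+ * further down in this header.
+ *
+ *   float3 v = {3.f, 4.f, 0.f};
+ *   float exact  = length(v);          // 5.f
+ *   float approx = fast_length(v);     // near 5.f, with reduced precision
+ */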
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the positive difference between two values.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))fdim(float a, float b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the positive difference between two values.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))fdim(float2 a, float2 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the positive difference between two values.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))fdim(float3 a, float3 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the positive difference between two values.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))fdim(float4 a, float4 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the largest integer not greater than a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))floor(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the largest integer not greater than a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))floor(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the largest integer not greater than a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))floor(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the largest integer not greater than a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))floor(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return (a * b) + c.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))fma(float a, float b, float c);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return (a * b) + c.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))fma(float2 a, float2 b, float2 c);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return (a * b) + c.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))fma(float3 a, float3 b, float3 c);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return (a * b) + c.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))fma(float4 a, float4 b, float4 c);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return (x < y ? y : x)
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))fmax(float x, float y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return (x < y ? y : x)
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))fmax(float2 x, float2 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return (x < y ? y : x)
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))fmax(float3 x, float3 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return (x < y ? y : x)
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))fmax(float4 x, float4 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return (x < y ? y : x)
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))fmax(float2 x, float y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return (x < y ? y : x)
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))fmax(float3 x, float y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return (x < y ? y : x)
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))fmax(float4 x, float y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return (x > y ? y : x)
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))fmin(float x, float y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return (x > y ? y : x)
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))fmin(float2 x, float2 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return (x > y ? y : x)
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))fmin(float3 x, float3 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return (x > y ? y : x)
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))fmin(float4 x, float4 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return (x > y ? y : x)
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))fmin(float2 x, float y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return (x > y ? y : x)
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))fmin(float3 x, float y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return (x > y ? y : x)
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))fmin(float4 x, float y);
+#endif
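+
+/*
+ * Editorial note: fmax and fmin also accept a scalar second operand, which is
+ * applied to every component of the vector.  An assumed sketch clamping a
+ * vector to [0, 1]:
+ *
+ *   float4 v = {-1.f, 0.5f, 2.f, 3.f};
+ *   float4 clamped = fmin(fmax(v, 0.f), 1.f);   // {0.f, 0.5f, 1.f, 1.f}
+ */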
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the remainder from x / y
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))fmod(float x, float y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the remainder from x / y
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))fmod(float2 x, float2 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the remainder from x / y
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))fmod(float3 x, float3 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the remainder from x / y
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))fmod(float4 x, float4 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return fractional part of v
+ *
+ * @param floor  floor[0] will be set to the floor of the input value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((overloadable))fract(float v, float* floor);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return fractional part of v
+ *
+ * @param floor  floor[0] will be set to the floor of the input value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((overloadable))fract(float2 v, float2* floor);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return fractional part of v
+ *
+ * @param floor  floor[0] will be set to the floor of the input value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((overloadable))fract(float3 v, float3* floor);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return fractional part of v
+ *
+ * @param floor  floor[0] will be set to the floor of the input value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((overloadable))fract(float4 v, float4* floor);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return fractional part of v
+ *
+ * Supported by API versions 9 and newer.
+ */
+static float __attribute__((const, overloadable))fract(float v) {
+ float unused;
+ return fract(v, &unused);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return fractional part of v
+ *
+ * Supported by API versions 9 and newer.
+ */
+static float2 __attribute__((const, overloadable))fract(float2 v) {
+ float2 unused;
+ return fract(v, &unused);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return fractional part of v
+ *
+ * Supported by API versions 9 and newer.
+ */
+static float3 __attribute__((const, overloadable))fract(float3 v) {
+ float3 unused;
+ return fract(v, &unused);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return fractional part of v
+ *
+ * Supported by API versions 9 and newer.
+ */
+static float4 __attribute__((const, overloadable))fract(float4 v) {
+ float4 unused;
+ return fract(v, &unused);
+}
+#endif
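+
+/*
+ * Editorial note: fract comes in two forms, a two-argument overload that also
+ * returns the floor through a pointer, and the one-argument static wrappers
+ * defined just above that discard it.  An assumed sketch:
+ *
+ *   float whole;
+ *   float f1 = fract(2.75f, &whole);   // f1 == 0.75f, whole == 2.f
+ *   float f2 = fract(2.75f);           // 0.75f, floor discarded
+ */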
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the mantissa and place the exponent into iptr[0]
+ *
+ * @param v Supports float, float2, float3, float4.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((overloadable))frexp(float v, int* iptr);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the mantissa and place the exponent into iptr[0]
+ *
+ * @param v Supports float, float2, float3, float4.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((overloadable))frexp(float2 v, int2* iptr);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the mantissa and place the exponent into iptr[0]
+ *
+ * @param v Supports float, float2, float3, float4.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((overloadable))frexp(float3 v, int3* iptr);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the mantissa and place the exponent into iptr[0]
+ *
+ * @param v Supports float, float2, float3, float4.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((overloadable))frexp(float4 v, int4* iptr);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+/*
+ * Return the approximate reciprocal of a value.
+ *
+ * Supported by API versions 17 and newer.
+ */
+extern float __attribute__((const, overloadable))half_recip(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+/*
+ * Return the approximate reciprocal of a value.
+ *
+ * Supported by API versions 17 and newer.
+ */
+extern float2 __attribute__((const, overloadable))half_recip(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+/*
+ * Return the approximate reciprocal of a value.
+ *
+ * Supported by API versions 17 and newer.
+ */
+extern float3 __attribute__((const, overloadable))half_recip(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+/*
+ * Return the approximate reciprocal of a value.
+ *
+ * Supported by API versions 17 and newer.
+ */
+extern float4 __attribute__((const, overloadable))half_recip(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+/*
+ * Return the approximate value of (1.f / sqrt(value)).
+ *
+ * Supported by API versions 17 and newer.
+ */
+extern float __attribute__((const, overloadable))half_rsqrt(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+/*
+ * Return the approximate value of (1.f / sqrt(value)).
+ *
+ * Supported by API versions 17 and newer.
+ */
+extern float2 __attribute__((const, overloadable))half_rsqrt(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+/*
+ * Return the approximate value of (1.f / sqrt(value)).
+ *
+ * Supported by API versions 17 and newer.
+ */
+extern float3 __attribute__((const, overloadable))half_rsqrt(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+/*
+ * Return the approximate value of (1.f / sqrt(value)).
+ *
+ * Supported by API versions 17 and newer.
+ */
+extern float4 __attribute__((const, overloadable))half_rsqrt(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+/*
+ * Return the approximate square root of a value.
+ *
+ * Supported by API versions 17 and newer.
+ */
+extern float __attribute__((const, overloadable))half_sqrt(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+/*
+ * Return the approximate square root of a value.
+ *
+ * Supported by API versions 17 and newer.
+ */
+extern float2 __attribute__((const, overloadable))half_sqrt(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+/*
+ * Return the approximate square root of a value.
+ *
+ * Supported by API versions 17 and newer.
+ */
+extern float3 __attribute__((const, overloadable))half_sqrt(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+/*
+ * Return the approximate square root of a value.
+ *
+ * Supported by API versions 17 and newer.
+ */
+extern float4 __attribute__((const, overloadable))half_sqrt(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return sqrt(x*x + y*y)
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))hypot(float x, float y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return sqrt(x*x + y*y)
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))hypot(float2 x, float2 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return sqrt(x*x + y*y)
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))hypot(float3 x, float3 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return sqrt(x*x + y*y)
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))hypot(float4 x, float4 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the integer exponent of a value
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern int __attribute__((const, overloadable))ilogb(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the integer exponent of a value
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern int2 __attribute__((const, overloadable))ilogb(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the integer exponent of a value
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern int3 __attribute__((const, overloadable))ilogb(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the integer exponent of a value
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern int4 __attribute__((const, overloadable))ilogb(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return (x * 2^y)
+ *
+ * @param x Supports 1,2,3,4 components
+ * @param y Supports single component or matching vector.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))ldexp(float x, int y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return (x * 2^y)
+ *
+ * @param x Supports 1,2,3,4 components
+ * @param y Supports single component or matching vector.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))ldexp(float2 x, int2 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return (x * 2^y)
+ *
+ * @param x Supports 1,2,3,4 components
+ * @param y Supports single component or matching vector.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))ldexp(float3 x, int3 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return (x * 2^y)
+ *
+ * @param x Supports 1,2,3,4 components
+ * @param y Supports single component or matching vector.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))ldexp(float4 x, int4 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return (x * 2^y)
+ *
+ * @param x Supports 1,2,3,4 components
+ * @param y Supports single component or matching vector.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))ldexp(float2 x, int y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return (x * 2^y)
+ *
+ * @param x Supports 1,2,3,4 components
+ * @param y Supports single component or matching vector.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))ldexp(float3 x, int y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return (x * 2^y)
+ *
+ * @param x Supports 1,2,3,4 components
+ * @param y Supports single component or matching vector.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))ldexp(float4 x, int y);
+#endif
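+
+/*
+ * Editorial note: frexp and ldexp (both declared in this header) act as
+ * inverses; an assumed round-trip sketch with illustrative values:
+ *
+ *   int e;
+ *   float m = frexp(12.f, &e);         // m == 0.75f, e == 4 (12 == 0.75 * 2^4)
+ *   float x = ldexp(m, e);             // 12.f
+ */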
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Compute the length of a vector.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))length(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Compute the length of a vector.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))length(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Compute the length of a vector.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))length(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Compute the length of a vector.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))length(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the log gamma
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))lgamma(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the log gamma
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))lgamma(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the log gamma
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))lgamma(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the log gamma
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))lgamma(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the log gamma and sign
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((overloadable))lgamma(float x, int* y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the log gamma and sign
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((overloadable))lgamma(float2 x, int2* y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the log gamma and sign
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((overloadable))lgamma(float3 x, int3* y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the log gamma and sign
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((overloadable))lgamma(float4 x, int4* y);
+#endif
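+
+/*
+ * Editorial note: the two-argument lgamma overloads above also report the
+ * sign of gamma(x) through the int* parameter; an assumed sketch:
+ *
+ *   int sign;
+ *   float lg = lgamma(-0.5f, &sign);   // lg == log(|gamma(-0.5f)|), sign == -1
+ */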
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the natural logarithm.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))log(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the natural logarithm.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))log(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the natural logarithm.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))log(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the natural logarithm.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))log(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the base 10 logarithm.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))log10(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the base 10 logarithm.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))log10(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the base 10 logarithm.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))log10(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the base 10 logarithm.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))log10(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the natural logarithm of (v + 1.0f)
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))log1p(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the natural logarithm of (v + 1.0f)
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))log1p(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the natural logarithm of (v + 1.0f)
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))log1p(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the natural logarithm of (v + 1.0f)
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))log1p(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the base 2 logarithm.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))log2(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the base 2 logarithm.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))log2(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the base 2 logarithm.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))log2(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the base 2 logarithm.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))log2(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Compute the exponent of the value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))logb(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Compute the exponent of the value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))logb(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Compute the exponent of the value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))logb(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Compute the exponent of the value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))logb(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Compute (a * b) + c
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))mad(float a, float b, float c);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Compute (a * b) + c
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))mad(float2 a, float2 b, float2 c);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Compute (a * b) + c
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))mad(float3 a, float3 b, float3 c);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Compute (a * b) + c
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))mad(float4 a, float4 b, float4 c);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))max(float, float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))max(float2, float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))max(float3, float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))max(float4, float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static char __attribute__((const, overloadable))max(char v1, char v2) {
+ return (v1 > v2 ? v1 : v2);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static uchar __attribute__((const, overloadable))max(uchar v1, uchar v2) {
+ return (v1 > v2 ? v1 : v2);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static short __attribute__((const, overloadable))max(short v1, short v2) {
+ return (v1 > v2 ? v1 : v2);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static ushort __attribute__((const, overloadable))max(ushort v1, ushort v2) {
+ return (v1 > v2 ? v1 : v2);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static int __attribute__((const, overloadable))max(int v1, int v2) {
+ return (v1 > v2 ? v1 : v2);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static uint __attribute__((const, overloadable))max(uint v1, uint v2) {
+ return (v1 > v2 ? v1 : v2);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static char2 __attribute__((const, overloadable))max(char2 v1, char2 v2) {
+ char2 tmp;
+ tmp.x = (v1.x > v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y > v2.y ? v1.y : v2.y);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static uchar2 __attribute__((const, overloadable))max(uchar2 v1, uchar2 v2) {
+ uchar2 tmp;
+ tmp.x = (v1.x > v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y > v2.y ? v1.y : v2.y);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static short2 __attribute__((const, overloadable))max(short2 v1, short2 v2) {
+ short2 tmp;
+ tmp.x = (v1.x > v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y > v2.y ? v1.y : v2.y);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static ushort2 __attribute__((const, overloadable))max(ushort2 v1, ushort2 v2) {
+ ushort2 tmp;
+ tmp.x = (v1.x > v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y > v2.y ? v1.y : v2.y);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static int2 __attribute__((const, overloadable))max(int2 v1, int2 v2) {
+ int2 tmp;
+ tmp.x = (v1.x > v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y > v2.y ? v1.y : v2.y);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static uint2 __attribute__((const, overloadable))max(uint2 v1, uint2 v2) {
+ uint2 tmp;
+ tmp.x = (v1.x > v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y > v2.y ? v1.y : v2.y);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static char3 __attribute__((const, overloadable))max(char3 v1, char3 v2) {
+ char3 tmp;
+ tmp.x = (v1.x > v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y > v2.y ? v1.y : v2.y);
+ tmp.z = (v1.z > v2.z ? v1.z : v2.z);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static uchar3 __attribute__((const, overloadable))max(uchar3 v1, uchar3 v2) {
+ uchar3 tmp;
+ tmp.x = (v1.x > v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y > v2.y ? v1.y : v2.y);
+ tmp.z = (v1.z > v2.z ? v1.z : v2.z);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static short3 __attribute__((const, overloadable))max(short3 v1, short3 v2) {
+ short3 tmp;
+ tmp.x = (v1.x > v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y > v2.y ? v1.y : v2.y);
+ tmp.z = (v1.z > v2.z ? v1.z : v2.z);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static ushort3 __attribute__((const, overloadable))max(ushort3 v1, ushort3 v2) {
+ ushort3 tmp;
+ tmp.x = (v1.x > v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y > v2.y ? v1.y : v2.y);
+ tmp.z = (v1.z > v2.z ? v1.z : v2.z);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static int3 __attribute__((const, overloadable))max(int3 v1, int3 v2) {
+ int3 tmp;
+ tmp.x = (v1.x > v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y > v2.y ? v1.y : v2.y);
+ tmp.z = (v1.z > v2.z ? v1.z : v2.z);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static uint3 __attribute__((const, overloadable))max(uint3 v1, uint3 v2) {
+ uint3 tmp;
+ tmp.x = (v1.x > v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y > v2.y ? v1.y : v2.y);
+ tmp.z = (v1.z > v2.z ? v1.z : v2.z);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static char4 __attribute__((const, overloadable))max(char4 v1, char4 v2) {
+ char4 tmp;
+ tmp.x = (v1.x > v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y > v2.y ? v1.y : v2.y);
+ tmp.z = (v1.z > v2.z ? v1.z : v2.z);
+ tmp.w = (v1.w > v2.w ? v1.w : v2.w);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static uchar4 __attribute__((const, overloadable))max(uchar4 v1, uchar4 v2) {
+ uchar4 tmp;
+ tmp.x = (v1.x > v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y > v2.y ? v1.y : v2.y);
+ tmp.z = (v1.z > v2.z ? v1.z : v2.z);
+ tmp.w = (v1.w > v2.w ? v1.w : v2.w);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static short4 __attribute__((const, overloadable))max(short4 v1, short4 v2) {
+ short4 tmp;
+ tmp.x = (v1.x > v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y > v2.y ? v1.y : v2.y);
+ tmp.z = (v1.z > v2.z ? v1.z : v2.z);
+ tmp.w = (v1.w > v2.w ? v1.w : v2.w);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static ushort4 __attribute__((const, overloadable))max(ushort4 v1, ushort4 v2) {
+ ushort4 tmp;
+ tmp.x = (v1.x > v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y > v2.y ? v1.y : v2.y);
+ tmp.z = (v1.z > v2.z ? v1.z : v2.z);
+ tmp.w = (v1.w > v2.w ? v1.w : v2.w);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static int4 __attribute__((const, overloadable))max(int4 v1, int4 v2) {
+ int4 tmp;
+ tmp.x = (v1.x > v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y > v2.y ? v1.y : v2.y);
+ tmp.z = (v1.z > v2.z ? v1.z : v2.z);
+ tmp.w = (v1.w > v2.w ? v1.w : v2.w);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 9 - 19.
+ */
+static uint4 __attribute__((const, overloadable))max(uint4 v1, uint4 v2) {
+ uint4 tmp;
+ tmp.x = (v1.x > v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y > v2.y ? v1.y : v2.y);
+ tmp.z = (v1.z > v2.z ? v1.z : v2.z);
+ tmp.w = (v1.w > v2.w ? v1.w : v2.w);
+ return tmp;
+}
+#endif
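+
+/*
+ * Editorial note: the integer max overloads appear twice in this header: as
+ * the static inline definitions above when targeting API levels 9 - 19, and
+ * as extern library functions below from API 21 on.  Script code calls max()
+ * the same way in either case; RS_VERSION selects which definition applies.
+ * An assumed sketch:
+ *
+ *   int2 a = {3, -7};
+ *   int2 b = {1,  5};
+ *   int2 c = max(a, b);                // {3, 5}
+ */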
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern char __attribute__((const, overloadable))max(char v1, char v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern char2 __attribute__((const, overloadable))max(char2 v1, char2 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern char3 __attribute__((const, overloadable))max(char3 v1, char3 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern char4 __attribute__((const, overloadable))max(char4 v1, char4 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern uchar __attribute__((const, overloadable))max(uchar v1, uchar v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern uchar2 __attribute__((const, overloadable))max(uchar2 v1, uchar2 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern uchar3 __attribute__((const, overloadable))max(uchar3 v1, uchar3 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern uchar4 __attribute__((const, overloadable))max(uchar4 v1, uchar4 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern short __attribute__((const, overloadable))max(short v1, short v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern short2 __attribute__((const, overloadable))max(short2 v1, short2 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern short3 __attribute__((const, overloadable))max(short3 v1, short3 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern short4 __attribute__((const, overloadable))max(short4 v1, short4 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ushort __attribute__((const, overloadable))max(ushort v1, ushort v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ushort2 __attribute__((const, overloadable))max(ushort2 v1, ushort2 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ushort3 __attribute__((const, overloadable))max(ushort3 v1, ushort3 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ushort4 __attribute__((const, overloadable))max(ushort4 v1, ushort4 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern int __attribute__((const, overloadable))max(int v1, int v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern int2 __attribute__((const, overloadable))max(int2 v1, int2 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern int3 __attribute__((const, overloadable))max(int3 v1, int3 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern int4 __attribute__((const, overloadable))max(int4 v1, int4 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern uint __attribute__((const, overloadable))max(uint v1, uint v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern uint2 __attribute__((const, overloadable))max(uint2 v1, uint2 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern uint3 __attribute__((const, overloadable))max(uint3 v1, uint3 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern uint4 __attribute__((const, overloadable))max(uint4 v1, uint4 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long __attribute__((const, overloadable))max(long v1, long v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long2 __attribute__((const, overloadable))max(long2 v1, long2 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long3 __attribute__((const, overloadable))max(long3 v1, long3 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long4 __attribute__((const, overloadable))max(long4 v1, long4 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong __attribute__((const, overloadable))max(ulong v1, ulong v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong2 __attribute__((const, overloadable))max(ulong2 v1, ulong2 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong3 __attribute__((const, overloadable))max(ulong3 v1, ulong3 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the maximum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong4 __attribute__((const, overloadable))max(ulong4 v1, ulong4 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))min(float, float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))min(float2, float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))min(float3, float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))min(float4, float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 9 - 19
+ */
+static char __attribute__((const, overloadable))min(char v1, char v2) {
+ return (v1 < v2 ? v1 : v2);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 9 - 19
+ */
+static uchar __attribute__((const, overloadable))min(uchar v1, uchar v2) {
+ return (v1 < v2 ? v1 : v2);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 9 - 19
+ */
+static short __attribute__((const, overloadable))min(short v1, short v2) {
+ return (v1 < v2 ? v1 : v2);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 9 - 19
+ */
+static ushort __attribute__((const, overloadable))min(ushort v1, ushort v2) {
+ return (v1 < v2 ? v1 : v2);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 9 - 19
+ */
+static int __attribute__((const, overloadable))min(int v1, int v2) {
+ return (v1 < v2 ? v1 : v2);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 9 - 19
+ */
+static uint __attribute__((const, overloadable))min(uint v1, uint v2) {
+ return (v1 < v2 ? v1 : v2);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 9 - 19
+ */
+static char2 __attribute__((const, overloadable))min(char2 v1, char2 v2) {
+ char2 tmp;
+ tmp.x = (v1.x < v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y < v2.y ? v1.y : v2.y);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 9 - 19
+ */
+static uchar2 __attribute__((const, overloadable))min(uchar2 v1, uchar2 v2) {
+ uchar2 tmp;
+ tmp.x = (v1.x < v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y < v2.y ? v1.y : v2.y);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 9 - 19
+ */
+static short2 __attribute__((const, overloadable))min(short2 v1, short2 v2) {
+ short2 tmp;
+ tmp.x = (v1.x < v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y < v2.y ? v1.y : v2.y);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 9 - 19
+ */
+static ushort2 __attribute__((const, overloadable))min(ushort2 v1, ushort2 v2) {
+ ushort2 tmp;
+ tmp.x = (v1.x < v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y < v2.y ? v1.y : v2.y);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 9 - 19
+ */
+static int2 __attribute__((const, overloadable))min(int2 v1, int2 v2) {
+ int2 tmp;
+ tmp.x = (v1.x < v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y < v2.y ? v1.y : v2.y);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 9 - 19
+ */
+static uint2 __attribute__((const, overloadable))min(uint2 v1, uint2 v2) {
+ uint2 tmp;
+ tmp.x = (v1.x < v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y < v2.y ? v1.y : v2.y);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 9 - 19
+ */
+static char3 __attribute__((const, overloadable))min(char3 v1, char3 v2) {
+ char3 tmp;
+ tmp.x = (v1.x < v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y < v2.y ? v1.y : v2.y);
+ tmp.z = (v1.z < v2.z ? v1.z : v2.z);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 9 - 19
+ */
+static uchar3 __attribute__((const, overloadable))min(uchar3 v1, uchar3 v2) {
+ uchar3 tmp;
+ tmp.x = (v1.x < v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y < v2.y ? v1.y : v2.y);
+ tmp.z = (v1.z < v2.z ? v1.z : v2.z);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 9 - 19
+ */
+static short3 __attribute__((const, overloadable))min(short3 v1, short3 v2) {
+ short3 tmp;
+ tmp.x = (v1.x < v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y < v2.y ? v1.y : v2.y);
+ tmp.z = (v1.z < v2.z ? v1.z : v2.z);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 9 - 19
+ */
+static ushort3 __attribute__((const, overloadable))min(ushort3 v1, ushort3 v2) {
+ ushort3 tmp;
+ tmp.x = (v1.x < v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y < v2.y ? v1.y : v2.y);
+ tmp.z = (v1.z < v2.z ? v1.z : v2.z);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 9 - 19
+ */
+static int3 __attribute__((const, overloadable))min(int3 v1, int3 v2) {
+ int3 tmp;
+ tmp.x = (v1.x < v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y < v2.y ? v1.y : v2.y);
+ tmp.z = (v1.z < v2.z ? v1.z : v2.z);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 9 - 19
+ */
+static uint3 __attribute__((const, overloadable))min(uint3 v1, uint3 v2) {
+ uint3 tmp;
+ tmp.x = (v1.x < v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y < v2.y ? v1.y : v2.y);
+ tmp.z = (v1.z < v2.z ? v1.z : v2.z);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 9 - 19
+ */
+static char4 __attribute__((const, overloadable))min(char4 v1, char4 v2) {
+ char4 tmp;
+ tmp.x = (v1.x < v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y < v2.y ? v1.y : v2.y);
+ tmp.z = (v1.z < v2.z ? v1.z : v2.z);
+ tmp.w = (v1.w < v2.w ? v1.w : v2.w);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 9 - 19
+ */
+static uchar4 __attribute__((const, overloadable))min(uchar4 v1, uchar4 v2) {
+ uchar4 tmp;
+ tmp.x = (v1.x < v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y < v2.y ? v1.y : v2.y);
+ tmp.z = (v1.z < v2.z ? v1.z : v2.z);
+ tmp.w = (v1.w < v2.w ? v1.w : v2.w);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 9 - 19
+ */
+static short4 __attribute__((const, overloadable))min(short4 v1, short4 v2) {
+ short4 tmp;
+ tmp.x = (v1.x < v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y < v2.y ? v1.y : v2.y);
+ tmp.z = (v1.z < v2.z ? v1.z : v2.z);
+ tmp.w = (v1.w < v2.w ? v1.w : v2.w);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 9 - 19
+ */
+static ushort4 __attribute__((const, overloadable))min(ushort4 v1, ushort4 v2) {
+ ushort4 tmp;
+ tmp.x = (v1.x < v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y < v2.y ? v1.y : v2.y);
+ tmp.z = (v1.z < v2.z ? v1.z : v2.z);
+ tmp.w = (v1.w < v2.w ? v1.w : v2.w);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 9 - 19
+ */
+static int4 __attribute__((const, overloadable))min(int4 v1, int4 v2) {
+ int4 tmp;
+ tmp.x = (v1.x < v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y < v2.y ? v1.y : v2.y);
+ tmp.z = (v1.z < v2.z ? v1.z : v2.z);
+ tmp.w = (v1.w < v2.w ? v1.w : v2.w);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9) && (RS_VERSION <= 19))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 9 - 19
+ */
+static uint4 __attribute__((const, overloadable))min(uint4 v1, uint4 v2) {
+ uint4 tmp;
+ tmp.x = (v1.x < v2.x ? v1.x : v2.x);
+ tmp.y = (v1.y < v2.y ? v1.y : v2.y);
+ tmp.z = (v1.z < v2.z ? v1.z : v2.z);
+ tmp.w = (v1.w < v2.w ? v1.w : v2.w);
+ return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern char __attribute__((const, overloadable))min(char v1, char v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern char2 __attribute__((const, overloadable))min(char2 v1, char2 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern char3 __attribute__((const, overloadable))min(char3 v1, char3 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern char4 __attribute__((const, overloadable))min(char4 v1, char4 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern uchar __attribute__((const, overloadable))min(uchar v1, uchar v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern uchar2 __attribute__((const, overloadable))min(uchar2 v1, uchar2 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern uchar3 __attribute__((const, overloadable))min(uchar3 v1, uchar3 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern uchar4 __attribute__((const, overloadable))min(uchar4 v1, uchar4 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern short __attribute__((const, overloadable))min(short v1, short v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern short2 __attribute__((const, overloadable))min(short2 v1, short2 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern short3 __attribute__((const, overloadable))min(short3 v1, short3 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern short4 __attribute__((const, overloadable))min(short4 v1, short4 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ushort __attribute__((const, overloadable))min(ushort v1, ushort v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ushort2 __attribute__((const, overloadable))min(ushort2 v1, ushort2 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ushort3 __attribute__((const, overloadable))min(ushort3 v1, ushort3 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ushort4 __attribute__((const, overloadable))min(ushort4 v1, ushort4 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern int __attribute__((const, overloadable))min(int v1, int v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern int2 __attribute__((const, overloadable))min(int2 v1, int2 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern int3 __attribute__((const, overloadable))min(int3 v1, int3 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern int4 __attribute__((const, overloadable))min(int4 v1, int4 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern uint __attribute__((const, overloadable))min(uint v1, uint v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern uint2 __attribute__((const, overloadable))min(uint2 v1, uint2 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern uint3 __attribute__((const, overloadable))min(uint3 v1, uint3 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern uint4 __attribute__((const, overloadable))min(uint4 v1, uint4 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long __attribute__((const, overloadable))min(long v1, long v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long2 __attribute__((const, overloadable))min(long2 v1, long2 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long3 __attribute__((const, overloadable))min(long3 v1, long3 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern long4 __attribute__((const, overloadable))min(long4 v1, long4 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong __attribute__((const, overloadable))min(ulong v1, ulong v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong2 __attribute__((const, overloadable))min(ulong2 v1, ulong2 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong3 __attribute__((const, overloadable))min(ulong3 v1, ulong3 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the minimum value from two arguments
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern ulong4 __attribute__((const, overloadable))min(ulong4 v1, ulong4 v2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * return start + ((stop - start) * amount)
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))mix(float start, float stop, float amount);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * return start + ((stop - start) * amount)
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))mix(float2 start, float2 stop, float2 amount);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * return start + ((stop - start) * amount)
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))mix(float3 start, float3 stop, float3 amount);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * return start + ((stop - start) * amount)
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))mix(float4 start, float4 stop, float4 amount);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * return start + ((stop - start) * amount)
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))mix(float2 start, float2 stop, float amount);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * return start + ((stop - start) * amount)
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))mix(float3 start, float3 stop, float amount);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * return start + ((stop - start) * amount)
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))mix(float4 start, float4 stop, float amount);
+#endif
+
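+/*
+ * A minimal usage sketch for mix(), assuming a hypothetical blend helper:
+ * mix(start, stop, amount) computes start + ((stop - start) * amount), so
+ * amount == 0.f yields start and amount == 1.f yields stop.
+ *
+ *   float4 blend_quarter(float4 from, float4 to) {
+ *     return mix(from, to, 0.25f);  // 25% of the way from 'from' towards 'to'
+ *   }
+ */
+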
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the integral and fractional components of a number.
+ *
+ * @param x Source value
+ * @param iret iret[0] will be set to the integral portion of the number.
+ * @return The fractional portion of the value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((overloadable))modf(float x, float* iret);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the integral and fractional components of a number.
+ *
+ * @param x Source value
+ * @param iret iret[0] will be set to the integral portion of the number.
+ * @return The fractional portion of the value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((overloadable))modf(float2 x, float2* iret);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the integral and fractional components of a number.
+ *
+ * @param x Source value
+ * @param iret iret[0] will be set to the integral portion of the number.
+ * @return The fractional portion of the value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((overloadable))modf(float3 x, float3* iret);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the integral and fractional components of a number.
+ *
+ * @param x Source value
+ * @param iret iret[0] will be set to the integral portion of the number.
+ * @return The fractional portion of the value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((overloadable))modf(float4 x, float4* iret);
+#endif
+
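+/*
+ * A minimal usage sketch for modf(), assuming a hypothetical helper: the
+ * integral portion is written through the pointer argument and the
+ * fractional portion is returned.
+ *
+ *   float fractional_part(float x) {
+ *     float whole;
+ *     return modf(x, &whole);  // e.g. x = 2.75f -> whole = 2.f, returns 0.75f
+ *   }
+ */
+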
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Generate a NaN (Not a Number) value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))nan(uint);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse cosine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_acos(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse cosine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_acos(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse cosine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_acos(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse cosine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_acos(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse hyperbolic cosine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_acosh(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse hyperbolic cosine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_acosh(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse hyperbolic cosine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_acosh(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse hyperbolic cosine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_acosh(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse cosine divided by PI.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_acospi(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse cosine divided by PI.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_acospi(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse cosine divided by PI.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_acospi(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse cosine divided by PI.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_acospi(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse sine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_asin(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse sine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_asin(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse sine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_asin(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse sine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_asin(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse hyperbolic sine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_asinh(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse hyperbolic sine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_asinh(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse hyperbolic sine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_asinh(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse hyperbolic sine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_asinh(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse sine divided by PI.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_asinpi(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse sine divided by PI.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_asinpi(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse sine divided by PI.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_asinpi(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse sine divided by PI.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_asinpi(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse tangent.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_atan(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse tangent.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_atan(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse tangent.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_atan(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse tangent.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_atan(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse tangent of y / x.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_atan2(float y, float x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse tangent of y / x.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_atan2(float2 y, float2 x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse tangent of y / x.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_atan2(float3 y, float3 x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse tangent of y / x.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_atan2(float4 y, float4 x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse tangent of y / x, divided by PI.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_atan2pi(float y, float x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse tangent of y / x, divided by PI.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_atan2pi(float2 y, float2 x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse tangent of y / x, divided by PI.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_atan2pi(float3 y, float3 x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse tangent of y / x, divided by PI.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_atan2pi(float4 y, float4 x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse hyperbolic tangent.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_atanh(float in);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse hyperbolic tangent.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_atanh(float2 in);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse hyperbolic tangent.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_atanh(float3 in);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse hyperbolic tangent.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_atanh(float4 in);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse tangent divided by PI.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_atanpi(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse tangent divided by PI.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_atanpi(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse tangent divided by PI.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_atanpi(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse tangent divided by PI.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_atanpi(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the cube root.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_cbrt(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the cube root.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_cbrt(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the cube root.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_cbrt(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the cube root.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_cbrt(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the cosine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_cos(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the cosine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_cos(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the cosine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_cos(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the cosine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_cos(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the hyperbolic cosine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_cosh(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the hyperbolic cosine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_cosh(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the hyperbolic cosine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_cosh(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the hyperbolic cosine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_cosh(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the cosine of the value * PI.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_cospi(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the cosine of the value * PI.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_cospi(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the cosine of the value * PI.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_cospi(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the cosine of the value * PI.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_cospi(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Compute the approximate distance between two points.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_distance(float lhs, float rhs);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Compute the approximate distance between two points.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_distance(float2 lhs, float2 rhs);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Compute the approximate distance between two points.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_distance(float3 lhs, float3 rhs);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Compute the approximate distance between two points.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_distance(float4 lhs, float4 rhs);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Compute the approximate division result of two values.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_divide(float lhs, float rhs);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Compute the approximate division result of two values.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_divide(float2 lhs, float2 rhs);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Compute the approximate division result of two values.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_divide(float3 lhs, float3 rhs);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Compute the approximate division result of two values.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_divide(float4 lhs, float4 rhs);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+/*
+ * Fast approximate exp
+ * valid for inputs -86.f to 86.f
+ * Max 8192 ulps of error
+ *
+ * Supported by API versions 18 and newer.
+ */
+extern float __attribute__((const, overloadable))native_exp(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+/*
+ * Fast approximate exp
+ * valid for inputs -86.f to 86.f
+ * Max 8192 ulps of error
+ *
+ * Supported by API versions 18 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_exp(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+/*
+ * Fast approximate exp
+ * valid for inputs -86.f to 86.f
+ * Max 8192 ulps of error
+ *
+ * Supported by API versions 18 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_exp(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+/*
+ * Fast approximate exp
+ * valid for inputs -86.f to 86.f
+ * Max 8192 ulps of error
+ *
+ * Supported by API versions 18 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_exp(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+/*
+ * Fast approximate exp10
+ * valid for inputs -37.f to 37.f
+ * Max 8192 ulps of error
+ *
+ * Supported by API versions 18 and newer.
+ */
+extern float __attribute__((const, overloadable))native_exp10(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+/*
+ * Fast approximate exp10
+ * valid for inputs -37.f to 37.f
+ * Max 8192 ulps of error
+ *
+ * Supported by API versions 18 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_exp10(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+/*
+ * Fast approximate exp10
+ * valid for inputs -37.f to 37.f
+ * Max 8192 ulps of error
+ *
+ * Supported by API versions 18 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_exp10(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+/*
+ * Fast approximate exp10
+ * valid for inputs -37.f to 37.f
+ * Max 8192 ulps of error
+ *
+ * Supported by API versions 18 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_exp10(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+/*
+ * Fast approximate exp2
+ * valid for inputs -125.f to 125.f
+ * Max 8192 ulps of error
+ *
+ * Supported by API versions 18 and newer.
+ */
+extern float __attribute__((const, overloadable))native_exp2(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+/*
+ * Fast approximate exp2
+ * valid for inputs -125.f to 125.f
+ * Max 8192 ulps of error
+ *
+ * Supported by API versions 18 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_exp2(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+/*
+ * Fast approximate exp2
+ * valid for inputs -125.f to 125.f
+ * Max 8192 ulps of error
+ *
+ * Supported by API versions 18 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_exp2(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+/*
+ * Fast approximate exp2
+ * valid for inputs -125.f to 125.f
+ * Max 8192 ulps of error
+ *
+ * Supported by API versions 18 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_exp2(float4 v);
+#endif
+
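+/*
+ * A minimal usage sketch for the native_exp* family, assuming a hypothetical
+ * falloff helper: these are fast approximations that are only valid within
+ * the documented input ranges and may carry up to 8192 ulps of error, so
+ * they suit image processing better than numerically sensitive code.
+ *
+ *   float exp_falloff(float distance) {   // distance assumed within [0.f, 86.f]
+ *     return native_exp(-distance);       // cheap exponential decay
+ *   }
+ */
+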
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return (e ^ value) - 1.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_expm1(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return (e ^ value) - 1.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_expm1(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return (e ^ value) - 1.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_expm1(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return (e ^ value) - 1.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_expm1(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return native_sqrt(x*x + y*y)
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_hypot(float x, float y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return native_sqrt(x*x + y*y)
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_hypot(float2 x, float2 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return native_sqrt(x*x + y*y)
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_hypot(float3 x, float3 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return native_sqrt(x*x + y*y)
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_hypot(float4 x, float4 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Compute the approximate length of a vector.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_length(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Compute the approximate length of a vector.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_length(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Compute the approximate length of a vector.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_length(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Compute the approximate length of a vector.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_length(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+/*
+ * Fast approximate log
+ * It is not accurate for values very close to zero.
+ *
+ * Supported by API versions 18 and newer.
+ */
+extern float __attribute__((const, overloadable))native_log(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+/*
+ * Fast approximate log
+ * It is not accurate for values very close to zero.
+ *
+ * Supported by API versions 18 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_log(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+/*
+ * Fast approximate log
+ * It is not accurate for values very close to zero.
+ *
+ * Supported by API versions 18 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_log(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+/*
+ * Fast approximate log
+ * It is not accurate for values very close to zero.
+ *
+ * Supported by API versions 18 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_log(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+/*
+ * Fast approximate log10
+ * It is not accurate for values very close to zero.
+ *
+ * Supported by API versions 18 and newer.
+ */
+extern float __attribute__((const, overloadable))native_log10(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+/*
+ * Fast approximate log10
+ * It is not accurate for values very close to zero.
+ *
+ * Supported by API versions 18 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_log10(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+/*
+ * Fast approximate log10
+ * It is not accurate for values very close to zero.
+ *
+ * Supported by API versions 18 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_log10(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+/*
+ * Fast approximate log10
+ * It is not accurate for values very close to zero.
+ *
+ * Supported by API versions 18 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_log10(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the natural logarithm of (v + 1.0f)
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_log1p(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the natural logarithm of (v + 1.0f)
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_log1p(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the natural logarithm of (v + 1.0f)
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_log1p(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the natural logarithm of (v + 1.0f)
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_log1p(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+/*
+ * Fast approximate log2
+ * It is not accurate for values very close to zero.
+ *
+ * Supported by API versions 18 and newer.
+ */
+extern float __attribute__((const, overloadable))native_log2(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+/*
+ * Fast approximate log2
+ * It is not accurate for values very close to zero.
+ *
+ * Supported by API versions 18 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_log2(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+/*
+ * Fast approximate log2
+ * It is not accurate for values very close to zero.
+ *
+ * Supported by API versions 18 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_log2(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+/*
+ * Fast approximate log2
+ * It is not accurate for values very close to zero.
+ *
+ * Supported by API versions 18 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_log2(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Normalize a vector.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_normalize(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Normalize a vector.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_normalize(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Normalize a vector.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_normalize(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Normalize a vector.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_normalize(float4 v);
+#endif
+
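+/*
+ * A minimal usage sketch for native_normalize(), assuming a hypothetical
+ * diffuse-lighting helper and the dot() function from this same math API:
+ * both native_length() and native_normalize() are fast approximations of
+ * their non-native counterparts.
+ *
+ *   float lambert(float3 normal, float3 to_light) {
+ *     float3 l = native_normalize(to_light);   // approximate unit vector
+ *     return max(dot(normal, l), 0.f);         // clamp back-facing to zero
+ *   }
+ */
+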
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+/*
+ * Fast approximate v ^ y
+ * v must be between 0.f and 256.f
+ * y must be between -15.f and 15.f
+ * It is not accurate for values of v very close to zero.
+ *
+ * Supported by API versions 18 and newer.
+ */
+extern float __attribute__((const, overloadable))native_powr(float v, float y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+/*
+ * Fast approximate v ^ y
+ * v must be between 0.f and 256.f
+ * y must be between -15.f and 15.f
+ * It is not accurate for values of v very close to zero.
+ *
+ * Supported by API versions 18 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_powr(float2 v, float2 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+/*
+ * Fast approximate v ^ y
+ * v must be between 0.f and 256.f
+ * y must be between -15.f and 15.f
+ * It is not accurate for values of v very close to zero.
+ *
+ * Supported by API versions 18 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_powr(float3 v, float3 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+/*
+ * Fast approximate v ^ y
+ * v must be between 0.f and 256.f
+ * y must be between -15.f and 15.f
+ * It is not accurate for values of v very close to zero.
+ *
+ * Supported by API versions 18 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_powr(float4 v, float4 y);
+#endif
+
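+/*
+ * A minimal usage sketch for native_powr(), assuming a hypothetical gamma
+ * helper: the base must lie in 0.f - 256.f and the exponent in -15.f - 15.f,
+ * which gamma correction of normalized color channels satisfies.
+ *
+ *   float4 apply_gamma(float4 rgba) {      // channels assumed in [0.f, 1.f]
+ *     float4 gamma = 1.f / 2.2f;           // splat the exponent to all components
+ *     return native_powr(rgba, gamma);     // per-component power
+ *   }
+ */
+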
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the approximate reciprocal of a value.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_recip(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the approximate reciprocal of a value.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_recip(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the approximate reciprocal of a value.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_recip(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the approximate reciprocal of a value.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_recip(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Compute the Nth root of a value.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_rootn(float v, int n);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Compute the Nth root of a value.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_rootn(float2 v, int2 n);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Compute the Nth root of a value.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_rootn(float3 v, int3 n);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Compute the Nth root of a value.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_rootn(float4 v, int4 n);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return (1 / sqrt(value)).
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_rsqrt(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return (1 / sqrt(value)).
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_rsqrt(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return (1 / sqrt(value)).
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_rsqrt(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return (1 / sqrt(value)).
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_rsqrt(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the sine of a value specified in radians.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_sin(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the sine of a value specified in radians.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_sin(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the sine of a value specified in radians.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_sin(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the sine of a value specified in radians.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_sin(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the sine and cosine of a value.
+ *
+ * @return sine
+ * @param v The incoming value in radians
+ * @param *cosptr cosptr[0] will be set to the cosine value.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((overloadable))native_sincos(float v, float* cosptr);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the sine and cosine of a value.
+ *
+ * @return sine
+ * @param v The incoming value in radians
+ * @param *cosptr cosptr[0] will be set to the cosine value.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((overloadable))native_sincos(float2 v, float2* cosptr);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the sine and cosine of a value.
+ *
+ * @return sine
+ * @param v The incoming value in radians
+ * @param *cosptr cosptr[0] will be set to the cosine value.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((overloadable))native_sincos(float3 v, float3* cosptr);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the sine and cosine of a value.
+ *
+ * @return sine
+ * @param v The incoming value in radians
+ * @param *cosptr cosptr[0] will be set to the cosine value.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((overloadable))native_sincos(float4 v, float4* cosptr);
+#endif
+
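As a usage sketch (helper name and package are assumptions): native_sincos returns the sine and writes the cosine through the pointer argument, which makes a 2-D rotation a single call:

    #pragma version(1)
    #pragma rs java_package_name(com.example.rstest)   // hypothetical package

    // Rotate a point about the origin; needs RS_VERSION >= 21 for native_sincos.
    static float2 rotate2d(float2 p, float angle) {
        float c;
        float s = native_sincos(angle, &c);   // returns sin(angle), stores cos(angle) in c
        float2 r;
        r.x = p.x * c - p.y * s;
        r.y = p.x * s + p.y * c;
        return r;
    }
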
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the hyperbolic sine of a value specified in radians.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_sinh(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the hyperbolic sine of a value specified in radians.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_sinh(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the hyperbolic sine of a value specified in radians.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_sinh(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the hyperbolic sine of a value specified in radians.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_sinh(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the sin(v * PI).
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_sinpi(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the sin(v * PI).
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_sinpi(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the sin(v * PI).
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_sinpi(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the sin(v * PI).
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_sinpi(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the approximate sqrt(v).
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_sqrt(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the approximate sqrt(v).
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_sqrt(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the approximate sqrt(v).
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_sqrt(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the approximate sqrt(v).
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_sqrt(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the tangent of a value.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_tan(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the tangent of a value.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_tan(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the tangent of a value.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_tan(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the tangent of a value.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_tan(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the hyperbolic tangent of a value.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_tanh(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the hyperbolic tangent of a value.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_tanh(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the hyperbolic tangent of a value.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_tanh(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the hyperbolic tangent of a value.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_tanh(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return tan(v * PI)
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_tanpi(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return tan(v * PI)
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_tanpi(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return tan(v * PI)
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_tanpi(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return tan(v * PI)
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_tanpi(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the next floating point number from x towards y.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))nextafter(float x, float y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the next floating point number from x towards y.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))nextafter(float2 x, float2 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the next floating point number from x towards y.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))nextafter(float3 x, float3 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the next floating point number from x towards y.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))nextafter(float4 x, float4 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Normalize a vector.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))normalize(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Normalize a vector.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))normalize(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Normalize a vector.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))normalize(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Normalize a vector.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))normalize(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return x ^ y.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))pow(float x, float y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return x ^ y.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))pow(float2 x, float2 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return x ^ y.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))pow(float3 x, float3 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return x ^ y.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))pow(float4 x, float4 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return x ^ y.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))pown(float x, int y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return x ^ y.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))pown(float2 x, int2 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return x ^ y.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))pown(float3 x, int3 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return x ^ y.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))pown(float4 x, int4 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return x ^ y.
+ * x must be >= 0
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))powr(float x, float y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return x ^ y.
+ * x must be >= 0
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))powr(float2 x, float2 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return x ^ y.
+ * x must be >= 0
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))powr(float3 x, float3 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return x ^ y.
+ * x must be >= 0
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))powr(float4 x, float4 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Convert from degrees to radians.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))radians(float value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Convert from degrees to radians.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))radians(float2 value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Convert from degrees to radians.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))radians(float3 value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Convert from degrees to radians.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))radians(float4 value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Round x/y to the nearest integer, then compute the remainder.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))remainder(float x, float y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Round x/y to the nearest integer, then compute the remainder.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))remainder(float2 x, float2 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Round x/y to the nearest integer, then compute the remainder.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))remainder(float3 x, float3 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Round x/y to the nearest integer, then compute the remainder.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))remainder(float4 x, float4 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the quotient and the remainder of b/c.  Only the sign and lowest three bits of the quotient are guaranteed to be accurate.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((overloadable))remquo(float b, float c, int* d);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the quotient and the remainder of b/c.  Only the sign and lowest three bits of the quotient are guaranteed to be accurate.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((overloadable))remquo(float2 b, float2 c, int2* d);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the quotient and the remainder of b/c.  Only the sign and lowest three bits of the quotient are guaranteed to be accurate.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((overloadable))remquo(float3 b, float3 c, int3* d);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the quotient and the remainder of b/c.  Only the sign and lowest three bits of the quotient are guaranteed to be accurate.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((overloadable))remquo(float4 b, float4 c, int4* d);
+#endif
+
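A sketch of the typical remquo pattern, argument reduction, relying only on the sign and low bits of the quotient as documented (the helper and constant are assumptions):

    #pragma version(1)
    #pragma rs java_package_name(com.example.rstest)   // hypothetical package

    // Reduce an angle modulo pi/2 and report which quadrant it fell in.
    static int quadrantOf(float angle) {
        const float halfPi = 1.57079632679f;
        int q = 0;
        float rem = remquo(angle, halfPi, &q);
        (void)rem;        // the reduced angle is unused in this sketch
        return q & 3;     // only the sign and lowest three bits of q are reliable
    }
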
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Round to the nearest integral value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))rint(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Round to the nearest integral value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))rint(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Round to the nearest integral value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))rint(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Round to the nearest integral value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))rint(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Compute the Nth root of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))rootn(float v, int n);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Compute the Nth root of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))rootn(float2 v, int2 n);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Compute the Nth root of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))rootn(float3 v, int3 n);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Compute the Nth root of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))rootn(float4 v, int4 n);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Round to the nearest integral value.  Half values are rounded away from zero.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))round(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Round to the nearest integral value.  Half values are rounded away from zero.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))round(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Round to the nearest integral value.  Half values are rounded away from zero.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))round(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Round to the nearest integral value.  Half values are rounded away from zero.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))round(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return (1 / sqrt(value)).
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))rsqrt(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return (1 / sqrt(value)).
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))rsqrt(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return (1 / sqrt(value)).
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))rsqrt(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return (1 / sqrt(value)).
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))rsqrt(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the sign of a value.
+ *
+ * if (v < 0) return -1.f;
+ * else if (v > 0) return 1.f;
+ * else return 0.f;
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))sign(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the sign of a value.
+ *
+ * if (v < 0) return -1.f;
+ * else if (v > 0) return 1.f;
+ * else return 0.f;
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))sign(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the sign of a value.
+ *
+ * if (v < 0) return -1.f;
+ * else if (v > 0) return 1.f;
+ * else return 0.f;
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))sign(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the sign of a value.
+ *
+ * if (v < 0) return -1.f;
+ * else if (v > 0) return 1.f;
+ * else return 0.f;
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))sign(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the sine of a value specified in radians.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))sin(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the sine of a value specified in radians.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))sin(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the sine of a value specified in radians.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))sin(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the sine of a value specified in radians.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))sin(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the sine and cosine of a value.
+ *
+ * @return sine
+ * @param v The incoming value in radians
+ * @param *cosptr cosptr[0] will be set to the cosine value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((overloadable))sincos(float v, float* cosptr);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the sine and cosine of a value.
+ *
+ * @return sine
+ * @param v The incoming value in radians
+ * @param *cosptr cosptr[0] will be set to the cosine value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((overloadable))sincos(float2 v, float2* cosptr);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the sine and cosine of a value.
+ *
+ * @return sine
+ * @param v The incoming value in radians
+ * @param *cosptr cosptr[0] will be set to the cosine value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((overloadable))sincos(float3 v, float3* cosptr);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the sine and cosine of a value.
+ *
+ * @return sine
+ * @param v The incoming value in radians
+ * @param *cosptr cosptr[0] will be set to the cosine value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((overloadable))sincos(float4 v, float4* cosptr);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the hyperbolic sine of a value specified in radians.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))sinh(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the hyperbolic sine of a value specified in radians.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))sinh(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the hyperbolic sine of a value specified in radians.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))sinh(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the hyperbolic sine of a value specified in radians.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))sinh(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the sin(v * PI).
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))sinpi(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the sin(v * PI).
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))sinpi(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the sin(v * PI).
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))sinpi(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the sin(v * PI).
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))sinpi(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the square root of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))sqrt(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the square root of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))sqrt(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the square root of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))sqrt(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the square root of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))sqrt(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * if (v < edge)
+ * return 0.f;
+ * else
+ * return 1.f;
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))step(float edge, float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * if (v < edge)
+ * return 0.f;
+ * else
+ * return 1.f;
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))step(float2 edge, float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * if (v < edge)
+ * return 0.f;
+ * else
+ * return 1.f;
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))step(float3 edge, float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * if (v < edge)
+ * return 0.f;
+ * else
+ * return 1.f;
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))step(float4 edge, float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * if (v < edge)
+ * return 0.f;
+ * else
+ * return 1.f;
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))step(float2 edge, float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * if (v < edge)
+ * return 0.f;
+ * else
+ * return 1.f;
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))step(float3 edge, float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * if (v < edge)
+ * return 0.f;
+ * else
+ * return 1.f;
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))step(float4 edge, float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * if (v < edge)
+ * return 0.f;
+ * else
+ * return 1.f;
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))step(float edge, float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * if (v < edge)
+ * return 0.f;
+ * else
+ * return 1.f;
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))step(float edge, float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * if (v < edge)
+ * return 0.f;
+ * else
+ * return 1.f;
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))step(float edge, float4 v);
+#endif
+
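A sketch of the scalar-edge step() overloads added at API 21 (kernel name and package are assumptions); on older targets the same effect needs a splatted float4 edge:

    #pragma version(1)
    #pragma rs java_package_name(com.example.rstest)   // hypothetical package

    // Per-channel threshold: each component becomes 0.f or 1.f.
    float4 __attribute__((kernel)) threshold(float4 colour) {
    #if (defined(RS_VERSION) && (RS_VERSION >= 21))
        return step(0.5f, colour);                 // scalar edge, vector v
    #else
        float4 edge = {0.5f, 0.5f, 0.5f, 0.5f};    // splat the edge for older APIs
        return step(edge, colour);
    #endif
    }
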
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the tangent of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))tan(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the tangent of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))tan(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the tangent of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))tan(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the tangent of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))tan(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the hyperbolic tangent of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))tanh(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the hyperbolic tangent of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))tanh(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the hyperbolic tangent of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))tanh(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return the hyperbolic tangent of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))tanh(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return tan(v * PI)
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))tanpi(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return tan(v * PI)
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))tanpi(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return tan(v * PI)
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))tanpi(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Return tan(v * PI)
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))tanpi(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Compute the gamma function of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))tgamma(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Compute the gamma function of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))tgamma(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Compute the gamma function of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))tgamma(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Compute the gamma function of a value.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))tgamma(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Round to integral using truncation.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float __attribute__((const, overloadable))trunc(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Round to integral using truncation.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float2 __attribute__((const, overloadable))trunc(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Round to integral using truncation.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float3 __attribute__((const, overloadable))trunc(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 9))
+/*
+ * Round to integral using truncation.
+ *
+ * Supported by API versions 9 and newer.
+ */
+extern float4 __attribute__((const, overloadable))trunc(float4);
+#endif
+
+#endif // __rs_core_math_rsh__
diff --git a/22.0.1/include/rs_debug.rsh b/22.0.1/include/rs_debug.rsh
new file mode 100644
index 0000000..7a13c9d
--- /dev/null
+++ b/22.0.1/include/rs_debug.rsh
@@ -0,0 +1,267 @@
+/*
+ * Copyright (C) 2011 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/** @file rs_debug.rsh
+ *  \brief Utility debugging routines
+ *
+ *  Routines intended to be used during application development.  These should
+ *  not be used in shipping applications.  All print a string and value pair to
+ *  the standard log.
+ *
+ */
+
+#ifndef __RS_DEBUG_RSH__
+#define __RS_DEBUG_RSH__
+
+
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, float);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, float, float);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, float, float, float);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, float, float, float, float);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, float2);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, float3);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, float4);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, double);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, const rs_matrix4x4 *);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, const rs_matrix3x3 *);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, const rs_matrix2x2 *);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, int);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, uint);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, long);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, unsigned long);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, long long);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, unsigned long long);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, const void *);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, char);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, char2);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, char3);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, char4);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, unsigned char);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, uchar2);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, uchar3);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, uchar4);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, short);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, short2);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, short3);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, short4);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, unsigned short);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, ushort2);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, ushort3);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, ushort4);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, int2);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, int3);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, int4);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, uint2);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, uint3);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, uint4);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, long2);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, long3);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, long4);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, ulong2);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, ulong3);
+/**
+ * Debug function.  Prints a string and value to the log.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char *, ulong4);
+#endif  // (defined(RS_VERSION) && (RS_VERSION >= 17))
+
+#define RS_DEBUG(a) rsDebug(#a, a)
+#define RS_DEBUG_MARKER rsDebug(__FILE__, __LINE__)
+
+#endif
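A development-only sketch of the logging helpers declared above (the function and package are assumptions; not something to ship):

    #pragma version(1)
    #pragma rs java_package_name(com.example.rstest)   // hypothetical package

    void dumpState(float gain, int2 size) {
        rsDebug("gain", gain);     // prints the string and value to the standard log
        rsDebug("size", size);     // vector overload, RS_VERSION >= 17
        RS_DEBUG(gain);            // expands to rsDebug("gain", gain)
        RS_DEBUG_MARKER;           // logs the current file name and line number
    }
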
diff --git a/22.0.1/include/rs_element.rsh b/22.0.1/include/rs_element.rsh
new file mode 100644
index 0000000..0230f10
--- /dev/null
+++ b/22.0.1/include/rs_element.rsh
@@ -0,0 +1,143 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/** @file rs_element.rsh
+ *  \brief Element routines
+ *
+ *
+ */
+
+#ifndef __RS_ELEMENT_RSH__
+#define __RS_ELEMENT_RSH__
+
+// New API's
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+
+/**
+ * Elements could be simple, such as an int or a float, or a
+ * structure with multiple sub-elements, such as a collection of
+ * floats, float2, float4. This function returns zero for simple
+ * elements or the number of sub-elements otherwise.
+ *
+ * @param e element to get data from
+ * @return number of sub-elements in this element
+ */
+extern uint32_t __attribute__((overloadable))
+    rsElementGetSubElementCount(rs_element e);
+
+/**
+ * For complex elements, this function will return the
+ * sub-element at index
+ *
+ * @param e element to get data from
+ * @param index index of the sub-element to return
+ * @return sub-element in this element at given index
+ */
+extern rs_element __attribute__((overloadable))
+    rsElementGetSubElement(rs_element, uint32_t index);
+
+/**
+ * For complex elements, this function will return the length of
+ * sub-element name at index
+ *
+ * @param e element to get data from
+ * @param index index of the sub-element to return
+ * @return length of the sub-element name including the null
+ *         terminator (size of buffer needed to write the name)
+ */
+extern uint32_t __attribute__((overloadable))
+    rsElementGetSubElementNameLength(rs_element e, uint32_t index);
+
+/**
+ * For complex elements, this function will return the
+ * sub-element name at index
+ *
+ * @param e element to get data from
+ * @param index index of the sub-element
+ * @param name array to store the name into
+ * @param nameLength length of the provided name array
+ * @return number of characters actually written, excluding the
+ *         null terminator
+ */
+extern uint32_t __attribute__((overloadable))
+    rsElementGetSubElementName(rs_element e, uint32_t index, char *name, uint32_t nameLength);
+
+/**
+ * For complex elements, some sub-elements could be statically
+ * sized arrays. This function will return the array size for
+ * sub-element at index
+ *
+ * @param e element to get data from
+ * @param index index of the sub-element
+ * @return array size of sub-element in this element at given
+ *         index
+ */
+extern uint32_t __attribute__((overloadable))
+    rsElementGetSubElementArraySize(rs_element e, uint32_t index);
+
+/**
+ * This function specifies the location of a sub-element within
+ * the element
+ *
+ * @param e element to get data from
+ * @param index index of the sub-element
+ * @return offset in bytes of sub-element in this element at
+ *         given index
+ */
+extern uint32_t __attribute__((overloadable))
+    rsElementGetSubElementOffsetBytes(rs_element e, uint32_t index);
+
+/**
+ * Returns the size of element in bytes
+ *
+ * @param e element to get data from
+ * @return total size of the element in bytes
+ */
+extern uint32_t __attribute__((overloadable))
+    rsElementGetBytesSize(rs_element e);
+
+/**
+ * Returns the element's data type
+ *
+ * @param e element to get data from
+ * @return element's data type
+ */
+extern rs_data_type __attribute__((overloadable))
+    rsElementGetDataType(rs_element e);
+
+/**
+ * Returns the element's data kind
+ *
+ * @param e element to get data from
+ * @return element's data size
+ */
+extern rs_data_kind __attribute__((overloadable))
+    rsElementGetDataKind(rs_element e);
+
+/**
+ * Returns the element's vector size
+ *
+ * @param e element to get data from
+ * @return length of the element vector (for float2, float3,
+ *         etc.)
+ */
+extern uint32_t __attribute__((overloadable))
+    rsElementGetVectorSize(rs_element e);
+
+#endif // (defined(RS_VERSION) && (RS_VERSION >= 16))
+
+#endif // __RS_ELEMENT_RSH__
+
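A sketch of walking a struct element's sub-elements with the functions above (the global, buffer size, and package are assumptions; requires RS_VERSION >= 16):

    #pragma version(1)
    #pragma rs java_package_name(com.example.rstest)   // hypothetical package

    rs_element gElem;   // assumed to be bound from the Java side

    void dumpElement(void) {
        uint32_t n = rsElementGetSubElementCount(gElem);   // 0 for simple elements
        for (uint32_t i = 0; i < n; i++) {
            char name[64];   // rsElementGetSubElementNameLength() gives the exact size needed
            rsElementGetSubElementName(gElem, i, name, sizeof(name));
            rsDebug(name, rsElementGetSubElementOffsetBytes(gElem, i));
        }
    }
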
diff --git a/22.0.1/include/rs_graphics.rsh b/22.0.1/include/rs_graphics.rsh
new file mode 100644
index 0000000..1fcb7ed
--- /dev/null
+++ b/22.0.1/include/rs_graphics.rsh
@@ -0,0 +1,426 @@
+/*
+ * Copyright (C) 2011-2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/** @file rs_graphics.rsh
+ *  \brief RenderScript graphics API
+ *
+ *  A set of graphics functions used by RenderScript.
+ *
+ */
+#ifndef __RS_GRAPHICS_RSH__
+#define __RS_GRAPHICS_RSH__
+
+#ifdef __LP64__
+//#error "RenderScript graphics is deprecated and not supported in 64bit mode."
+#else
+
+#include "rs_mesh.rsh"
+#include "rs_program.rsh"
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+/**
+ * Set the color target used for all subsequent rendering calls
+ * @param colorTarget
+ * @param slot
+ */
+extern void __attribute__((overloadable))
+    rsgBindColorTarget(rs_allocation colorTarget, uint slot);
+
+/**
+ * Clear the previously set color target
+ * @param slot
+ */
+extern void __attribute__((overloadable))
+    rsgClearColorTarget(uint slot);
+
+/**
+ * Set the depth target used for all subsequent rendering calls
+ * @param depthTarget
+ */
+extern void __attribute__((overloadable))
+    rsgBindDepthTarget(rs_allocation depthTarget);
+
+/**
+ * Clear the previously set depth target
+ */
+extern void __attribute__((overloadable))
+    rsgClearDepthTarget(void);
+
+/**
+ * Clear all color and depth targets and resume rendering into
+ * the framebuffer
+ */
+extern void __attribute__((overloadable))
+    rsgClearAllRenderTargets(void);
+
+/**
+ * Force RenderScript to finish all rendering commands
+ */
+extern uint __attribute__((overloadable))
+    rsgFinish(void);
+
+#endif //defined(RS_VERSION) && (RS_VERSION >= 14)
+
+/**
+ * Bind a new ProgramFragment to the rendering context.
+ *
+ * @param pf
+ */
+extern void __attribute__((overloadable))
+    rsgBindProgramFragment(rs_program_fragment pf);
+
+/**
+ * Bind a new ProgramStore to the rendering context.
+ *
+ * @param ps
+ */
+extern void __attribute__((overloadable))
+    rsgBindProgramStore(rs_program_store ps);
+
+/**
+ * Bind a new ProgramVertex to the rendering context.
+ *
+ * @param pv
+ */
+extern void __attribute__((overloadable))
+    rsgBindProgramVertex(rs_program_vertex pv);
+
+/**
+ * Bind a new ProgramRaster to the rendering context.
+ *
+ * @param pr
+ */
+extern void __attribute__((overloadable))
+    rsgBindProgramRaster(rs_program_raster pr);
+
+/**
+ * Bind a new Sampler object to a ProgramFragment.  The sampler will
+ * operate on the texture bound at the matching slot.
+ *
+ * @param slot
+ */
+extern void __attribute__((overloadable))
+    rsgBindSampler(rs_program_fragment, uint slot, rs_sampler);
+
+/**
+ * Bind a new Allocation object to a ProgramFragment.  The
+ * Allocation must be a valid texture for the Program.  The sampling
+ * of the texture will be controlled by the Sampler bound at the
+ * matching slot.
+ *
+ * @param slot
+ */
+extern void __attribute__((overloadable))
+    rsgBindTexture(rs_program_fragment, uint slot, rs_allocation);
+
+/**
+ * Load the projection matrix for a currently bound fixed function
+ * vertex program. Calling this function with a custom vertex shader
+ * would result in an error.
+ * @param proj projection matrix
+ */
+extern void __attribute__((overloadable))
+    rsgProgramVertexLoadProjectionMatrix(const rs_matrix4x4 *proj);
+/**
+ * Load the model matrix for a currently bound fixed function
+ * vertex program. Calling this function with a custom vertex shader
+ * would result in an error.
+ * @param model model matrix
+ */
+extern void __attribute__((overloadable))
+    rsgProgramVertexLoadModelMatrix(const rs_matrix4x4 *model);
+/**
+ * Load the texture matrix for a currently bound fixed function
+ * vertex program. Calling this function with a custom vertex shader
+ * would result in an error.
+ * @param tex texture matrix
+ */
+extern void __attribute__((overloadable))
+    rsgProgramVertexLoadTextureMatrix(const rs_matrix4x4 *tex);
+/**
+ * Get the projection matrix for a currently bound fixed function
+ * vertex program. Calling this function with a custom vertex shader
+ * would result in an error.
+ * @param proj matrix to store the current projection matrix into
+ */
+extern void __attribute__((overloadable))
+    rsgProgramVertexGetProjectionMatrix(rs_matrix4x4 *proj);
+
+/**
+ * Set the constant color for a fixed function emulation program.
+ *
+ * @param pf
+ * @param r
+ * @param g
+ * @param b
+ * @param a
+ */
+extern void __attribute__((overloadable))
+    rsgProgramFragmentConstantColor(rs_program_fragment pf, float r, float g, float b, float a);
+
+/**
+ * Bind a new Allocation object to a ProgramFragment.  The
+ * Allocation must be a valid constant input for the Program.
+ *
+ * @param ps program object
+ * @param slot index of the constant buffer on the program
+ * @param c constants to bind
+ */
+extern void __attribute__((overloadable))
+    rsgBindConstant(rs_program_fragment ps, uint slot, rs_allocation c);
+
+/**
+ * Bind a new Allocation object to a ProgramVertex.  The
+ * Allocation must be a valid constant input for the Program.
+ *
+ * @param pv program object
+ * @param slot index of the constant buffer on the program
+ * @param c constants to bind
+ */
+extern void __attribute__((overloadable))
+    rsgBindConstant(rs_program_vertex pv, uint slot, rs_allocation c);
+
+/**
+ * Get the width of the current rendering surface.
+ *
+ * @return uint
+ */
+extern uint __attribute__((overloadable))
+    rsgGetWidth(void);
+
+/**
+ * Get the height of the current rendering surface.
+ *
+ * @return uint
+ */
+extern uint __attribute__((overloadable))
+    rsgGetHeight(void);
+
+
+/**
+ * Sync the contents of an allocation from its SCRIPT memory space to its HW
+ * memory spaces.
+ *
+ * @param alloc
+ */
+extern void __attribute__((overloadable))
+    rsgAllocationSyncAll(rs_allocation alloc);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+
+/**
+ * Sync the contents of an allocation from memory space
+ * specified by source.
+ *
+ * @param alloc
+ * @param source
+ */
+extern void __attribute__((overloadable))
+    rsgAllocationSyncAll(rs_allocation alloc,
+                         rs_allocation_usage_type source);
+
+#endif //defined(RS_VERSION) && (RS_VERSION >= 14)
+
+/**
+ * Low performance utility function for drawing a simple rectangle.  Not
+ * intended for drawing large quantities of geometry.
+ *
+ * @param x1
+ * @param y1
+ * @param x2
+ * @param y2
+ * @param z
+ */
+extern void __attribute__((overloadable))
+    rsgDrawRect(float x1, float y1, float x2, float y2, float z);
+
+/**
+ * Low performance utility function for drawing a simple quad.  Not intended for
+ * drawing large quantities of geometry.
+ *
+ * @param x1
+ * @param y1
+ * @param z1
+ * @param x2
+ * @param y2
+ * @param z2
+ * @param x3
+ * @param y3
+ * @param z3
+ * @param x4
+ * @param y4
+ * @param z4
+ */
+extern void __attribute__((overloadable))
+    rsgDrawQuad(float x1, float y1, float z1,
+                float x2, float y2, float z2,
+                float x3, float y3, float z3,
+                float x4, float y4, float z4);
+
+
+/**
+ * Low performance utility function for drawing a textured quad.  Not intended
+ * for drawing large quantities of geometry.
+ *
+ * @param x1
+ * @param y1
+ * @param z1
+ * @param u1
+ * @param v1
+ * @param x2
+ * @param y2
+ * @param z2
+ * @param u2
+ * @param v2
+ * @param x3
+ * @param y3
+ * @param z3
+ * @param u3
+ * @param v3
+ * @param x4
+ * @param y4
+ * @param z4
+ * @param u4
+ * @param v4
+ */
+extern void __attribute__((overloadable))
+    rsgDrawQuadTexCoords(float x1, float y1, float z1, float u1, float v1,
+                         float x2, float y2, float z2, float u2, float v2,
+                         float x3, float y3, float z3, float u3, float v3,
+                         float x4, float y4, float z4, float u4, float v4);
+
+
+/**
+ * Low performance function for drawing rectangles in screenspace.  This
+ * function uses the default passthrough ProgramVertex.  Any bound ProgramVertex
+ * is ignored.  This function has considerable overhead and should not be used
+ * for drawing in shipping applications.
+ *
+ * @param x
+ * @param y
+ * @param z
+ * @param w
+ * @param h
+ */
+extern void __attribute__((overloadable))
+    rsgDrawSpriteScreenspace(float x, float y, float z, float w, float h);
+
+extern void __attribute__((overloadable))
+    rsgDrawPath(rs_path p);
+
+/**
+ * Draw a mesh using the current context state.  The whole mesh is
+ * rendered.
+ *
+ * @param ism
+ */
+extern void __attribute__((overloadable))
+    rsgDrawMesh(rs_mesh ism);
+/**
+ * Draw part of a mesh using the current context state.
+ * @param ism mesh object to render
+ * @param primitiveIndex for meshes that contain multiple primitive groups
+ *        this parameter specifies the index of the group to draw.
+ */
+extern void __attribute__((overloadable))
+    rsgDrawMesh(rs_mesh ism, uint primitiveIndex);
+/**
+ * Draw specified index range of part of a mesh using the current context state.
+ * @param ism mesh object to render
+ * @param primitiveIndex for meshes that contain multiple primitive groups
+ *        this parameter specifies the index of the group to draw.
+ * @param start starting index in the range
+ * @param len number of indices to draw
+ */
+extern void __attribute__((overloadable))
+    rsgDrawMesh(rs_mesh ism, uint primitiveIndex, uint start, uint len);
+
+/**
+ * Clears the rendering surface to the specified color.
+ *
+ * @param r
+ * @param g
+ * @param b
+ * @param a
+ */
+extern void __attribute__((overloadable))
+    rsgClearColor(float r, float g, float b, float a);
+
+/**
+ * Clears the depth surface to the specified value.
+ */
+extern void __attribute__((overloadable))
+    rsgClearDepth(float value);
+/**
+ * Draws text given a string and location
+ */
+extern void __attribute__((overloadable))
+    rsgDrawText(const char *, int x, int y);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable))
+    rsgDrawText(rs_allocation, int x, int y);
+/**
+ * Binds the font object to be used for all subsequent font rendering calls
+ * @param font object to bind
+ */
+extern void __attribute__((overloadable))
+    rsgBindFont(rs_font font);
+/**
+ * Sets the font color for all subsequent rendering calls
+ * @param r red component
+ * @param g green component
+ * @param b blue component
+ * @param a alpha component
+ */
+extern void __attribute__((overloadable))
+    rsgFontColor(float r, float g, float b, float a);
+/**
+ * Returns the bounding box of the text relative to (0, 0).
+ * Any of left, right, top, bottom may be NULL.
+ */
+extern void __attribute__((overloadable))
+    rsgMeasureText(const char *, int *left, int *right, int *top, int *bottom);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable))
+    rsgMeasureText(rs_allocation, int *left, int *right, int *top, int *bottom);
+/**
+ * Computes an axis aligned bounding box of a mesh object
+ */
+extern void __attribute__((overloadable))
+    rsgMeshComputeBoundingBox(rs_mesh mesh, float *minX, float *minY, float *minZ,
+                                                float *maxX, float *maxY, float *maxZ);
+/**
+ * \overload
+ */
+__inline__ static void __attribute__((overloadable, always_inline))
+rsgMeshComputeBoundingBox(rs_mesh mesh, float3 *bBoxMin, float3 *bBoxMax) {
+    float x1, y1, z1, x2, y2, z2;
+    rsgMeshComputeBoundingBox(mesh, &x1, &y1, &z1, &x2, &y2, &z2);
+    bBoxMin->x = x1;
+    bBoxMin->y = y1;
+    bBoxMin->z = z1;
+    bBoxMax->x = x2;
+    bBoxMax->y = y2;
+    bBoxMax->z = z2;
+}
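+
+/* Illustrative usage sketch (not part of the upstream header): fetching a
+ * mesh's axis-aligned bounds as float3 values; the mesh variable is assumed
+ * to hold a valid rs_mesh.
+ *
+ *   float3 bMin, bMax;
+ *   rsgMeshComputeBoundingBox(mesh, &bMin, &bMax);
+ *   float3 center = (bMin + bMax) * 0.5f;
+ */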
+
+#endif //__LP64__
+#endif
+
diff --git a/22.0.1/include/rs_math.rsh b/22.0.1/include/rs_math.rsh
new file mode 100644
index 0000000..4d3124c
--- /dev/null
+++ b/22.0.1/include/rs_math.rsh
@@ -0,0 +1,251 @@
+/*
+ * Copyright (C) 2011 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/** @file rs_math.rsh
+ *  \brief todo-jsams
+ *
+ *  todo-jsams
+ *
+ */
+
+#ifndef __RS_MATH_RSH__
+#define __RS_MATH_RSH__
+
+
+/**
+ * Return a random value between 0 (or min_value) and max_value.
+ */
+extern int __attribute__((overloadable))
+    rsRand(int max_value);
+/**
+ * \overload
+ */
+extern int __attribute__((overloadable))
+    rsRand(int min_value, int max_value);
+/**
+ * \overload
+ */
+extern float __attribute__((overloadable))
+    rsRand(float max_value);
+/**
+ * \overload
+ */
+extern float __attribute__((overloadable))
+    rsRand(float min_value, float max_value);
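+
+/* Illustrative usage sketch (not part of the upstream header): randomizing a
+ * particle; the particle struct and field names are hypothetical.
+ *
+ *   p->velocity.x = rsRand(-1.f, 1.f);   // float overload, between -1 and 1
+ *   p->lifetime   = rsRand(30);          // int overload, between 0 and 30
+ */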
+
+/**
+ * Returns the fractional part of a float
+ */
+extern float __attribute__((const, overloadable))
+    rsFrac(float);
+
+
+/////////////////////////////////////////////////////
+// int ops
+/////////////////////////////////////////////////////
+
+/**
+ * Clamp the value amount between low and high.
+ *
+ * @param amount  The value to clamp
+ * @param low
+ * @param high
+ */
+_RS_RUNTIME uint __attribute__((const, overloadable, always_inline)) rsClamp(uint amount, uint low, uint high);
+
+/**
+ * \overload
+ */
+_RS_RUNTIME int __attribute__((const, overloadable, always_inline)) rsClamp(int amount, int low, int high);
+/**
+ * \overload
+ */
+_RS_RUNTIME ushort __attribute__((const, overloadable, always_inline)) rsClamp(ushort amount, ushort low, ushort high);
+/**
+ * \overload
+ */
+_RS_RUNTIME short __attribute__((const, overloadable, always_inline)) rsClamp(short amount, short low, short high);
+/**
+ * \overload
+ */
+_RS_RUNTIME uchar __attribute__((const, overloadable, always_inline)) rsClamp(uchar amount, uchar low, uchar high);
+/**
+ * \overload
+ */
+_RS_RUNTIME char __attribute__((const, overloadable, always_inline)) rsClamp(char amount, char low, char high);
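+
+/* Illustrative usage sketch (not part of the upstream header): clamping a
+ * computed coordinate to a valid pixel index; width is a hypothetical
+ * allocation dimension.
+ *
+ *   uint32_t xc = rsClamp(x + offset, (uint32_t)0, width - 1);
+ */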
+
+
+/**
+ * Computes 6 frustum planes from the view projection matrix
+ * @param viewProj matrix to extract planes from
+ * @param left plane
+ * @param right plane
+ * @param top plane
+ * @param bottom plane
+ * @param near plane
+ * @param far plane
+ */
+__inline__ static void __attribute__((overloadable, always_inline))
+rsExtractFrustumPlanes(const rs_matrix4x4 *viewProj,
+                         float4 *left, float4 *right,
+                         float4 *top, float4 *bottom,
+                         float4 *near, float4 *far) {
+    // x y z w = a b c d in the plane equation
+    left->x = viewProj->m[3] + viewProj->m[0];
+    left->y = viewProj->m[7] + viewProj->m[4];
+    left->z = viewProj->m[11] + viewProj->m[8];
+    left->w = viewProj->m[15] + viewProj->m[12];
+
+    right->x = viewProj->m[3] - viewProj->m[0];
+    right->y = viewProj->m[7] - viewProj->m[4];
+    right->z = viewProj->m[11] - viewProj->m[8];
+    right->w = viewProj->m[15] - viewProj->m[12];
+
+    top->x = viewProj->m[3] - viewProj->m[1];
+    top->y = viewProj->m[7] - viewProj->m[5];
+    top->z = viewProj->m[11] - viewProj->m[9];
+    top->w = viewProj->m[15] - viewProj->m[13];
+
+    bottom->x = viewProj->m[3] + viewProj->m[1];
+    bottom->y = viewProj->m[7] + viewProj->m[5];
+    bottom->z = viewProj->m[11] + viewProj->m[9];
+    bottom->w = viewProj->m[15] + viewProj->m[13];
+
+    near->x = viewProj->m[3] + viewProj->m[2];
+    near->y = viewProj->m[7] + viewProj->m[6];
+    near->z = viewProj->m[11] + viewProj->m[10];
+    near->w = viewProj->m[15] + viewProj->m[14];
+
+    far->x = viewProj->m[3] - viewProj->m[2];
+    far->y = viewProj->m[7] - viewProj->m[6];
+    far->z = viewProj->m[11] - viewProj->m[10];
+    far->w = viewProj->m[15] - viewProj->m[14];
+
+    float len = length(left->xyz);
+    *left /= len;
+    len = length(right->xyz);
+    *right /= len;
+    len = length(top->xyz);
+    *top /= len;
+    len = length(bottom->xyz);
+    *bottom /= len;
+    len = length(near->xyz);
+    *near /= len;
+    len = length(far->xyz);
+    *far /= len;
+}
+
+/**
+ * Checks if a sphere is within the 6 frustum planes
+ * @param sphere float4 representing the sphere
+ * @param left plane
+ * @param right plane
+ * @param top plane
+ * @param bottom plane
+ * @param near plane
+ * @param far plane
+ */
+__inline__ static bool __attribute__((overloadable, always_inline))
+rsIsSphereInFrustum(float4 *sphere,
+                      float4 *left, float4 *right,
+                      float4 *top, float4 *bottom,
+                      float4 *near, float4 *far) {
+
+    float distToCenter = dot(left->xyz, sphere->xyz) + left->w;
+    if (distToCenter < -sphere->w) {
+        return false;
+    }
+    distToCenter = dot(right->xyz, sphere->xyz) + right->w;
+    if (distToCenter < -sphere->w) {
+        return false;
+    }
+    distToCenter = dot(top->xyz, sphere->xyz) + top->w;
+    if (distToCenter < -sphere->w) {
+        return false;
+    }
+    distToCenter = dot(bottom->xyz, sphere->xyz) + bottom->w;
+    if (distToCenter < -sphere->w) {
+        return false;
+    }
+    distToCenter = dot(near->xyz, sphere->xyz) + near->w;
+    if (distToCenter < -sphere->w) {
+        return false;
+    }
+    distToCenter = dot(far->xyz, sphere->xyz) + far->w;
+    if (distToCenter < -sphere->w) {
+        return false;
+    }
+    return true;
+}
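+
+/* Illustrative usage sketch (not part of the upstream header): culling a
+ * bounding sphere against a view-projection matrix.  gVPMatrix is a
+ * hypothetical script global; the sphere packs (center.xyz, radius) into a
+ * float4.
+ *
+ *   float4 l, r, t, b, n, f;
+ *   rsExtractFrustumPlanes(&gVPMatrix, &l, &r, &t, &b, &n, &f);
+ *
+ *   float4 sphere = {0.f, 0.f, -5.f, 1.f};   // centered at (0, 0, -5), radius 1
+ *   if (rsIsSphereInFrustum(&sphere, &l, &r, &t, &b, &n, &f)) {
+ *       // the sphere is at least partially visible; draw it
+ *   }
+ */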
+
+
+/**
+ * Pack floating point (0-1) RGB values into a uchar4.  The alpha component is
+ * set to 255 (1.0).
+ *
+ * @param r
+ * @param g
+ * @param b
+ *
+ * @return uchar4
+ */
+_RS_RUNTIME uchar4 __attribute__((const, overloadable)) rsPackColorTo8888(float r, float g, float b);
+
+/**
+ * Pack floating point (0-1) RGBA values into a uchar4.
+ *
+ * @param r
+ * @param g
+ * @param b
+ * @param a
+ *
+ * @return uchar4
+ */
+_RS_RUNTIME uchar4 __attribute__((const, overloadable)) rsPackColorTo8888(float r, float g, float b, float a);
+
+/**
+ * Pack floating point (0-1) RGB values into a uchar4.  The alpha component is
+ * set to 255 (1.0).
+ *
+ * @param color
+ *
+ * @return uchar4
+ */
+_RS_RUNTIME uchar4 __attribute__((const, overloadable)) rsPackColorTo8888(float3 color);
+
+/**
+ * Pack floating point (0-1) RGBA values into a uchar4.
+ *
+ * @param color
+ *
+ * @return uchar4
+ */
+_RS_RUNTIME uchar4 __attribute__((const, overloadable)) rsPackColorTo8888(float4 color);
+
+/**
+ * Unpack a uchar4 color to float4.  The resulting float range will be (0-1).
+ *
+ * @param c
+ *
+ * @return float4
+ */
+_RS_RUNTIME float4 __attribute__((const)) rsUnpackColor8888(uchar4 c);
+
+_RS_RUNTIME uchar4 __attribute__((const, overloadable)) rsYuvToRGBA_uchar4(uchar y, uchar u, uchar v);
+_RS_RUNTIME float4 __attribute__((const, overloadable)) rsYuvToRGBA_float4(uchar y, uchar u, uchar v);
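+
+/* Illustrative usage sketch (not part of the upstream header): unpacking a
+ * pixel, brightening it, and packing it back.  The kernel name and gain are
+ * hypothetical, and the __attribute__((kernel)) style assumes API 17+.
+ *
+ *   uchar4 __attribute__((kernel)) brighten(uchar4 in) {
+ *       float4 c = rsUnpackColor8888(in);
+ *       c.rgb = clamp(c.rgb * 1.2f, 0.f, 1.f);
+ *       return rsPackColorTo8888(c);
+ *   }
+ */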
+
+
+#endif
diff --git a/22.0.1/include/rs_matrix.rsh b/22.0.1/include/rs_matrix.rsh
new file mode 100644
index 0000000..34b9532
--- /dev/null
+++ b/22.0.1/include/rs_matrix.rsh
@@ -0,0 +1,532 @@
+/*
+ * Copyright (C) 2011 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/** @file rs_matrix.rsh
+ *  \brief Matrix functions.
+ *
+ * These functions let you manipulate square matrices of rank 2x2, 3x3, and 4x4.
+ * They are particularly useful for graphical transformations and are
+ * compatible with OpenGL.
+ *
+ * A few general notes:
+ *
+ * \li We use a zero-based index for rows and columns.  E.g. the last element of
+ * a \ref rs_matrix4x4 is found at (3, 3).
+ *
+ * \li RenderScript uses column-based vectors.  Transforming a vector is done by
+ * postmultiplying the vector, e.g. <em>(matrix * vector)</em>, as provided by
+ * \ref rsMatrixMultiply.
+ *
+ * \li To create a transformation matrix that performs two transformations at
+ * once, multiply the two source matrices, with the first transformation as the
+ * right argument.  E.g. to create a transformation matrix that applies the
+ * transformation \e s1 followed by \e s2, call
+ * <c>rsMatrixLoadMultiply(&combined, &s2, &s1)</c>.
+ * This derives from <em>s2 * (s1 * v)</em>, which is <em>(s2 * s1) * v</em>.
+ *
+ * \li We have two styles of functions to create transformation matrices:
+ * rsMatrixLoad<em>Transformation</em> and rsMatrix<em>Transformation</em>.  The
+ * former style simply stores the transformation matrix in the first argument.
+ * The latter modifies a pre-existing transformation matrix so that the new
+ * transformation happens first.  E.g. if you call \ref rsMatrixTranslate
+ * on a matrix that already does a scaling, the resulting matrix when applied
+ * to a vector will first do the translation then the scaling.
+ *
+ */
+
+#ifndef __RS_MATRIX_RSH__
+#define __RS_MATRIX_RSH__
+
+/**
+ * Set an element of a matrix.
+ *
+ * @param m The matrix that will be modified.
+ * @param col The zero-based column of the element to be set.
+ * @param row The zero-based row of the element to be set.
+ * @param v The value to set.
+ *
+ * \warning The order of the column and row parameters may be
+ * unexpected.
+ *
+ * @return void
+ */
+_RS_RUNTIME void __attribute__((overloadable))
+rsMatrixSet(rs_matrix4x4 *m, uint32_t col, uint32_t row, float v);
+/**
+ * \overload
+ */
+_RS_RUNTIME void __attribute__((overloadable))
+rsMatrixSet(rs_matrix3x3 *m, uint32_t col, uint32_t row, float v);
+/**
+ * \overload
+ */
+_RS_RUNTIME void __attribute__((overloadable))
+rsMatrixSet(rs_matrix2x2 *m, uint32_t col, uint32_t row, float v);
+
+/**
+ * Returns one element of a matrix.
+ *
+ * @param m The matrix to extract the element from.
+ * @param col The zero-based column of the element to be extracted.
+ * @param row The zero-based row of the element to extracted.
+ *
+ * \warning The order of the column and row parameters may be
+ * unexpected.
+ *
+ * @return float
+ */
+_RS_RUNTIME float __attribute__((overloadable))
+rsMatrixGet(const rs_matrix4x4 *m, uint32_t col, uint32_t row);
+/**
+ * \overload
+ */
+_RS_RUNTIME float __attribute__((overloadable))
+rsMatrixGet(const rs_matrix3x3 *m, uint32_t col, uint32_t row);
+/**
+ * \overload
+ */
+_RS_RUNTIME float __attribute__((overloadable))
+rsMatrixGet(const rs_matrix2x2 *m, uint32_t col, uint32_t row);
+
+/**
+ * Set the elements of a matrix to the identity matrix.
+ *
+ * @param m The matrix to set.
+ */
+extern void __attribute__((overloadable)) rsMatrixLoadIdentity(rs_matrix4x4 *m);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable)) rsMatrixLoadIdentity(rs_matrix3x3 *m);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable)) rsMatrixLoadIdentity(rs_matrix2x2 *m);
+
+/**
+ * Set the elements of a matrix from an array of floats.
+ *
+ * The array of floats should be in row-major order, i.e. the element at
+ * <em>row 0, column 0</em> should be first, followed by the element at
+ * <em>row 0, column 1</em>, etc.
+ *
+ * @param m The matrix to set.
+ * @param v The array of values to set the matrix to. These arrays should be
+ * 4, 9, or 16 floats long, depending on the matrix size.
+ */
+extern void __attribute__((overloadable)) rsMatrixLoad(rs_matrix4x4 *m, const float *v);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable)) rsMatrixLoad(rs_matrix3x3 *m, const float *v);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable)) rsMatrixLoad(rs_matrix2x2 *m, const float *v);
+/**
+ * Set the elements of a matrix from another matrix.
+ *
+ * If the source matrix is smaller than the destination, the rest of the
+ * destination is filled with elements of the identity matrix.  E.g.
+ * loading a rs_matrix2x2 into a rs_matrix4x4 will give:
+ *
+ * \htmlonly<table>
+ * <tr><td>m00</td><td>m01</td><td>0.0</td><td>0.0</td></tr>
+ * <tr><td>m10</td><td>m11</td><td>0.0</td><td>0.0</td></tr>
+ * <tr><td>0.0</td><td>0.0</td><td>1.0</td><td>0.0</td></tr>
+ * <tr><td>0.0</td><td>0.0</td><td>0.0</td><td>1.0</td></tr>
+ * </table>\endhtmlonly
+ *
+ * @param m The matrix to set.
+ * @param v The source matrix.
+ */
+extern void __attribute__((overloadable)) rsMatrixLoad(rs_matrix4x4 *m, const rs_matrix4x4 *v);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable)) rsMatrixLoad(rs_matrix4x4 *m, const rs_matrix3x3 *v);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable)) rsMatrixLoad(rs_matrix4x4 *m, const rs_matrix2x2 *v);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable)) rsMatrixLoad(rs_matrix3x3 *m, const rs_matrix3x3 *v);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable)) rsMatrixLoad(rs_matrix2x2 *m, const rs_matrix2x2 *v);
+
+/**
+ * Load a rotation matrix.
+ *
+ * This function creates a rotation matrix.  The axis of rotation is the
+ * <em>(x, y, z)</em> vector.
+ *
+ * To rotate a vector, multiply the vector by the created matrix
+ * using \ref rsMatrixMultiply.
+ *
+ * See http://en.wikipedia.org/wiki/Rotation_matrix .
+ *
+ * @param m The matrix to set.
+ * @param rot How much rotation to do, in degrees.
+ * @param x The x component of the vector that is the axis of rotation.
+ * @param y The y component of the vector that is the axis of rotation.
+ * @param z The z component of the vector that is the axis of rotation.
+ */
+extern void __attribute__((overloadable))
+rsMatrixLoadRotate(rs_matrix4x4 *m, float rot, float x, float y, float z);
+
+/**
+ * Load a scale matrix.
+ *
+ * This function creates a scaling matrix, where each component of a
+ * vector is multiplied by a number.  This number can be negative.
+ *
+ * To scale a vector, multiply the vector by the created matrix
+ * using \ref rsMatrixMultiply.
+ *
+ * @param m The matrix to set.
+ * @param x The multiple to scale the x components by.
+ * @param y The multiple to scale the y components by.
+ * @param z The multiple to scale the z components by.
+ */
+extern void __attribute__((overloadable))
+rsMatrixLoadScale(rs_matrix4x4 *m, float x, float y, float z);
+
+/**
+ * Load a translation matrix.
+ *
+ * This function creates a translation matrix, where a
+ * number is added to each element of a vector.
+ *
+ * To translate a vector, multiply the vector by the created matrix
+ * using \ref rsMatrixMultiply.
+ *
+ * @param m The matrix to set.
+ * @param x The number to add to each x component.
+ * @param y The number to add to each y component.
+ * @param z The number to add to each z component.
+ */
+extern void __attribute__((overloadable))
+rsMatrixLoadTranslate(rs_matrix4x4 *m, float x, float y, float z);
+
+/**
+ * Multiply two matrices.
+ *
+ * Sets \e m to the matrix product of <em>lhs * rhs</em>.
+ *
+ * To combine two 4x4 transformation matrices, multiply the second transformation matrix
+ * by the first transformation matrix.  E.g. to create a transformation matrix that applies
+ * the transformation \e s1 followed by \e s2, call
+ * <c>rsMatrixLoadMultiply(&combined, &s2, &s1)</c>.
+ *
+ * \warning Prior to version 21, storing the result back into the right matrix is not
+ * supported and will result in undefined behavior.  Use rsMatrixMultiply instead.  E.g. instead
+ * of rsMatrixLoadMultiply(&m2r, &m2r, &m2l), use rsMatrixMultiply(&m2r, &m2l).
+ * rsMatrixLoadMultiply(&m2l, &m2r, &m2l) works as expected.
+ *
+ * @param m The matrix to set.
+ * @param lhs The left matrix of the product.
+ * @param rhs The right matrix of the product.
+ */
+extern void __attribute__((overloadable))
+rsMatrixLoadMultiply(rs_matrix4x4 *m, const rs_matrix4x4 *lhs, const rs_matrix4x4 *rhs);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable))
+rsMatrixLoadMultiply(rs_matrix3x3 *m, const rs_matrix3x3 *lhs, const rs_matrix3x3 *rhs);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable))
+rsMatrixLoadMultiply(rs_matrix2x2 *m, const rs_matrix2x2 *lhs, const rs_matrix2x2 *rhs);
+
+/**
+ * Multiply a matrix into another one.
+ *
+ * Sets \e m to the matrix product <em>m * rhs</em>.
+ *
+ * When combining two 4x4 transformation matrices using this function, the resulting
+ * matrix will correspond to performing the \e rhs transformation first followed by
+ * the original \e m transformation.
+ *
+ * @param m The left matrix of the product and the matrix to be set.
+ * @param rhs The right matrix of the product.
+ */
+extern void __attribute__((overloadable))
+rsMatrixMultiply(rs_matrix4x4 *m, const rs_matrix4x4 *rhs);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable))
+rsMatrixMultiply(rs_matrix3x3 *m, const rs_matrix3x3 *rhs);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable))
+rsMatrixMultiply(rs_matrix2x2 *m, const rs_matrix2x2 *rhs);
+
+/**
+ * Multiply the matrix \e m with a rotation matrix.
+ *
+ * This function modifies a transformation matrix to first do a rotation.
+ * The axis of rotation is the <em>(x, y, z)</em> vector.
+ *
+ * To apply this combined transformation to a vector, multiply
+ * the vector by the created matrix using \ref rsMatrixMultiply.
+ *
+ * @param m The matrix to modify.
+ * @param rot How much rotation to do, in degrees.
+ * @param x The x component of the vector that is the axis of rotation.
+ * @param y The y component of the vector that is the axis of rotation.
+ * @param z The z component of the vector that is the axis of rotation.
+ */
+extern void __attribute__((overloadable))
+rsMatrixRotate(rs_matrix4x4 *m, float rot, float x, float y, float z);
+
+/**
+ * Multiply the matrix \e m with a scaling matrix.
+ *
+ * This function modifies a transformation matrix to first do a scaling.
+ * When scaling, each component of a vector is multiplied by a number.
+ * This number can be negative.
+ *
+ * To apply this combined transformation to a vector, multiply
+ * the vector by the created matrix using \ref rsMatrixMultiply.
+ *
+ * @param m The matrix to modify.
+ * @param x The multiple to scale the x components by.
+ * @param y The multiple to scale the y components by.
+ * @param z The multiple to scale the z components by.
+ */
+extern void __attribute__((overloadable))
+rsMatrixScale(rs_matrix4x4 *m, float x, float y, float z);
+
+/**
+ * Multiply the matrix \e m with a translation matrix.
+ *
+ * This function modifies a transformation matrix to first
+ * do a translation.  When translating, a number is added
+ * to each component of a vector.
+ *
+ * To apply this combined transformation to a vector, multiply
+ * the vector by the created matrix using \ref rsMatrixMultiply.
+ *
+ * @param m The matrix to modify.
+ * @param x The number to add to each x component.
+ * @param y The number to add to each y component.
+ * @param z The number to add to each z component.
+ */
+extern void __attribute__((overloadable))
+rsMatrixTranslate(rs_matrix4x4 *m, float x, float y, float z);
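+
+/* Illustrative usage sketch (not part of the upstream header): building a
+ * model matrix that scales a vector first and then translates it.  Because
+ * the rsMatrix<em>Transformation</em> functions make the new transformation
+ * happen first, the scale is appended last in the call sequence.
+ *
+ *   rs_matrix4x4 model;
+ *   rsMatrixLoadTranslate(&model, 5.f, 0.f, 0.f);   // applied second
+ *   rsMatrixScale(&model, 2.f, 2.f, 2.f);           // applied first
+ */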
+
+/**
+ * Load an orthographic projection matrix.
+ *
+ * Constructs an orthographic projection matrix, transforming the box
+ * identified by the six clipping planes <em>left, right, bottom, top,
+ * near, far</em> into a unit cube with a corner at
+ * <em>(-1, -1, -1)</em> and the opposite at <em>(1, 1, 1)</em>.
+ *
+ * To apply this projection to a vector, multiply the vector by the
+ * created matrix using \ref rsMatrixMultiply.
+ *
+ * See https://en.wikipedia.org/wiki/Orthographic_projection .
+ *
+ * @param m The matrix to set.
+ * @param left
+ * @param right
+ * @param bottom
+ * @param top
+ * @param near
+ * @param far
+ */
+extern void __attribute__((overloadable))
+rsMatrixLoadOrtho(rs_matrix4x4 *m, float left, float right, float bottom, float top, float near, float far);
+
+/**
+ * Load a frustum projection matrix.
+ *
+ * Constructs a frustum projection matrix, transforming the box
+ * identified by the six clipping planes <em>left, right, bottom, top,
+ * near, far</em>.
+ *
+ * To apply this projection to a vector, multiply the vector by the
+ * created matrix using \ref rsMatrixMultiply.
+ *
+ * @param m The matrix to set.
+ * @param left
+ * @param right
+ * @param bottom
+ * @param top
+ * @param near
+ * @param far
+ */
+extern void __attribute__((overloadable))
+rsMatrixLoadFrustum(rs_matrix4x4 *m, float left, float right, float bottom, float top, float near, float far);
+
+/**
+ * Load a perspective projection matrix.
+ *
+ * Constructs a perspective projection matrix, assuming a symmetrical field of view.
+ *
+ * To apply this projection to a vector, multiply the vector by the
+ * created matrix using \ref rsMatrixMultiply.
+ *
+ * @param m The matrix to set.
+ * @param fovy Field of view, in degrees along the Y axis.
+ * @param aspect Ratio of x / y.
+ * @param near The near clipping plane.
+ * @param far The far clipping plane.
+ */
+extern void __attribute__((overloadable))
+rsMatrixLoadPerspective(rs_matrix4x4* m, float fovy, float aspect, float near, float far);
+
+#if !defined(RS_VERSION) || (RS_VERSION < 14)
+/**
+ * Multiply a vector by a matrix.
+ *
+ * Returns the post-multiplication of the vector by the matrix, i.e. <em>m * in</em>.
+ *
+ * When multiplying a \e float3 by a \e rs_matrix4x4, the vector is expanded with (1).
+ *
+ * When multiplying a \e float2 by a \e rs_matrix4x4, the vector is expanded with (0, 1).
+ *
+ * When multiplying a \e float2 by a \e rs_matrix3x3, the vector is expanded with (0).
+ *
+ * This function is available in API version 10-13.  Starting with API 14,
+ * the function takes a const matrix as the first argument.
+ */
+_RS_RUNTIME float4 __attribute__((overloadable))
+rsMatrixMultiply(rs_matrix4x4 *m, float4 in);
+
+/**
+ * \overload
+ */
+_RS_RUNTIME float4 __attribute__((overloadable))
+rsMatrixMultiply(rs_matrix4x4 *m, float3 in);
+
+/**
+ * \overload
+ */
+_RS_RUNTIME float4 __attribute__((overloadable))
+rsMatrixMultiply(rs_matrix4x4 *m, float2 in);
+
+/**
+ * \overload
+ */
+_RS_RUNTIME float3 __attribute__((overloadable))
+rsMatrixMultiply(rs_matrix3x3 *m, float3 in);
+
+/**
+ * \overload
+ */
+_RS_RUNTIME float3 __attribute__((overloadable))
+rsMatrixMultiply(rs_matrix3x3 *m, float2 in);
+
+/**
+ * \overload
+ */
+_RS_RUNTIME float2 __attribute__((overloadable))
+rsMatrixMultiply(rs_matrix2x2 *m, float2 in);
+#else
+/**
+ * Multiply a vector by a matrix.
+ *
+ * Returns the post-multiplication of the vector by the matrix, i.e. <em>m * in</em>.
+ *
+ * When multiplying a \e float3 by a \e rs_matrix4x4, the vector is expanded with (1).
+ *
+ * When multiplying a \e float2 by a \e rs_matrix4x4, the vector is expanded with (0, 1).
+ *
+ * When multiplying a \e float2 by a \e rs_matrix3x3, the vector is expanded with (0).
+ *
+ * This function is available starting with API version 14.
+ */
+_RS_RUNTIME float4 __attribute__((overloadable))
+rsMatrixMultiply(const rs_matrix4x4 *m, float4 in);
+
+/**
+ * \overload
+ */
+_RS_RUNTIME float4 __attribute__((overloadable))
+rsMatrixMultiply(const rs_matrix4x4 *m, float3 in);
+
+/**
+ * \overload
+ */
+_RS_RUNTIME float4 __attribute__((overloadable))
+rsMatrixMultiply(const rs_matrix4x4 *m, float2 in);
+
+/**
+ * \overload
+ */
+_RS_RUNTIME float3 __attribute__((overloadable))
+rsMatrixMultiply(const rs_matrix3x3 *m, float3 in);
+
+/**
+ * \overload
+ */
+_RS_RUNTIME float3 __attribute__((overloadable))
+rsMatrixMultiply(const rs_matrix3x3 *m, float2 in);
+
+/**
+ * \overload
+ */
+_RS_RUNTIME float2 __attribute__((overloadable))
+rsMatrixMultiply(const rs_matrix2x2 *m, float2 in);
+#endif
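+
+/* Illustrative usage sketch (not part of the upstream header): transforming a
+ * point into world space; gModel is a hypothetical script global holding a
+ * model matrix.
+ *
+ *   float4 local = {1.f, 2.f, 3.f, 1.f};
+ *   float4 world = rsMatrixMultiply(&gModel, local);
+ */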
+
+
+/**
+ * Inverts a matrix in place.
+ *
+ * Returns true if the matrix was successfully inverted.
+ *
+ * @param m The matrix to invert.
+ */
+extern bool __attribute__((overloadable)) rsMatrixInverse(rs_matrix4x4 *m);
+
+/**
+ * Inverts and transposes a matrix in place.
+ *
+ * The matrix is first inverted then transposed.
+ * Returns true if the matrix was successfully inverted.
+ *
+ * @param m The matrix to modify.
+ */
+extern bool __attribute__((overloadable)) rsMatrixInverseTranspose(rs_matrix4x4 *m);
+
+/**
+ * Transpose the matrix m in place.
+ *
+ * @param m The matrix to transpose.
+ */
+extern void __attribute__((overloadable)) rsMatrixTranspose(rs_matrix4x4 *m);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable)) rsMatrixTranspose(rs_matrix3x3 *m);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable)) rsMatrixTranspose(rs_matrix2x2 *m);
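+
+/* Illustrative usage sketch (not part of the upstream header): deriving a
+ * normal matrix from a model matrix; gModel is a hypothetical script global.
+ *
+ *   rs_matrix4x4 normalMatrix;
+ *   rsMatrixLoad(&normalMatrix, &gModel);
+ *   if (!rsMatrixInverseTranspose(&normalMatrix)) {
+ *       rsMatrixLoadIdentity(&normalMatrix);   // fall back if not invertible
+ *   }
+ */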
+
+
+#endif
diff --git a/22.0.1/include/rs_mesh.rsh b/22.0.1/include/rs_mesh.rsh
new file mode 100644
index 0000000..0ecd786
--- /dev/null
+++ b/22.0.1/include/rs_mesh.rsh
@@ -0,0 +1,88 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/** @file rs_mesh.rsh
+ *  \brief Mesh routines
+ *
+ *
+ */
+
+#ifndef __RS_MESH_RSH__
+#define __RS_MESH_RSH__
+
+// New APIs
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+
+/**
+ * Returns the number of allocations in the mesh that contain
+ * vertex data
+ *
+ * @param m mesh to get data from
+ * @return number of allocations in the mesh that contain vertex
+ *         data
+ */
+extern uint32_t __attribute__((overloadable))
+    rsgMeshGetVertexAllocationCount(rs_mesh m);
+
+/**
+ * Meshes can have multiple index sets; this function returns
+ * the number.
+ *
+ * @param m mesh to get data from
+ * @return number of primitive groups in the mesh. This would
+ *         include simple primitives as well as allocations
+ *         containing index data
+ */
+extern uint32_t __attribute__((overloadable))
+    rsgMeshGetPrimitiveCount(rs_mesh m);
+
+/**
+ * Returns an allocation that is part of the mesh and contains
+ * vertex data, e.g. positions, normals, texcoords
+ *
+ * @param m mesh to get data from
+ * @param index index of the vertex allocation
+ * @return allocation containing vertex data
+ */
+extern rs_allocation __attribute__((overloadable))
+    rsgMeshGetVertexAllocation(rs_mesh m, uint32_t index);
+
+/**
+ * Returns an allocation containing index data or a null
+ * allocation if only the primitive is specified
+ *
+ * @param m mesh to get data from
+ * @param index index of the index allocation
+ * @return allocation containing index data
+ */
+extern rs_allocation __attribute__((overloadable))
+    rsgMeshGetIndexAllocation(rs_mesh m, uint32_t index);
+
+/**
+ * Returns the primitive describing how a part of the mesh is
+ * rendered
+ *
+ * @param m mesh to get data from
+ * @param index index of the primitive
+ * @return primitive describing how the mesh is rendered
+ */
+extern rs_primitive __attribute__((overloadable))
+    rsgMeshGetPrimitive(rs_mesh m, uint32_t index);
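+
+/* Illustrative usage sketch (not part of the upstream header): summing the
+ * vertices of a mesh by walking its vertex allocations.  Assumes a valid
+ * rs_mesh value named m; rsAllocationGetDimX comes from rs_allocation.rsh.
+ *
+ *   uint32_t total = 0;
+ *   uint32_t n = rsgMeshGetVertexAllocationCount(m);
+ *   for (uint32_t i = 0; i < n; i++) {
+ *       rs_allocation verts = rsgMeshGetVertexAllocation(m, i);
+ *       total += rsAllocationGetDimX(verts);
+ *   }
+ */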
+
+#endif // (defined(RS_VERSION) && (RS_VERSION >= 16))
+
+#endif // __RS_MESH_RSH__
+
diff --git a/22.0.1/include/rs_object.rsh b/22.0.1/include/rs_object.rsh
new file mode 100644
index 0000000..ed6423b
--- /dev/null
+++ b/22.0.1/include/rs_object.rsh
@@ -0,0 +1,229 @@
+/*
+ * Copyright (C) 2011 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/** @file rs_object.rsh
+ *  \brief Object routines
+ *
+ *
+ */
+
+#ifndef __RS_OBJECT_RSH__
+#define __RS_OBJECT_RSH__
+
+
+/**
+ * Copy reference to the specified object.
+ *
+ * @param dst
+ * @param src
+ */
+extern void __attribute__((overloadable))
+    rsSetObject(rs_element *dst, rs_element src);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable))
+    rsSetObject(rs_type *dst, rs_type src);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable))
+    rsSetObject(rs_allocation *dst, rs_allocation src);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable))
+    rsSetObject(rs_sampler *dst, rs_sampler src);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable))
+    rsSetObject(rs_script *dst, rs_script src);
+
+#ifndef __LP64__
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable))
+    rsSetObject(rs_path *dst, rs_path src);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable))
+    rsSetObject(rs_mesh *dst, rs_mesh src);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable))
+    rsSetObject(rs_program_fragment *dst, rs_program_fragment src);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable))
+    rsSetObject(rs_program_vertex *dst, rs_program_vertex src);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable))
+    rsSetObject(rs_program_raster *dst, rs_program_raster src);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable))
+    rsSetObject(rs_program_store *dst, rs_program_store src);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable))
+    rsSetObject(rs_font *dst, rs_font src);
+#endif //__LP64__
+
+/**
+ * Sets the object to NULL.
+ */
+extern void __attribute__((overloadable))
+    rsClearObject(rs_element *dst);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable))
+    rsClearObject(rs_type *dst);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable))
+    rsClearObject(rs_allocation *dst);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable))
+    rsClearObject(rs_sampler *dst);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable))
+    rsClearObject(rs_script *dst);
+
+
+#ifndef __LP64__
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable))
+    rsClearObject(rs_path *dst);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable))
+    rsClearObject(rs_mesh *dst);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable))
+    rsClearObject(rs_program_fragment *dst);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable))
+    rsClearObject(rs_program_vertex *dst);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable))
+    rsClearObject(rs_program_raster *dst);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable))
+    rsClearObject(rs_program_store *dst);
+/**
+ * \overload
+ */
+extern void __attribute__((overloadable))
+    rsClearObject(rs_font *dst);
+#endif //__LP64__
+
+
+/**
+ * Tests if the object is valid.  Returns true if the object is valid, false if
+ * it is NULL.
+ *
+ * @return bool
+ */
+extern bool __attribute__((overloadable))
+    rsIsObject(rs_element);
+/**
+ * \overload
+ */
+extern bool __attribute__((overloadable))
+    rsIsObject(rs_type);
+/**
+ * \overload
+ */
+extern bool __attribute__((overloadable))
+    rsIsObject(rs_allocation);
+/**
+ * \overload
+ */
+extern bool __attribute__((overloadable))
+    rsIsObject(rs_sampler);
+/**
+ * \overload
+ */
+extern bool __attribute__((overloadable))
+    rsIsObject(rs_script);
+
+#ifndef __LP64__
+/**
+ * \overload
+ */
+extern bool __attribute__((overloadable))
+    rsIsObject(rs_path);
+/**
+ * \overload
+ */
+extern bool __attribute__((overloadable))
+    rsIsObject(rs_mesh);
+/**
+ * \overload
+ */
+extern bool __attribute__((overloadable))
+    rsIsObject(rs_program_fragment);
+/**
+ * \overload
+ */
+extern bool __attribute__((overloadable))
+    rsIsObject(rs_program_vertex);
+/**
+ * \overload
+ */
+extern bool __attribute__((overloadable))
+    rsIsObject(rs_program_raster);
+/**
+ * \overload
+ */
+extern bool __attribute__((overloadable))
+    rsIsObject(rs_program_store);
+/**
+ * \overload
+ */
+extern bool __attribute__((overloadable))
+    rsIsObject(rs_font);
+#endif //__LP64__
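+
+/* Illustrative usage sketch (not part of the upstream header): keeping a
+ * reference to an allocation in a script global and releasing it later; the
+ * global and function names are hypothetical.
+ *
+ *   rs_allocation gLut;
+ *
+ *   void setLut(rs_allocation lut) {
+ *       rsSetObject(&gLut, lut);      // take a reference
+ *   }
+ *
+ *   void teardown(void) {
+ *       if (rsIsObject(gLut)) {
+ *           rsClearObject(&gLut);     // drop the reference
+ *       }
+ *   }
+ */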
+
+#endif
diff --git a/22.0.1/include/rs_program.rsh b/22.0.1/include/rs_program.rsh
new file mode 100644
index 0000000..299aae6
--- /dev/null
+++ b/22.0.1/include/rs_program.rsh
@@ -0,0 +1,118 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/** @file rs_program.rsh
+ *  \brief Program object routines
+ *
+ *
+ */
+
+#ifndef __RS_PROGRAM_RSH__
+#define __RS_PROGRAM_RSH__
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+
+/**
+ * Get program store depth function
+ *
+ * @param ps program store to query
+ */
+extern rs_depth_func __attribute__((overloadable))
+    rsgProgramStoreGetDepthFunc(rs_program_store ps);
+
+/**
+ * Get program store depth mask
+ *
+ * @param ps program store to query
+ */
+extern bool __attribute__((overloadable))
+    rsgProgramStoreIsDepthMaskEnabled(rs_program_store ps);
+/**
+ * Get program store red component color mask
+ *
+ * @param ps program store to query
+ */
+extern bool __attribute__((overloadable))
+    rsgProgramStoreIsColorMaskRedEnabled(rs_program_store ps);
+
+/**
+ * Get program store green component color mask
+ *
+ * @param ps program store to query
+ */
+extern bool __attribute__((overloadable))
+    rsgProgramStoreIsColorMaskGreenEnabled(rs_program_store ps);
+
+/**
+ * Get program store blue component color mask
+ *
+ * @param ps program store to query
+ */
+extern bool __attribute__((overloadable))
+    rsgProgramStoreIsColorMaskBlueEnabled(rs_program_store ps);
+
+/**
+ * Get program store alpha component color mask
+ *
+ * @param ps program store to query
+ */
+extern bool __attribute__((overloadable))
+    rsgProgramStoreIsColorMaskAlphaEnabled(rs_program_store ps);
+
+/**
+ * Get program store blend source function
+ *
+ * @param ps program store to query
+ */
+extern rs_blend_src_func __attribute__((overloadable))
+        rsgProgramStoreGetBlendSrcFunc(rs_program_store ps);
+
+/**
+ * Get program store blend destination function
+ *
+ * @param ps program store to query
+ */
+extern rs_blend_dst_func __attribute__((overloadable))
+    rsgProgramStoreGetBlendDstFunc(rs_program_store ps);
+
+/**
+ * Get program store dither state
+ *
+ * @param ps program store to query
+ */
+extern bool __attribute__((overloadable))
+    rsgProgramStoreIsDitherEnabled(rs_program_store ps);
+
+/**
+ * Get program raster point sprite state
+ *
+ * @param pr program raster to query
+ */
+extern bool __attribute__((overloadable))
+    rsgProgramRasterIsPointSpriteEnabled(rs_program_raster pr);
+
+/**
+ * Get program raster cull mode
+ *
+ * @param pr program raster to query
+ */
+extern rs_cull_mode __attribute__((overloadable))
+    rsgProgramRasterGetCullMode(rs_program_raster pr);
+
+#endif // (defined(RS_VERSION) && (RS_VERSION >= 16))
+
+#endif // __RS_PROGRAM_RSH__
+
diff --git a/22.0.1/include/rs_quaternion.rsh b/22.0.1/include/rs_quaternion.rsh
new file mode 100644
index 0000000..4e08d2f
--- /dev/null
+++ b/22.0.1/include/rs_quaternion.rsh
@@ -0,0 +1,253 @@
+/*
+ * Copyright (C) 2011 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/** @file rs_quaternion.rsh
+ *  \brief Quaternion routines
+ *
+ *
+ */
+
+#ifndef __RS_QUATERNION_RSH__
+#define __RS_QUATERNION_RSH__
+
+
+/**
+ * Set the quaternion components
+ * @param w component
+ * @param x component
+ * @param y component
+ * @param z component
+ */
+static void __attribute__((overloadable))
+rsQuaternionSet(rs_quaternion *q, float w, float x, float y, float z) {
+    q->w = w;
+    q->x = x;
+    q->y = y;
+    q->z = z;
+}
+
+/**
+ * Set the quaternion from another quaternion
+ * @param q destination quaternion
+ * @param rhs source quaternion
+ */
+static void __attribute__((overloadable))
+rsQuaternionSet(rs_quaternion *q, const rs_quaternion *rhs) {
+    q->w = rhs->w;
+    q->x = rhs->x;
+    q->y = rhs->y;
+    q->z = rhs->z;
+}
+
+/**
+ * Multiply quaternion by a scalar
+ * @param q quaternion to multiply
+ * @param s scalar
+ */
+static void __attribute__((overloadable))
+rsQuaternionMultiply(rs_quaternion *q, float s) {
+    q->w *= s;
+    q->x *= s;
+    q->y *= s;
+    q->z *= s;
+}
+
+/**
+ * Add two quaternions
+ * @param q destination quaternion to add to
+ * @param rhs right hand side quaternion to add
+ */
+static void
+rsQuaternionAdd(rs_quaternion *q, const rs_quaternion *rhs) {
+    q->w += rhs->w;
+    q->x += rhs->x;
+    q->y += rhs->y;
+    q->z += rhs->z;
+}
+
+/**
+ * Loads a quaternion that represents a rotation about an arbitrary unit vector
+ * @param q quaternion to set
+ * @param rot angle to rotate by
+ * @param x component of a vector
+ * @param y component of a vector
+ * @param x component of a vector
+ */
+static void
+rsQuaternionLoadRotateUnit(rs_quaternion *q, float rot, float x, float y, float z) {
+    rot *= (float)(M_PI / 180.0f) * 0.5f;
+    float c = cos(rot);
+    float s = sin(rot);
+
+    q->w = c;
+    q->x = x * s;
+    q->y = y * s;
+    q->z = z * s;
+}
+
+/**
+ * Loads a quaternion that represents a rotation about an arbitrary vector
+ * (doesn't have to be unit)
+ * @param q quaternion to set
+ * @param rot angle to rotate by
+ * @param x component of a vector
+ * @param y component of a vector
+ * @param z component of a vector
+ */
+static void
+rsQuaternionLoadRotate(rs_quaternion *q, float rot, float x, float y, float z) {
+    const float len = x*x + y*y + z*z;
+    if (len != 1) {
+        const float recipLen = 1.f / sqrt(len);
+        x *= recipLen;
+        y *= recipLen;
+        z *= recipLen;
+    }
+    rsQuaternionLoadRotateUnit(q, rot, x, y, z);
+}
+
+/**
+ * Conjugates the quaternion
+ * @param q quaternion to conjugate
+ */
+static void
+rsQuaternionConjugate(rs_quaternion *q) {
+    q->x = -q->x;
+    q->y = -q->y;
+    q->z = -q->z;
+}
+
+/**
+ * Dot product of two quaternions
+ * @param q0 first quaternion
+ * @param q1 second quaternion
+ * @return dot product between q0 and q1
+ */
+static float
+rsQuaternionDot(const rs_quaternion *q0, const rs_quaternion *q1) {
+    return q0->w*q1->w + q0->x*q1->x + q0->y*q1->y + q0->z*q1->z;
+}
+
+/**
+ * Normalizes the quaternion
+ * @param q quaternion to normalize
+ */
+static void
+rsQuaternionNormalize(rs_quaternion *q) {
+    const float len = rsQuaternionDot(q, q);
+    if (len != 1) {
+        const float recipLen = 1.f / sqrt(len);
+        rsQuaternionMultiply(q, recipLen);
+    }
+}
+
+/**
+ * Multiply quaternion by another quaternion
+ * @param q destination quaternion
+ * @param rhs right hand side quaternion to multiply by
+ */
+static void __attribute__((overloadable))
+rsQuaternionMultiply(rs_quaternion *q, const rs_quaternion *rhs) {
+    rs_quaternion qtmp;
+    rsQuaternionSet(&qtmp, q);
+
+    q->w = qtmp.w*rhs->w - qtmp.x*rhs->x - qtmp.y*rhs->y - qtmp.z*rhs->z;
+    q->x = qtmp.w*rhs->x + qtmp.x*rhs->w + qtmp.y*rhs->z - qtmp.z*rhs->y;
+    q->y = qtmp.w*rhs->y + qtmp.y*rhs->w + qtmp.z*rhs->x - qtmp.x*rhs->z;
+    q->z = qtmp.w*rhs->z + qtmp.z*rhs->w + qtmp.x*rhs->y - qtmp.y*rhs->x;
+    rsQuaternionNormalize(q);
+}
+
+/**
+ * Performs spherical linear interpolation between two quaternions
+ * @param q result quaternion from interpolation
+ * @param q0 first param
+ * @param q1 second param
+ * @param t how much to interpolate by
+ */
+static void
+rsQuaternionSlerp(rs_quaternion *q, const rs_quaternion *q0, const rs_quaternion *q1, float t) {
+    if (t <= 0.0f) {
+        rsQuaternionSet(q, q0);
+        return;
+    }
+    if (t >= 1.0f) {
+        rsQuaternionSet(q, q1);
+        return;
+    }
+
+    rs_quaternion tempq0, tempq1;
+    rsQuaternionSet(&tempq0, q0);
+    rsQuaternionSet(&tempq1, q1);
+
+    float angle = rsQuaternionDot(q0, q1);
+    if (angle < 0) {
+        rsQuaternionMultiply(&tempq0, -1.0f);
+        angle *= -1.0f;
+    }
+
+    float scale, invScale;
+    if (angle + 1.0f > 0.05f) {
+        if (1.0f - angle >= 0.05f) {
+            float theta = acos(angle);
+            float invSinTheta = 1.0f / sin(theta);
+            scale = sin(theta * (1.0f - t)) * invSinTheta;
+            invScale = sin(theta * t) * invSinTheta;
+        } else {
+            scale = 1.0f - t;
+            invScale = t;
+        }
+    } else {
+        rsQuaternionSet(&tempq1, tempq0.z, -tempq0.y, tempq0.x, -tempq0.w);
+        scale = sin(M_PI * (0.5f - t));
+        invScale = sin(M_PI * t);
+    }
+
+    rsQuaternionSet(q, tempq0.w*scale + tempq1.w*invScale, tempq0.x*scale + tempq1.x*invScale,
+                        tempq0.y*scale + tempq1.y*invScale, tempq0.z*scale + tempq1.z*invScale);
+}
+
+/**
+ * Computes a rotation matrix from a normalized quaternion
+ * @param m resulting matrix
+ * @param q normalized quaternion
+ */
+static void rsQuaternionGetMatrixUnit(rs_matrix4x4 *m, const rs_quaternion *q) {
+    float xx = q->x * q->x;
+    float xy = q->x * q->y;
+    float xz = q->x * q->z;
+    float xw = q->x * q->w;
+    float yy = q->y * q->y;
+    float yz = q->y * q->z;
+    float yw = q->y * q->w;
+    float zz = q->z * q->z;
+    float zw = q->z * q->w;
+
+    m->m[0]  = 1.0f - 2.0f * ( yy + zz );
+    m->m[4]  =        2.0f * ( xy - zw );
+    m->m[8]  =        2.0f * ( xz + yw );
+    m->m[1]  =        2.0f * ( xy + zw );
+    m->m[5]  = 1.0f - 2.0f * ( xx + zz );
+    m->m[9]  =        2.0f * ( yz - xw );
+    m->m[2]  =        2.0f * ( xz - yw );
+    m->m[6]  =        2.0f * ( yz + xw );
+    m->m[10] = 1.0f - 2.0f * ( xx + yy );
+    m->m[3]  = m->m[7] = m->m[11] = m->m[12] = m->m[13] = m->m[14] = 0.0f;
+    m->m[15] = 1.0f;
+}
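+
+/* Illustrative usage sketch (not part of the upstream header): interpolating
+ * a quarter of the way between two orientations and converting the result to
+ * a rotation matrix; the variable names are hypothetical.
+ *
+ *   rs_quaternion from, to, blended;
+ *   rsQuaternionLoadRotate(&from, 0.f, 0.f, 1.f, 0.f);
+ *   rsQuaternionLoadRotate(&to, 90.f, 0.f, 1.f, 0.f);
+ *   rsQuaternionSlerp(&blended, &from, &to, 0.25f);
+ *
+ *   rs_matrix4x4 rot;
+ *   rsQuaternionGetMatrixUnit(&rot, &blended);
+ */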
+
+#endif
+
diff --git a/22.0.1/include/rs_sampler.rsh b/22.0.1/include/rs_sampler.rsh
new file mode 100644
index 0000000..2ff426c
--- /dev/null
+++ b/22.0.1/include/rs_sampler.rsh
@@ -0,0 +1,77 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/** @file rs_sampler.rsh
+ *  \brief Sampler routines
+ *
+ *
+ */
+
+#ifndef __RS_SAMPLER_RSH__
+#define __RS_SAMPLER_RSH__
+
+// New APIs
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+
+/**
+ * Get sampler minification value
+ *
+ * @param s sampler to query
+ * @return minification value
+ */
+extern rs_sampler_value __attribute__((overloadable))
+    rsSamplerGetMinification(rs_sampler s);
+
+/**
+ * Get sampler magnification value
+ *
+ * @param s sampler to query
+ * @return magnification value
+ */
+extern rs_sampler_value __attribute__((overloadable))
+    rsSamplerGetMagnification(rs_sampler s);
+
+/**
+ * Get sampler wrap S value
+ *
+ * @param s sampler to query
+ * @return wrap S value
+ */
+extern rs_sampler_value __attribute__((overloadable))
+    rsSamplerGetWrapS(rs_sampler s);
+
+/**
+ * Get sampler wrap T value
+ *
+ * @param s sampler to query
+ * @return wrap T value
+ */
+extern rs_sampler_value __attribute__((overloadable))
+    rsSamplerGetWrapT(rs_sampler s);
+
+/**
+ * Get sampler anisotropy
+ *
+ * @param s sampler to query
+ * @return anisotropy
+ */
+extern float __attribute__((overloadable))
+    rsSamplerGetAnisotropy(rs_sampler s);
+
+#endif // (defined(RS_VERSION) && (RS_VERSION >= 16))
+
+#endif // __RS_SAMPLER_RSH__
+
diff --git a/22.0.1/include/rs_time.rsh b/22.0.1/include/rs_time.rsh
new file mode 100644
index 0000000..abcb88b
--- /dev/null
+++ b/22.0.1/include/rs_time.rsh
@@ -0,0 +1,115 @@
+/*
+ * Copyright (C) 2011 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/** @file rs_time.rsh
+ *  \brief RenderScript time routines
+ *
+ *  This file contains RenderScript functions relating to time and date
+ *  manipulation.
+ */
+
+#ifndef __RS_TIME_RSH__
+#define __RS_TIME_RSH__
+
+/**
+ * Calendar time interpreted as seconds elapsed since the Epoch (00:00:00 on
+ * January 1, 1970, Coordinated Universal Time (UTC)).
+ */
+#ifndef __LP64__
+typedef int rs_time_t;
+#else
+typedef long rs_time_t;
+#endif
+
+/**
+ * Data structure for broken-down time components.
+ *
+ * tm_sec   - Seconds after the minute. This ranges from 0 to 59, but possibly
+ *            up to 60 for leap seconds.
+ * tm_min   - Minutes after the hour. This ranges from 0 to 59.
+ * tm_hour  - Hours past midnight. This ranges from 0 to 23.
+ * tm_mday  - Day of the month. This ranges from 1 to 31.
+ * tm_mon   - Months since January. This ranges from 0 to 11.
+ * tm_year  - Years since 1900.
+ * tm_wday  - Days since Sunday. This ranges from 0 to 6.
+ * tm_yday  - Days since January 1. This ranges from 0 to 365.
+ * tm_isdst - Flag to indicate whether daylight saving time is in effect. The
+ *            value is positive if it is in effect, zero if it is not, and
+ *            negative if the information is not available.
+ */
+typedef struct {
+    int tm_sec;     ///< seconds
+    int tm_min;     ///< minutes
+    int tm_hour;    ///< hours
+    int tm_mday;    ///< day of the month
+    int tm_mon;     ///< month
+    int tm_year;    ///< year
+    int tm_wday;    ///< day of the week
+    int tm_yday;    ///< day of the year
+    int tm_isdst;   ///< daylight savings time
+} rs_tm;
+
+/**
+ * Returns the number of seconds since the Epoch (00:00:00 UTC, January 1,
+ * 1970). If @p timer is non-NULL, the result is also stored in the memory
+ * pointed to by this variable. If an error occurs, a value of -1 is returned.
+ *
+ * @param timer Location to also store the returned calendar time.
+ *
+ * @return Seconds since the Epoch.
+ */
+extern rs_time_t __attribute__((overloadable))
+    rsTime(rs_time_t *timer);
+
+/**
+ * Converts the time specified by @p timer into broken-down time and stores it
+ * in @p local. This function also returns a pointer to @p local. If @p local
+ * is NULL, this function does nothing and returns NULL.
+ *
+ * @param local Broken-down time.
+ * @param timer Input time as calendar time.
+ *
+ * @return Pointer to broken-down time (same as input @p local).
+ */
+extern rs_tm * __attribute__((overloadable))
+    rsLocaltime(rs_tm *local, const rs_time_t *timer);
+
+/**
+ * Returns the current system clock (uptime) in milliseconds.
+ *
+ * @return Uptime in milliseconds.
+ */
+extern int64_t __attribute__((overloadable))
+    rsUptimeMillis(void);
+
+/**
+ * Returns the current system clock (uptime) in nanoseconds.
+ *
+ * @return Uptime in nanoseconds.
+ */
+extern int64_t __attribute__((overloadable))
+    rsUptimeNanos(void);
+
+/**
+ * Returns the time in seconds since this function was last called in this
+ * script.
+ *
+ * @return Time in seconds.
+ */
+extern float __attribute__((overloadable))
+    rsGetDt(void);
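+
+/* Illustrative usage sketch (not part of the upstream header): advancing an
+ * animation by the time since the previous frame and reading the local hour;
+ * gAngle and gSpeed are hypothetical script globals.
+ *
+ *   gAngle += gSpeed * rsGetDt();
+ *
+ *   rs_time_t now = 0;
+ *   rsTime(&now);
+ *   rs_tm local;
+ *   rsLocaltime(&local, &now);
+ *   rsDebug("hour", local.tm_hour);
+ */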
+
+#endif
diff --git a/22.0.1/include/rs_types.rsh b/22.0.1/include/rs_types.rsh
new file mode 100644
index 0000000..f1fc60b
--- /dev/null
+++ b/22.0.1/include/rs_types.rsh
@@ -0,0 +1,651 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/** @file rs_types.rsh
+ *
+ *  Define the standard RenderScript types
+ *
+ *  Integers
+ *  8 bit: char, int8_t
+ *  16 bit: short, int16_t
+ *  32 bit: int, int32_t
+ *  64 bit: long, long long, int64_t
+ *
+ *  Unsigned Integers
+ *  8 bit: uchar, uint8_t
+ *  16 bit: ushort, uint16_t
+ *  32 bit: uint, uint32_t
+ *  64 bit: ulong, uint64_t
+ *
+ *  Floating point
+ *  32 bit: float
+ *  64 bit: double
+ *
+ *  Vectors of length 2, 3, and 4 are supported for all the types above.
+ *
+ */
+
+#ifndef __RS_TYPES_RSH__
+#define __RS_TYPES_RSH__
+
+/* Constants */
+#define M_E         2.718281828459045235360287471352662498f     /* e */
+#define M_LOG2E     1.442695040888963407359924681001892137f     /* log_2 e */
+#define M_LOG10E    0.434294481903251827651128918916605082f     /* log_10 e */
+#define M_LN2       0.693147180559945309417232121458176568f     /* log_e 2 */
+#define M_LN10      2.302585092994045684017991454684364208f     /* log_e 10 */
+#define M_PI        3.141592653589793238462643383279502884f     /* pi */
+#define M_PI_2      1.570796326794896619231321691639751442f     /* pi/2 */
+#define M_PI_4      0.785398163397448309615660845819875721f     /* pi/4 */
+#define M_1_PI      0.318309886183790671537767526745028724f     /* 1/pi */
+#define M_2_PIl     0.636619772367581343075535053490057448f     /* 2/pi */
+#define M_2_SQRTPI  1.128379167095512573896158903121545172f     /* 2/sqrt(pi) */
+#define M_SQRT2     1.414213562373095048801688724209698079f     /* sqrt(2) */
+#define M_SQRT1_2   0.707106781186547524400844362104849039f     /* 1/sqrt(2) */
+
+#include "stdbool.h"
+/**
+ * 8 bit integer type
+ */
+typedef char int8_t;
+/**
+ * 16 bit integer type
+ */
+typedef short int16_t;
+/**
+ * 32 bit integer type
+ */
+typedef int int32_t;
+/**
+ * 64 bit integer type
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+    typedef long int64_t;
+#else
+    typedef long long int64_t;
+#endif
+/**
+ * 8 bit unsigned integer type
+ */
+typedef unsigned char uint8_t;
+/**
+ * 16 bit unsigned integer type
+ */
+typedef unsigned short uint16_t;
+/**
+ * 32 bit unsigned integer type
+ */
+typedef unsigned int uint32_t;
+/**
+ * 64 bit unsigned integer type
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+    typedef unsigned long uint64_t;
+#else
+    typedef unsigned long long uint64_t;
+#endif
+/**
+ * 8 bit unsigned integer type
+ */
+typedef uint8_t uchar;
+/**
+ * 16 bit unsigned integer type
+ */
+typedef uint16_t ushort;
+/**
+ * 32 bit unsigned integer type
+ */
+typedef uint32_t uint;
+/**
+ * Typedef for unsigned long (use for 64-bit unsigned integers)
+ */
+typedef uint64_t ulong;
+/**
+ * Typedef for size_t
+ */
+#ifndef __LP64__
+typedef uint32_t size_t;
+typedef int32_t ssize_t;
+#else
+typedef uint64_t size_t;
+typedef int64_t ssize_t;
+#endif
+
+#ifndef __LP64__
+#define RS_BASE_OBJ typedef struct { const int* const p; } __attribute__((packed, aligned(4)))
+#else
+#define RS_BASE_OBJ typedef struct { const long* const p; const long* const r; const long* const v1; const long* const v2; }
+#endif
+
+/**
+ * \brief Opaque handle to a RenderScript element.
+ *
+ * See: android.renderscript.Element
+ */
+RS_BASE_OBJ rs_element;
+/**
+ * \brief Opaque handle to a RenderScript type.
+ *
+ * See: android.renderscript.Type
+ */
+RS_BASE_OBJ rs_type;
+/**
+ * \brief Opaque handle to a RenderScript allocation.
+ *
+ * See: android.renderscript.Allocation
+ */
+RS_BASE_OBJ rs_allocation;
+/**
+ * \brief Opaque handle to a RenderScript sampler object.
+ *
+ * See: android.renderscript.Sampler
+ */
+RS_BASE_OBJ rs_sampler;
+/**
+ * \brief Opaque handle to a RenderScript script object.
+ *
+ * See: android.renderscript.ScriptC
+ */
+RS_BASE_OBJ rs_script;
+
+#ifndef __LP64__
+/**
+ * \brief Opaque handle to a RenderScript mesh object.
+ *
+ * See: android.renderscript.Mesh
+ */
+typedef struct { const int* const p; } __attribute__((packed, aligned(4))) rs_mesh;
+/**
+ * \brief Opaque handle to a RenderScript Path object.
+ *
+ * See: android.renderscript.Path
+ */
+typedef struct { const int* const p; } __attribute__((packed, aligned(4))) rs_path;
+/**
+ * \brief Opaque handle to a RenderScript ProgramFragment object.
+ *
+ * See: android.renderscript.ProgramFragment
+ */
+typedef struct { const int* const p; } __attribute__((packed, aligned(4))) rs_program_fragment;
+/**
+ * \brief Opaque handle to a RenderScript ProgramVertex object.
+ *
+ * See: android.renderscript.ProgramVertex
+ */
+typedef struct { const int* const p; } __attribute__((packed, aligned(4))) rs_program_vertex;
+/**
+ * \brief Opaque handle to a RenderScript ProgramRaster object.
+ *
+ * See: android.renderscript.ProgramRaster
+ */
+typedef struct { const int* const p; } __attribute__((packed, aligned(4))) rs_program_raster;
+/**
+ * \brief Opaque handle to a RenderScript ProgramStore object.
+ *
+ * See: android.renderscript.ProgramStore
+ */
+typedef struct { const int* const p; } __attribute__((packed, aligned(4))) rs_program_store;
+/**
+ * \brief Opaque handle to a RenderScript font object.
+ *
+ * See: android.renderscript.Font
+ */
+typedef struct { const int* const p; } __attribute__((packed, aligned(4))) rs_font;
+#endif // __LP64__
+
+/**
+ * Vector version of the basic float type.
+ * Provides two float fields packed into a single 64 bit field with 64 bit
+ * alignment.
+ */
+typedef float float2 __attribute__((ext_vector_type(2)));
+/**
+ * Vector version of the basic float type. Provides three float fields packed
+ * into a single 128 bit field with 128 bit alignment.
+ */
+typedef float float3 __attribute__((ext_vector_type(3)));
+/**
+ * Vector version of the basic float type.
+ * Provides four float fields packed into a single 128 bit field with 128 bit
+ * alignment.
+ */
+typedef float float4 __attribute__((ext_vector_type(4)));
+
+/**
+ * Vector version of the basic double type. Provides two double fields packed
+ * into a single 128 bit field with 128 bit alignment.
+ */
+typedef double double2 __attribute__((ext_vector_type(2)));
+/**
+ * Vector version of the basic double type. Provides three double fields packed
+ * into a single 256 bit field with 256 bit alignment.
+ */
+typedef double double3 __attribute__((ext_vector_type(3)));
+/**
+ * Vector version of the basic double type. Provides four double fields packed
+ * into a single 256 bit field with 256 bit alignment.
+ */
+typedef double double4 __attribute__((ext_vector_type(4)));
+
+/**
+ * Vector version of the basic uchar type. Provides two uchar fields packed
+ * into a single 16 bit field with 16 bit alignment.
+ */
+typedef uchar uchar2 __attribute__((ext_vector_type(2)));
+/**
+ * Vector version of the basic uchar type. Provides three uchar fields packed
+ * into a single 32 bit field with 32 bit alignment.
+ */
+typedef uchar uchar3 __attribute__((ext_vector_type(3)));
+/**
+ * Vector version of the basic uchar type. Provides four uchar fields packed
+ * into a single 32 bit field with 32 bit alignment.
+ */
+typedef uchar uchar4 __attribute__((ext_vector_type(4)));
+
+/**
+ * Vector version of the basic ushort type. Provides two ushort fields packed
+ * into a single 32 bit field with 32 bit alignment.
+ */
+typedef ushort ushort2 __attribute__((ext_vector_type(2)));
+/**
+ * Vector version of the basic ushort type. Provides three ushort fields packed
+ * into a single 64 bit field with 64 bit alignment.
+ */
+typedef ushort ushort3 __attribute__((ext_vector_type(3)));
+/**
+ * Vector version of the basic ushort type. Provides four ushort fields packed
+ * into a single 64 bit field with 64 bit alignment.
+ */
+typedef ushort ushort4 __attribute__((ext_vector_type(4)));
+
+/**
+ * Vector version of the basic uint type. Provides two uint fields packed into a
+ * single 64 bit field with 64 bit alignment.
+ */
+typedef uint uint2 __attribute__((ext_vector_type(2)));
+/**
+ * Vector version of the basic uint type. Provides three uint fields packed into
+ * a single 128 bit field with 128 bit alignment.
+ */
+typedef uint uint3 __attribute__((ext_vector_type(3)));
+/**
+ * Vector version of the basic uint type. Provides four uint fields packed into
+ * a single 128 bit field with 128 bit alignment.
+ */
+typedef uint uint4 __attribute__((ext_vector_type(4)));
+
+/**
+ * Vector version of the basic ulong type. Provides two ulong fields packed into
+ * a single 128 bit field with 128 bit alignment.
+ */
+typedef ulong ulong2 __attribute__((ext_vector_type(2)));
+/**
+ * Vector version of the basic ulong type. Provides three ulong fields packed
+ * into a single 256 bit field with 256 bit alignment.
+ */
+typedef ulong ulong3 __attribute__((ext_vector_type(3)));
+/**
+ * Vector version of the basic ulong type. Provides four ulong fields packed
+ * into a single 256 bit field with 256 bit alignment.
+ */
+typedef ulong ulong4 __attribute__((ext_vector_type(4)));
+
+/**
+ * Vector version of the basic char type. Provides two char fields packed into a
+ * single 16 bit field with 16 bit alignment.
+ */
+typedef char char2 __attribute__((ext_vector_type(2)));
+/**
+ * Vector version of the basic char type. Provides three char fields packed into
+ * a single 32 bit field with 32 bit alignment.
+ */
+typedef char char3 __attribute__((ext_vector_type(3)));
+/**
+ * Vector version of the basic char type. Provides four char fields packed into
+ * a single 32 bit field with 32 bit alignment.
+ */
+typedef char char4 __attribute__((ext_vector_type(4)));
+
+/**
+ * Vector version of the basic short type. Provides two short fields packed into
+ * a single 32 bit field with 32 bit alignment.
+ */
+typedef short short2 __attribute__((ext_vector_type(2)));
+/**
+ * Vector version of the basic short type. Provides three short fields packed
+ * into a single 64 bit field with 64 bit alignment.
+ */
+typedef short short3 __attribute__((ext_vector_type(3)));
+/**
+ * Vector version of the basic short type. Provides four short fields packed
+ * into a single 64 bit field with 64 bit alignment.
+ */
+typedef short short4 __attribute__((ext_vector_type(4)));
+
+/**
+ * Vector version of the basic int type. Provides two int fields packed into a
+ * single 64 bit field with 64 bit alignment.
+ */
+typedef int int2 __attribute__((ext_vector_type(2)));
+/**
+ * Vector version of the basic int type. Provides three int fields packed into a
+ * single 128 bit field with 128 bit alignment.
+ */
+typedef int int3 __attribute__((ext_vector_type(3)));
+/**
+ * Vector version of the basic int type. Provides four int fields packed into a
+ * single 128 bit field with 128 bit alignment.
+ */
+typedef int int4 __attribute__((ext_vector_type(4)));
+
+/**
+ * Vector version of the basic long type. Provides two long fields packed into a
+ * single 128 bit field with 128 bit alignment.
+ */
+typedef long long2 __attribute__((ext_vector_type(2)));
+/**
+ * Vector version of the basic long type. Provides three long fields packed into
+ * a single 256 bit field with 256 bit alignment.
+ */
+typedef long long3 __attribute__((ext_vector_type(3)));
+/**
+ * Vector version of the basic long type. Provides four long fields packed into
+ * a single 256 bit field with 256 bit alignment.
+ */
+typedef long long4 __attribute__((ext_vector_type(4)));
+
+/**
+ * \brief 4x4 float matrix
+ *
+ * Native holder for RS matrix.  Elements are stored in the array at the
+ * location [row*4 + col]
+ */
+typedef struct {
+    float m[16];
+} rs_matrix4x4;
+/**
+ * \brief 3x3 float matrix
+ *
+ * Native holder for RS matrix.  Elements are stored in the array at the
+ * location [row*3 + col]
+ */
+typedef struct {
+    float m[9];
+} rs_matrix3x3;
+/**
+ * \brief 2x2 float matrix
+ *
+ * Native holder for RS matrix.  Elements are stored in the array at the
+ * location [row*2 + col]
+ */
+typedef struct {
+    float m[4];
+} rs_matrix2x2;
+
+/**
+ * quaternion type for use with the quaternion functions
+ */
+typedef float4 rs_quaternion;
+
+#define RS_PACKED __attribute__((packed, aligned(4)))
+#define NULL ((void *)0)
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+
+/**
+ * \brief Enum for selecting cube map faces
+ */
+typedef enum {
+    RS_ALLOCATION_CUBEMAP_FACE_POSITIVE_X = 0,
+    RS_ALLOCATION_CUBEMAP_FACE_NEGATIVE_X = 1,
+    RS_ALLOCATION_CUBEMAP_FACE_POSITIVE_Y = 2,
+    RS_ALLOCATION_CUBEMAP_FACE_NEGATIVE_Y = 3,
+    RS_ALLOCATION_CUBEMAP_FACE_POSITIVE_Z = 4,
+    RS_ALLOCATION_CUBEMAP_FACE_NEGATIVE_Z = 5
+} rs_allocation_cubemap_face;
+
+/**
+ * \brief Bitfield to specify the usage types for an allocation.
+ *
+ * These values are ORed together to specify which usages or memory spaces are
+ * relevant to an allocation or an operation on an allocation.
+ */
+typedef enum {
+    RS_ALLOCATION_USAGE_SCRIPT = 0x0001,
+    RS_ALLOCATION_USAGE_GRAPHICS_TEXTURE = 0x0002,
+    RS_ALLOCATION_USAGE_GRAPHICS_VERTEX = 0x0004,
+    RS_ALLOCATION_USAGE_GRAPHICS_CONSTANTS = 0x0008,
+    RS_ALLOCATION_USAGE_GRAPHICS_RENDER_TARGET = 0x0010
+} rs_allocation_usage_type;
+
+#endif //defined(RS_VERSION) && (RS_VERSION >= 14)
+
+// New APIs
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+
+#ifndef __LP64__
+/**
+ * Describes the way mesh vertex data is interpreted when rendering.
+ */
+typedef enum {
+    /**
+    * Vertex data will be rendered as a series of points
+    */
+    RS_PRIMITIVE_POINT              = 0,
+    /**
+    * Vertex pairs will be rendered as lines
+    */
+    RS_PRIMITIVE_LINE               = 1,
+    /**
+    * Vertex data will be rendered as a connected line strip
+    */
+    RS_PRIMITIVE_LINE_STRIP         = 2,
+    /**
+    * Vertices will be rendered as individual triangles
+    */
+    RS_PRIMITIVE_TRIANGLE           = 3,
+    /**
+    * Vertices will be rendered as a connected triangle strip
+    * defined by the first three vertices with each additional
+    * triangle defined by a new vertex
+    */
+    RS_PRIMITIVE_TRIANGLE_STRIP     = 4,
+    /**
+    * Vertices will be rendered as a sequence of triangles that all
+    * share the first vertex as the origin
+    */
+    RS_PRIMITIVE_TRIANGLE_FAN       = 5,
+
+    /**
+    * Invalid primitive
+    */
+    RS_PRIMITIVE_INVALID            = 100,
+} rs_primitive;
+#endif // __LP64__
+
+/**
+ * \brief Enumeration for possible element data types
+ *
+ * DataType represents the basic type information for a basic element.  The
+ * naming convention is as follows: for numeric types it is FLOAT,
+ * SIGNED, or UNSIGNED followed by _BITS, where BITS is the
+ * size of the data.  BOOLEAN is a true / false (1, 0) value
+ * represented in an 8 bit container.  The UNSIGNED variants
+ * with multiple bit definitions are for packed graphical data
+ * formats and represent vectors with per-member sizes
+ * which are treated as a single unit for packing and alignment
+ * purposes.
+ *
+ * MATRIX: the three matrix types contain FLOAT_32 elements and are treated
+ * as 32 bits for alignment purposes.
+ *
+ * RS_* types are 32 bit opaque object handles.
+ */
+typedef enum {
+    RS_TYPE_NONE             = 0,
+    RS_TYPE_FLOAT_32         = 2,
+    RS_TYPE_FLOAT_64         = 3,
+    RS_TYPE_SIGNED_8         = 4,
+    RS_TYPE_SIGNED_16        = 5,
+    RS_TYPE_SIGNED_32        = 6,
+    RS_TYPE_SIGNED_64        = 7,
+    RS_TYPE_UNSIGNED_8       = 8,
+    RS_TYPE_UNSIGNED_16      = 9,
+    RS_TYPE_UNSIGNED_32      = 10,
+    RS_TYPE_UNSIGNED_64      = 11,
+
+    RS_TYPE_BOOLEAN          = 12,
+
+    RS_TYPE_UNSIGNED_5_6_5   = 13,
+    RS_TYPE_UNSIGNED_5_5_5_1 = 14,
+    RS_TYPE_UNSIGNED_4_4_4_4 = 15,
+
+    RS_TYPE_MATRIX_4X4       = 16,
+    RS_TYPE_MATRIX_3X3       = 17,
+    RS_TYPE_MATRIX_2X2       = 18,
+
+    RS_TYPE_ELEMENT          = 1000,
+    RS_TYPE_TYPE             = 1001,
+    RS_TYPE_ALLOCATION       = 1002,
+    RS_TYPE_SAMPLER          = 1003,
+    RS_TYPE_SCRIPT           = 1004,
+    RS_TYPE_MESH             = 1005,
+    RS_TYPE_PROGRAM_FRAGMENT = 1006,
+    RS_TYPE_PROGRAM_VERTEX   = 1007,
+    RS_TYPE_PROGRAM_RASTER   = 1008,
+    RS_TYPE_PROGRAM_STORE    = 1009,
+    RS_TYPE_FONT             = 1010,
+
+    RS_TYPE_INVALID          = 10000,
+} rs_data_type;
+
+/**
+ * \brief Enumeration for possible element data kind
+ *
+ * Specifies a special interpretation of the data, if required.  This is
+ * primarily useful for graphical data.  USER indicates no special
+ * interpretation is expected.  PIXEL is used in conjunction with the standard
+ * data types for representing texture formats.
+ */
+typedef enum {
+    RS_KIND_USER         = 0,
+
+    RS_KIND_PIXEL_L      = 7,
+    RS_KIND_PIXEL_A      = 8,
+    RS_KIND_PIXEL_LA     = 9,
+    RS_KIND_PIXEL_RGB    = 10,
+    RS_KIND_PIXEL_RGBA   = 11,
+    RS_KIND_PIXEL_DEPTH  = 12,
+    RS_KIND_PIXEL_YUV    = 13,
+
+    RS_KIND_INVALID      = 100,
+} rs_data_kind;
+
+#ifndef __LP64__
+typedef enum {
+    /**
+    * Always drawn
+    */
+    RS_DEPTH_FUNC_ALWAYS        = 0,
+    /**
+    * Drawn if the incoming depth value is less than that in the
+    * depth buffer
+    */
+    RS_DEPTH_FUNC_LESS          = 1,
+    /**
+    * Drawn if the incoming depth value is less or equal to that in
+    * the depth buffer
+    */
+    RS_DEPTH_FUNC_LEQUAL        = 2,
+    /**
+    * Drawn if the incoming depth value is greater than that in the
+    * depth buffer
+    */
+    RS_DEPTH_FUNC_GREATER       = 3,
+    /**
+    * Drawn if the incoming depth value is greater or equal to that
+    * in the depth buffer
+    */
+    RS_DEPTH_FUNC_GEQUAL        = 4,
+    /**
+    * Drawn if the incoming depth value is equal to that in the
+    * depth buffer
+    */
+    RS_DEPTH_FUNC_EQUAL         = 5,
+    /**
+    * Drawn if the incoming depth value is not equal to that in the
+    * depth buffer
+    */
+    RS_DEPTH_FUNC_NOTEQUAL      = 6,
+    /**
+    * Invalid depth function
+    */
+    RS_DEPTH_FUNC_INVALID       = 100,
+} rs_depth_func;
+
+typedef enum {
+    RS_BLEND_SRC_ZERO                   = 0,
+    RS_BLEND_SRC_ONE                    = 1,
+    RS_BLEND_SRC_DST_COLOR              = 2,
+    RS_BLEND_SRC_ONE_MINUS_DST_COLOR    = 3,
+    RS_BLEND_SRC_SRC_ALPHA              = 4,
+    RS_BLEND_SRC_ONE_MINUS_SRC_ALPHA    = 5,
+    RS_BLEND_SRC_DST_ALPHA              = 6,
+    RS_BLEND_SRC_ONE_MINUS_DST_ALPHA    = 7,
+    RS_BLEND_SRC_SRC_ALPHA_SATURATE     = 8,
+
+    RS_BLEND_SRC_INVALID                = 100,
+} rs_blend_src_func;
+
+typedef enum {
+    RS_BLEND_DST_ZERO                   = 0,
+    RS_BLEND_DST_ONE                    = 1,
+    RS_BLEND_DST_SRC_COLOR              = 2,
+    RS_BLEND_DST_ONE_MINUS_SRC_COLOR    = 3,
+    RS_BLEND_DST_SRC_ALPHA              = 4,
+    RS_BLEND_DST_ONE_MINUS_SRC_ALPHA    = 5,
+    RS_BLEND_DST_DST_ALPHA              = 6,
+    RS_BLEND_DST_ONE_MINUS_DST_ALPHA    = 7,
+
+    RS_BLEND_DST_INVALID                = 100,
+} rs_blend_dst_func;
+
+typedef enum {
+    RS_CULL_BACK     = 0,
+    RS_CULL_FRONT    = 1,
+    RS_CULL_NONE     = 2,
+
+    RS_CULL_INVALID  = 100,
+} rs_cull_mode;
+#endif //__LP64__
+
+typedef enum {
+    RS_SAMPLER_NEAREST              = 0,
+    RS_SAMPLER_LINEAR               = 1,
+    RS_SAMPLER_LINEAR_MIP_LINEAR    = 2,
+    RS_SAMPLER_WRAP                 = 3,
+    RS_SAMPLER_CLAMP                = 4,
+    RS_SAMPLER_LINEAR_MIP_NEAREST   = 5,
+    RS_SAMPLER_MIRRORED_REPEAT      = 6,
+
+    RS_SAMPLER_INVALID              = 100,
+} rs_sampler_value;
+
+#endif // (defined(RS_VERSION) && (RS_VERSION >= 16))
+
+#endif // __RS_TYPES_RSH__
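
For orientation, the vector, matrix and handle types declared in rs_types.rsh above are what scripts compiled with these prebuilts see. A minimal standalone sketch follows; it is not part of the prebuilt header, it only assumes a clang compiler for the ext_vector_type extension, and the names my_matrix4x4 and get are illustrative:

    #include <stdio.h>

    /* Local re-declarations for illustration only; the real definitions live in
     * the rs_types.rsh shown above. Requires clang for ext_vector_type. */
    typedef float float4 __attribute__((ext_vector_type(4)));

    typedef struct {
        float m[16];                    /* element (row, col) sits at m[row*4 + col] */
    } my_matrix4x4;

    static float get(const my_matrix4x4 *mat, int row, int col) {
        return mat->m[row * 4 + col];   /* indexing as documented in the header above */
    }

    int main(void) {
        my_matrix4x4 ident = { .m = { [0] = 1.f, [5] = 1.f, [10] = 1.f, [15] = 1.f } };
        float4 v = (float4){ 1.f, 2.f, 3.f, 4.f };  /* 128-bit vector with .x/.y/.z/.w access */
        printf("m[1][1] = %.1f, v.y = %.1f\n", get(&ident, 1, 1), v.y);
        return 0;
    }

Built with clang, this prints "m[1][1] = 1.0, v.y = 2.0".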
diff --git a/22.0.1/lib/libLLVM.so b/22.0.1/lib/libLLVM.so
new file mode 100755
index 0000000..bce33cc
--- /dev/null
+++ b/22.0.1/lib/libLLVM.so
Binary files differ
diff --git a/22.0.1/lib/libc++.so b/22.0.1/lib/libc++.so
new file mode 100755
index 0000000..a297db2
--- /dev/null
+++ b/22.0.1/lib/libc++.so
Binary files differ
diff --git a/22.0.1/lib/libclang.so b/22.0.1/lib/libclang.so
new file mode 100755
index 0000000..ce91323
--- /dev/null
+++ b/22.0.1/lib/libclang.so
Binary files differ
diff --git a/23.0.3/bin/llvm-rs-cc b/23.0.3/bin/llvm-rs-cc
new file mode 100755
index 0000000..16742f2
--- /dev/null
+++ b/23.0.3/bin/llvm-rs-cc
Binary files differ
diff --git a/23.0.3/clang-include/CMakeLists.txt b/23.0.3/clang-include/CMakeLists.txt
new file mode 100644
index 0000000..5429092
--- /dev/null
+++ b/23.0.3/clang-include/CMakeLists.txt
@@ -0,0 +1,93 @@
+set(files
+  adxintrin.h
+  altivec.h
+  ammintrin.h
+  arm_acle.h
+  avx2intrin.h
+  avx512bwintrin.h
+  avx512erintrin.h
+  avx512fintrin.h
+  avx512vlbwintrin.h
+  avx512vlintrin.h
+  avxintrin.h
+  bmi2intrin.h
+  bmiintrin.h
+  cpuid.h
+  emmintrin.h
+  f16cintrin.h
+  float.h
+  fma4intrin.h
+  fmaintrin.h
+  htmintrin.h
+  htmxlintrin.h
+  ia32intrin.h
+  immintrin.h
+  Intrin.h
+  iso646.h
+  limits.h
+  lzcntintrin.h
+  mm3dnow.h
+  mmintrin.h
+  mm_malloc.h
+  module.modulemap
+  nmmintrin.h
+  pmmintrin.h
+  popcntintrin.h
+  prfchwintrin.h
+  rdseedintrin.h
+  rtmintrin.h
+  s390intrin.h
+  shaintrin.h
+  smmintrin.h
+  stdalign.h
+  stdarg.h
+  stdatomic.h
+  stdbool.h
+  stddef.h
+  __stddef_max_align_t.h
+  stdint.h
+  stdnoreturn.h
+  tbmintrin.h
+  tgmath.h
+  tmmintrin.h
+  unwind.h
+  vadefs.h
+  varargs.h
+  __wmmintrin_aes.h
+  wmmintrin.h
+  __wmmintrin_pclmul.h
+  x86intrin.h
+  xmmintrin.h
+  xopintrin.h
+  )
+
+set(output_dir ${LLVM_LIBRARY_OUTPUT_INTDIR}/clang/${CLANG_VERSION}/include)
+
+# Generate arm_neon.h
+clang_tablegen(arm_neon.h -gen-arm-neon
+  SOURCE ${CLANG_SOURCE_DIR}/include/clang/Basic/arm_neon.td)
+
+set(out_files)
+foreach( f ${files} )
+  set( src ${CMAKE_CURRENT_SOURCE_DIR}/${f} )
+  set( dst ${output_dir}/${f} )
+  add_custom_command(OUTPUT ${dst}
+    DEPENDS ${src}
+    COMMAND ${CMAKE_COMMAND} -E copy_if_different ${src} ${dst}
+    COMMENT "Copying clang's ${f}...")
+  list(APPEND out_files ${dst})
+endforeach( f )
+
+add_custom_command(OUTPUT ${output_dir}/arm_neon.h 
+  DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/arm_neon.h
+  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_BINARY_DIR}/arm_neon.h ${output_dir}/arm_neon.h
+  COMMENT "Copying clang's arm_neon.h...")
+list(APPEND out_files ${output_dir}/arm_neon.h)
+
+add_custom_target(clang-headers ALL DEPENDS ${out_files})
+set_target_properties(clang-headers PROPERTIES FOLDER "Misc")
+
+install(
+  FILES ${files} ${CMAKE_CURRENT_BINARY_DIR}/arm_neon.h
+  PERMISSIONS OWNER_READ OWNER_WRITE GROUP_READ WORLD_READ
+  DESTINATION lib${LLVM_LIBDIR_SUFFIX}/clang/${CLANG_VERSION}/include)
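
The list above determines which builtin headers are installed next to llvm-rs-cc 23.0.3; it includes C11 support headers such as stdatomic.h and stdalign.h. Purely as orientation, here is a minimal C11 sketch of the kind of code stdatomic.h enables (it assumes a hosted toolchain and is not taken from this change):

    #include <stdatomic.h>
    #include <stdio.h>

    int main(void) {
        atomic_int counter = 0;                                   /* C11 atomic object */
        atomic_fetch_add_explicit(&counter, 5, memory_order_relaxed);
        printf("counter = %d\n", atomic_load(&counter));          /* prints: counter = 5 */
        return 0;
    }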
diff --git a/23.0.3/clang-include/Intrin.h b/23.0.3/clang-include/Intrin.h
new file mode 100644
index 0000000..727a55e
--- /dev/null
+++ b/23.0.3/clang-include/Intrin.h
@@ -0,0 +1,965 @@
+/* ===-------- Intrin.h ---------------------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/* Only include this if we're compiling for the windows platform. */
+#ifndef _MSC_VER
+#include_next <Intrin.h>
+#else
+
+#ifndef __INTRIN_H
+#define __INTRIN_H
+
+/* First include the standard intrinsics. */
+#if defined(__i386__) || defined(__x86_64__)
+#include <x86intrin.h>
+#endif
+
+/* For the definition of jmp_buf. */
+#if __STDC_HOSTED__
+#include <setjmp.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(__MMX__)
+/* And the random ones that aren't in those files. */
+__m64 _m_from_float(float);
+__m64 _m_from_int(int _l);
+void _m_prefetch(void *);
+float _m_to_float(__m64);
+int _m_to_int(__m64 _M);
+#endif
+
+/* Other assorted instruction intrinsics. */
+void __addfsbyte(unsigned long, unsigned char);
+void __addfsdword(unsigned long, unsigned long);
+void __addfsword(unsigned long, unsigned short);
+void __code_seg(const char *);
+static __inline__
+void __cpuid(int[4], int);
+static __inline__
+void __cpuidex(int[4], int, int);
+void __debugbreak(void);
+__int64 __emul(int, int);
+unsigned __int64 __emulu(unsigned int, unsigned int);
+void __cdecl __fastfail(unsigned int);
+unsigned int __getcallerseflags(void);
+static __inline__
+void __halt(void);
+unsigned char __inbyte(unsigned short);
+void __inbytestring(unsigned short, unsigned char *, unsigned long);
+void __incfsbyte(unsigned long);
+void __incfsdword(unsigned long);
+void __incfsword(unsigned long);
+unsigned long __indword(unsigned short);
+void __indwordstring(unsigned short, unsigned long *, unsigned long);
+void __int2c(void);
+void __invlpg(void *);
+unsigned short __inword(unsigned short);
+void __inwordstring(unsigned short, unsigned short *, unsigned long);
+void __lidt(void *);
+unsigned __int64 __ll_lshift(unsigned __int64, int);
+__int64 __ll_rshift(__int64, int);
+void __llwpcb(void *);
+unsigned char __lwpins32(unsigned int, unsigned int, unsigned int);
+void __lwpval32(unsigned int, unsigned int, unsigned int);
+unsigned int __lzcnt(unsigned int);
+unsigned short __lzcnt16(unsigned short);
+static __inline__
+void __movsb(unsigned char *, unsigned char const *, size_t);
+static __inline__
+void __movsd(unsigned long *, unsigned long const *, size_t);
+static __inline__
+void __movsw(unsigned short *, unsigned short const *, size_t);
+void __nop(void);
+void __nvreg_restore_fence(void);
+void __nvreg_save_fence(void);
+void __outbyte(unsigned short, unsigned char);
+void __outbytestring(unsigned short, unsigned char *, unsigned long);
+void __outdword(unsigned short, unsigned long);
+void __outdwordstring(unsigned short, unsigned long *, unsigned long);
+void __outword(unsigned short, unsigned short);
+void __outwordstring(unsigned short, unsigned short *, unsigned long);
+static __inline__
+unsigned int __popcnt(unsigned int);
+static __inline__
+unsigned short __popcnt16(unsigned short);
+unsigned long __readcr0(void);
+unsigned long __readcr2(void);
+static __inline__
+unsigned long __readcr3(void);
+unsigned long __readcr4(void);
+unsigned long __readcr8(void);
+unsigned int __readdr(unsigned int);
+#ifdef __i386__
+static __inline__
+unsigned char __readfsbyte(unsigned long);
+static __inline__
+unsigned long __readfsdword(unsigned long);
+static __inline__
+unsigned __int64 __readfsqword(unsigned long);
+static __inline__
+unsigned short __readfsword(unsigned long);
+#endif
+static __inline__
+unsigned __int64 __readmsr(unsigned long);
+unsigned __int64 __readpmc(unsigned long);
+unsigned long __segmentlimit(unsigned long);
+void __sidt(void *);
+void *__slwpcb(void);
+static __inline__
+void __stosb(unsigned char *, unsigned char, size_t);
+static __inline__
+void __stosd(unsigned long *, unsigned long, size_t);
+static __inline__
+void __stosw(unsigned short *, unsigned short, size_t);
+void __svm_clgi(void);
+void __svm_invlpga(void *, int);
+void __svm_skinit(int);
+void __svm_stgi(void);
+void __svm_vmload(size_t);
+void __svm_vmrun(size_t);
+void __svm_vmsave(size_t);
+void __ud2(void);
+unsigned __int64 __ull_rshift(unsigned __int64, int);
+void __vmx_off(void);
+void __vmx_vmptrst(unsigned __int64 *);
+void __wbinvd(void);
+void __writecr0(unsigned int);
+static __inline__
+void __writecr3(unsigned int);
+void __writecr4(unsigned int);
+void __writecr8(unsigned int);
+void __writedr(unsigned int, unsigned int);
+void __writefsbyte(unsigned long, unsigned char);
+void __writefsdword(unsigned long, unsigned long);
+void __writefsqword(unsigned long, unsigned __int64);
+void __writefsword(unsigned long, unsigned short);
+void __writemsr(unsigned long, unsigned __int64);
+static __inline__
+void *_AddressOfReturnAddress(void);
+static __inline__
+unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask);
+static __inline__
+unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask);
+static __inline__
+unsigned char _bittest(long const *, long);
+static __inline__
+unsigned char _bittestandcomplement(long *, long);
+static __inline__
+unsigned char _bittestandreset(long *, long);
+static __inline__
+unsigned char _bittestandset(long *, long);
+unsigned __int64 __cdecl _byteswap_uint64(unsigned __int64);
+unsigned long __cdecl _byteswap_ulong(unsigned long);
+unsigned short __cdecl _byteswap_ushort(unsigned short);
+void __cdecl _disable(void);
+void __cdecl _enable(void);
+void __cdecl _fxrstor(void const *);
+void __cdecl _fxsave(void *);
+long _InterlockedAddLargeStatistic(__int64 volatile *_Addend, long _Value);
+static __inline__
+long _InterlockedAnd(long volatile *_Value, long _Mask);
+static __inline__
+short _InterlockedAnd16(short volatile *_Value, short _Mask);
+static __inline__
+char _InterlockedAnd8(char volatile *_Value, char _Mask);
+unsigned char _interlockedbittestandreset(long volatile *, long);
+static __inline__
+unsigned char _interlockedbittestandset(long volatile *, long);
+static __inline__
+long __cdecl _InterlockedCompareExchange(long volatile *_Destination,
+                                         long _Exchange, long _Comparand);
+long _InterlockedCompareExchange_HLEAcquire(long volatile *, long, long);
+long _InterlockedCompareExchange_HLERelease(long volatile *, long, long);
+static __inline__
+short _InterlockedCompareExchange16(short volatile *_Destination,
+                                    short _Exchange, short _Comparand);
+static __inline__
+__int64 _InterlockedCompareExchange64(__int64 volatile *_Destination,
+                                      __int64 _Exchange, __int64 _Comparand);
+__int64 _InterlockedCompareExchange64_HLEAcquire(__int64 volatile *, __int64,
+                                                 __int64);
+__int64 _InterlockedCompareExchange64_HLERelease(__int64 volatile *, __int64,
+                                                 __int64);
+static __inline__
+char _InterlockedCompareExchange8(char volatile *_Destination, char _Exchange,
+                                  char _Comparand);
+void *_InterlockedCompareExchangePointer_HLEAcquire(void *volatile *, void *,
+                                                    void *);
+void *_InterlockedCompareExchangePointer_HLERelease(void *volatile *, void *,
+                                                    void *);
+static __inline__
+long __cdecl _InterlockedDecrement(long volatile *_Addend);
+static __inline__
+short _InterlockedDecrement16(short volatile *_Addend);
+long _InterlockedExchange(long volatile *_Target, long _Value);
+static __inline__
+short _InterlockedExchange16(short volatile *_Target, short _Value);
+static __inline__
+char _InterlockedExchange8(char volatile *_Target, char _Value);
+static __inline__
+long __cdecl _InterlockedExchangeAdd(long volatile *_Addend, long _Value);
+long _InterlockedExchangeAdd_HLEAcquire(long volatile *, long);
+long _InterlockedExchangeAdd_HLERelease(long volatile *, long);
+static __inline__
+short _InterlockedExchangeAdd16(short volatile *_Addend, short _Value);
+__int64 _InterlockedExchangeAdd64_HLEAcquire(__int64 volatile *, __int64);
+__int64 _InterlockedExchangeAdd64_HLERelease(__int64 volatile *, __int64);
+static __inline__
+char _InterlockedExchangeAdd8(char volatile *_Addend, char _Value);
+static __inline__
+long __cdecl _InterlockedIncrement(long volatile *_Addend);
+static __inline__
+short _InterlockedIncrement16(short volatile *_Addend);
+static __inline__
+long _InterlockedOr(long volatile *_Value, long _Mask);
+static __inline__
+short _InterlockedOr16(short volatile *_Value, short _Mask);
+static __inline__
+char _InterlockedOr8(char volatile *_Value, char _Mask);
+static __inline__
+long _InterlockedXor(long volatile *_Value, long _Mask);
+static __inline__
+short _InterlockedXor16(short volatile *_Value, short _Mask);
+static __inline__
+char _InterlockedXor8(char volatile *_Value, char _Mask);
+void __cdecl _invpcid(unsigned int, void *);
+static __inline__
+unsigned long __cdecl _lrotl(unsigned long, int);
+static __inline__
+unsigned long __cdecl _lrotr(unsigned long, int);
+static __inline__
+void _ReadBarrier(void);
+static __inline__
+void _ReadWriteBarrier(void);
+static __inline__
+void *_ReturnAddress(void);
+unsigned int _rorx_u32(unsigned int, const unsigned int);
+static __inline__
+unsigned int __cdecl _rotl(unsigned int _Value, int _Shift);
+static __inline__
+unsigned short _rotl16(unsigned short _Value, unsigned char _Shift);
+static __inline__
+unsigned __int64 __cdecl _rotl64(unsigned __int64 _Value, int _Shift);
+static __inline__
+unsigned char _rotl8(unsigned char _Value, unsigned char _Shift);
+static __inline__
+unsigned int __cdecl _rotr(unsigned int _Value, int _Shift);
+static __inline__
+unsigned short _rotr16(unsigned short _Value, unsigned char _Shift);
+static __inline__
+unsigned __int64 __cdecl _rotr64(unsigned __int64 _Value, int _Shift);
+static __inline__
+unsigned char _rotr8(unsigned char _Value, unsigned char _Shift);
+int _sarx_i32(int, unsigned int);
+#if __STDC_HOSTED__
+int __cdecl _setjmp(jmp_buf);
+#endif
+unsigned int _shlx_u32(unsigned int, unsigned int);
+unsigned int _shrx_u32(unsigned int, unsigned int);
+void _Store_HLERelease(long volatile *, long);
+void _Store64_HLERelease(__int64 volatile *, __int64);
+void _StorePointer_HLERelease(void *volatile *, void *);
+static __inline__
+void _WriteBarrier(void);
+unsigned __int32 _xbegin(void);
+void _xend(void);
+#define _XCR_XFEATURE_ENABLED_MASK 0
+static __inline__
+unsigned __int64 __cdecl _xgetbv(unsigned int);
+void __cdecl _xrstor(void const *, unsigned __int64);
+void __cdecl _xsave(void *, unsigned __int64);
+void __cdecl _xsaveopt(void *, unsigned __int64);
+void __cdecl _xsetbv(unsigned int, unsigned __int64);
+
+/* These additional intrinsics are turned on in x64/amd64/x86_64 mode. */
+#ifdef __x86_64__
+void __addgsbyte(unsigned long, unsigned char);
+void __addgsdword(unsigned long, unsigned long);
+void __addgsqword(unsigned long, unsigned __int64);
+void __addgsword(unsigned long, unsigned short);
+static __inline__
+void __faststorefence(void);
+void __incgsbyte(unsigned long);
+void __incgsdword(unsigned long);
+void __incgsqword(unsigned long);
+void __incgsword(unsigned long);
+unsigned char __lwpins64(unsigned __int64, unsigned int, unsigned int);
+void __lwpval64(unsigned __int64, unsigned int, unsigned int);
+unsigned __int64 __lzcnt64(unsigned __int64);
+static __inline__
+void __movsq(unsigned long long *, unsigned long long const *, size_t);
+__int64 __mulh(__int64, __int64);
+static __inline__
+unsigned __int64 __popcnt64(unsigned __int64);
+static __inline__
+unsigned char __readgsbyte(unsigned long);
+static __inline__
+unsigned long __readgsdword(unsigned long);
+static __inline__
+unsigned __int64 __readgsqword(unsigned long);
+unsigned short __readgsword(unsigned long);
+unsigned __int64 __shiftleft128(unsigned __int64 _LowPart,
+                                unsigned __int64 _HighPart,
+                                unsigned char _Shift);
+unsigned __int64 __shiftright128(unsigned __int64 _LowPart,
+                                 unsigned __int64 _HighPart,
+                                 unsigned char _Shift);
+static __inline__
+void __stosq(unsigned __int64 *, unsigned __int64, size_t);
+unsigned char __vmx_on(unsigned __int64 *);
+unsigned char __vmx_vmclear(unsigned __int64 *);
+unsigned char __vmx_vmlaunch(void);
+unsigned char __vmx_vmptrld(unsigned __int64 *);
+unsigned char __vmx_vmread(size_t, size_t *);
+unsigned char __vmx_vmresume(void);
+unsigned char __vmx_vmwrite(size_t, size_t);
+void __writegsbyte(unsigned long, unsigned char);
+void __writegsdword(unsigned long, unsigned long);
+void __writegsqword(unsigned long, unsigned __int64);
+void __writegsword(unsigned long, unsigned short);
+static __inline__
+unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask);
+static __inline__
+unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask);
+static __inline__
+unsigned char _bittest64(__int64 const *, __int64);
+static __inline__
+unsigned char _bittestandcomplement64(__int64 *, __int64);
+static __inline__
+unsigned char _bittestandreset64(__int64 *, __int64);
+static __inline__
+unsigned char _bittestandset64(__int64 *, __int64);
+unsigned __int64 __cdecl _byteswap_uint64(unsigned __int64);
+void __cdecl _fxrstor64(void const *);
+void __cdecl _fxsave64(void *);
+long _InterlockedAnd_np(long volatile *_Value, long _Mask);
+short _InterlockedAnd16_np(short volatile *_Value, short _Mask);
+__int64 _InterlockedAnd64_np(__int64 volatile *_Value, __int64 _Mask);
+char _InterlockedAnd8_np(char volatile *_Value, char _Mask);
+unsigned char _interlockedbittestandreset64(__int64 volatile *, __int64);
+static __inline__
+unsigned char _interlockedbittestandset64(__int64 volatile *, __int64);
+long _InterlockedCompareExchange_np(long volatile *_Destination, long _Exchange,
+                                    long _Comparand);
+unsigned char _InterlockedCompareExchange128(__int64 volatile *_Destination,
+                                             __int64 _ExchangeHigh,
+                                             __int64 _ExchangeLow,
+                                             __int64 *_ComparandResult);
+unsigned char _InterlockedCompareExchange128_np(__int64 volatile *_Destination,
+                                                __int64 _ExchangeHigh,
+                                                __int64 _ExchangeLow,
+                                                __int64 *_ComparandResult);
+short _InterlockedCompareExchange16_np(short volatile *_Destination,
+                                       short _Exchange, short _Comparand);
+__int64 _InterlockedCompareExchange64_HLEAcquire(__int64 volatile *, __int64,
+                                                 __int64);
+__int64 _InterlockedCompareExchange64_HLERelease(__int64 volatile *, __int64,
+                                                 __int64);
+__int64 _InterlockedCompareExchange64_np(__int64 volatile *_Destination,
+                                         __int64 _Exchange, __int64 _Comparand);
+void *_InterlockedCompareExchangePointer(void *volatile *_Destination,
+                                         void *_Exchange, void *_Comparand);
+void *_InterlockedCompareExchangePointer_np(void *volatile *_Destination,
+                                            void *_Exchange, void *_Comparand);
+static __inline__
+__int64 _InterlockedDecrement64(__int64 volatile *_Addend);
+static __inline__
+__int64 _InterlockedExchange64(__int64 volatile *_Target, __int64 _Value);
+static __inline__
+__int64 _InterlockedExchangeAdd64(__int64 volatile *_Addend, __int64 _Value);
+void *_InterlockedExchangePointer(void *volatile *_Target, void *_Value);
+static __inline__
+__int64 _InterlockedIncrement64(__int64 volatile *_Addend);
+long _InterlockedOr_np(long volatile *_Value, long _Mask);
+short _InterlockedOr16_np(short volatile *_Value, short _Mask);
+static __inline__
+__int64 _InterlockedOr64(__int64 volatile *_Value, __int64 _Mask);
+__int64 _InterlockedOr64_np(__int64 volatile *_Value, __int64 _Mask);
+char _InterlockedOr8_np(char volatile *_Value, char _Mask);
+long _InterlockedXor_np(long volatile *_Value, long _Mask);
+short _InterlockedXor16_np(short volatile *_Value, short _Mask);
+static __inline__
+__int64 _InterlockedXor64(__int64 volatile *_Value, __int64 _Mask);
+__int64 _InterlockedXor64_np(__int64 volatile *_Value, __int64 _Mask);
+char _InterlockedXor8_np(char volatile *_Value, char _Mask);
+static __inline__
+__int64 _mul128(__int64 _Multiplier, __int64 _Multiplicand,
+                __int64 *_HighProduct);
+unsigned __int64 _rorx_u64(unsigned __int64, const unsigned int);
+__int64 _sarx_i64(__int64, unsigned int);
+#if __STDC_HOSTED__
+int __cdecl _setjmpex(jmp_buf);
+#endif
+unsigned __int64 _shlx_u64(unsigned __int64, unsigned int);
+unsigned __int64 _shrx_u64(unsigned __int64, unsigned int);
+/*
+ * Multiply two unsigned 64-bit integers and obtain the full 128-bit result.
+ * The low half is returned directly and the high half is written to an out
+ * parameter.
+ */
+static __inline__ unsigned __int64 __attribute__((__always_inline__, __nodebug__))
+_umul128(unsigned __int64 _Multiplier, unsigned __int64 _Multiplicand,
+         unsigned __int64 *_HighProduct) {
+  unsigned __int128 _FullProduct =
+      (unsigned __int128)_Multiplier * (unsigned __int128)_Multiplicand;
+  *_HighProduct = _FullProduct >> 64;
+  return _FullProduct;
+}
+static __inline__ unsigned __int64 __attribute__((__always_inline__, __nodebug__))
+__umulh(unsigned __int64 _Multiplier, unsigned __int64 _Multiplicand) {
+  unsigned __int128 _FullProduct =
+      (unsigned __int128)_Multiplier * (unsigned __int128)_Multiplicand;
+  return _FullProduct >> 64;
+}
+void __cdecl _xrstor64(void const *, unsigned __int64);
+void __cdecl _xsave64(void *, unsigned __int64);
+void __cdecl _xsaveopt64(void *, unsigned __int64);
+
+#endif /* __x86_64__ */
+
+/*----------------------------------------------------------------------------*\
+|* Bit Twiddling
+\*----------------------------------------------------------------------------*/
+static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
+_rotl8(unsigned char _Value, unsigned char _Shift) {
+  _Shift &= 0x7;
+  return _Shift ? (_Value << _Shift) | (_Value >> (8 - _Shift)) : _Value;
+}
+static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
+_rotr8(unsigned char _Value, unsigned char _Shift) {
+  _Shift &= 0x7;
+  return _Shift ? (_Value >> _Shift) | (_Value << (8 - _Shift)) : _Value;
+}
+static __inline__ unsigned short __attribute__((__always_inline__, __nodebug__))
+_rotl16(unsigned short _Value, unsigned char _Shift) {
+  _Shift &= 0xf;
+  return _Shift ? (_Value << _Shift) | (_Value >> (16 - _Shift)) : _Value;
+}
+static __inline__ unsigned short __attribute__((__always_inline__, __nodebug__))
+_rotr16(unsigned short _Value, unsigned char _Shift) {
+  _Shift &= 0xf;
+  return _Shift ? (_Value >> _Shift) | (_Value << (16 - _Shift)) : _Value;
+}
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+_rotl(unsigned int _Value, int _Shift) {
+  _Shift &= 0x1f;
+  return _Shift ? (_Value << _Shift) | (_Value >> (32 - _Shift)) : _Value;
+}
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+_rotr(unsigned int _Value, int _Shift) {
+  _Shift &= 0x1f;
+  return _Shift ? (_Value >> _Shift) | (_Value << (32 - _Shift)) : _Value;
+}
+static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
+_lrotl(unsigned long _Value, int _Shift) {
+  _Shift &= 0x1f;
+  return _Shift ? (_Value << _Shift) | (_Value >> (32 - _Shift)) : _Value;
+}
+static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
+_lrotr(unsigned long _Value, int _Shift) {
+  _Shift &= 0x1f;
+  return _Shift ? (_Value >> _Shift) | (_Value << (32 - _Shift)) : _Value;
+}
+static
+__inline__ unsigned __int64 __attribute__((__always_inline__, __nodebug__))
+_rotl64(unsigned __int64 _Value, int _Shift) {
+  _Shift &= 0x3f;
+  return _Shift ? (_Value << _Shift) | (_Value >> (64 - _Shift)) : _Value;
+}
+static
+__inline__ unsigned __int64 __attribute__((__always_inline__, __nodebug__))
+_rotr64(unsigned __int64 _Value, int _Shift) {
+  _Shift &= 0x3f;
+  return _Shift ? (_Value >> _Shift) | (_Value << (64 - _Shift)) : _Value;
+}
+/*----------------------------------------------------------------------------*\
+|* Bit Counting and Testing
+\*----------------------------------------------------------------------------*/
+static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
+_BitScanForward(unsigned long *_Index, unsigned long _Mask) {
+  if (!_Mask)
+    return 0;
+  *_Index = __builtin_ctzl(_Mask);
+  return 1;
+}
+static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
+_BitScanReverse(unsigned long *_Index, unsigned long _Mask) {
+  if (!_Mask)
+    return 0;
+  *_Index = 31 - __builtin_clzl(_Mask);
+  return 1;
+}
+static __inline__ unsigned short __attribute__((__always_inline__, __nodebug__))
+__popcnt16(unsigned short value) {
+  return __builtin_popcount((int)value);
+}
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+__popcnt(unsigned int value) {
+  return __builtin_popcount(value);
+}
+static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
+_bittest(long const *a, long b) {
+  return (*a >> b) & 1;
+}
+static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
+_bittestandcomplement(long *a, long b) {
+  unsigned char x = (*a >> b) & 1;
+  *a = *a ^ (1 << b);
+  return x;
+}
+static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
+_bittestandreset(long *a, long b) {
+  unsigned char x = (*a >> b) & 1;
+  *a = *a & ~(1 << b);
+  return x;
+}
+static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
+_bittestandset(long *a, long b) {
+  unsigned char x = (*a >> b) & 1;
+  *a = *a | (1 << b);
+  return x;
+}
+#if defined(__i386__) || defined(__x86_64__)
+static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
+_interlockedbittestandset(long volatile *__BitBase, long __BitPos) {
+  unsigned char __Res;
+  __asm__ ("xor %0, %0\n"
+           "lock bts %2, %1\n"
+           "setc %0\n"
+           : "=r" (__Res), "+m"(*__BitBase)
+           : "Ir"(__BitPos));
+  return __Res;
+}
+#endif
+#ifdef __x86_64__
+static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
+_BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask) {
+  if (!_Mask)
+    return 0;
+  *_Index = __builtin_ctzll(_Mask);
+  return 1;
+}
+static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
+_BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask) {
+  if (!_Mask)
+    return 0;
+  *_Index = 63 - __builtin_clzll(_Mask);
+  return 1;
+}
+static __inline__ unsigned __int64 __attribute__((__always_inline__, __nodebug__))
+__popcnt64(unsigned __int64 value) {
+  return __builtin_popcountll(value);
+}
+static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
+_bittest64(__int64 const *a, __int64 b) {
+  return (*a >> b) & 1;
+}
+static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
+_bittestandcomplement64(__int64 *a, __int64 b) {
+  unsigned char x = (*a >> b) & 1;
+  *a = *a ^ (1ll << b);
+  return x;
+}
+static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
+_bittestandreset64(__int64 *a, __int64 b) {
+  unsigned char x = (*a >> b) & 1;
+  *a = *a & ~(1ll << b);
+  return x;
+}
+static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
+_bittestandset64(__int64 *a, __int64 b) {
+  unsigned char x = (*a >> b) & 1;
+  *a = *a | (1ll << b);
+  return x;
+}
+static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
+_interlockedbittestandset64(__int64 volatile *__BitBase, __int64 __BitPos) {
+  unsigned char __Res;
+  __asm__ ("xor %0, %0\n"
+           "lock bts %2, %1\n"
+           "setc %0\n"
+           : "=r" (__Res), "+m"(*__BitBase)
+           : "Ir"(__BitPos));
+  return __Res;
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Exchange Add
+\*----------------------------------------------------------------------------*/
+static __inline__ char __attribute__((__always_inline__, __nodebug__))
+_InterlockedExchangeAdd8(char volatile *_Addend, char _Value) {
+  return __atomic_add_fetch(_Addend, _Value, 0) - _Value;
+}
+static __inline__ short __attribute__((__always_inline__, __nodebug__))
+_InterlockedExchangeAdd16(short volatile *_Addend, short _Value) {
+  return __atomic_add_fetch(_Addend, _Value, 0) - _Value;
+}
+#ifdef __x86_64__
+static __inline__ __int64 __attribute__((__always_inline__, __nodebug__))
+_InterlockedExchangeAdd64(__int64 volatile *_Addend, __int64 _Value) {
+  return __atomic_add_fetch(_Addend, _Value, 0) - _Value;
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Exchange Sub
+\*----------------------------------------------------------------------------*/
+static __inline__ char __attribute__((__always_inline__, __nodebug__))
+_InterlockedExchangeSub8(char volatile *_Subend, char _Value) {
+  return __atomic_sub_fetch(_Subend, _Value, 0) + _Value;
+}
+static __inline__ short __attribute__((__always_inline__, __nodebug__))
+_InterlockedExchangeSub16(short volatile *_Subend, short _Value) {
+  return __atomic_sub_fetch(_Subend, _Value, 0) + _Value;
+}
+static __inline__ long __attribute__((__always_inline__, __nodebug__))
+_InterlockedExchangeSub(long volatile *_Subend, long _Value) {
+  return __atomic_sub_fetch(_Subend, _Value, 0) + _Value;
+}
+#ifdef __x86_64__
+static __inline__ __int64 __attribute__((__always_inline__, __nodebug__))
+_InterlockedExchangeSub64(__int64 volatile *_Subend, __int64 _Value) {
+  return __atomic_sub_fetch(_Subend, _Value, 0) + _Value;
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Increment
+\*----------------------------------------------------------------------------*/
+static __inline__ short __attribute__((__always_inline__, __nodebug__))
+_InterlockedIncrement16(short volatile *_Value) {
+  return __atomic_add_fetch(_Value, 1, 0);
+}
+#ifdef __x86_64__
+static __inline__ __int64 __attribute__((__always_inline__, __nodebug__))
+_InterlockedIncrement64(__int64 volatile *_Value) {
+  return __atomic_add_fetch(_Value, 1, 0);
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Decrement
+\*----------------------------------------------------------------------------*/
+static __inline__ short __attribute__((__always_inline__, __nodebug__))
+_InterlockedDecrement16(short volatile *_Value) {
+  return __atomic_sub_fetch(_Value, 1, 0);
+}
+#ifdef __x86_64__
+static __inline__ __int64 __attribute__((__always_inline__, __nodebug__))
+_InterlockedDecrement64(__int64 volatile *_Value) {
+  return __atomic_sub_fetch(_Value, 1, 0);
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked And
+\*----------------------------------------------------------------------------*/
+static __inline__ char __attribute__((__always_inline__, __nodebug__))
+_InterlockedAnd8(char volatile *_Value, char _Mask) {
+  return __atomic_and_fetch(_Value, _Mask, 0);
+}
+static __inline__ short __attribute__((__always_inline__, __nodebug__))
+_InterlockedAnd16(short volatile *_Value, short _Mask) {
+  return __atomic_and_fetch(_Value, _Mask, 0);
+}
+static __inline__ long __attribute__((__always_inline__, __nodebug__))
+_InterlockedAnd(long volatile *_Value, long _Mask) {
+  return __atomic_and_fetch(_Value, _Mask, 0);
+}
+#ifdef __x86_64__
+static __inline__ __int64 __attribute__((__always_inline__, __nodebug__))
+_InterlockedAnd64(__int64 volatile *_Value, __int64 _Mask) {
+  return __atomic_and_fetch(_Value, _Mask, 0);
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Or
+\*----------------------------------------------------------------------------*/
+static __inline__ char __attribute__((__always_inline__, __nodebug__))
+_InterlockedOr8(char volatile *_Value, char _Mask) {
+  return __atomic_or_fetch(_Value, _Mask, 0);
+}
+static __inline__ short __attribute__((__always_inline__, __nodebug__))
+_InterlockedOr16(short volatile *_Value, short _Mask) {
+  return __atomic_or_fetch(_Value, _Mask, 0);
+}
+static __inline__ long __attribute__((__always_inline__, __nodebug__))
+_InterlockedOr(long volatile *_Value, long _Mask) {
+  return __atomic_or_fetch(_Value, _Mask, 0);
+}
+#ifdef __x86_64__
+static __inline__ __int64 __attribute__((__always_inline__, __nodebug__))
+_InterlockedOr64(__int64 volatile *_Value, __int64 _Mask) {
+  return __atomic_or_fetch(_Value, _Mask, 0);
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Xor
+\*----------------------------------------------------------------------------*/
+static __inline__ char __attribute__((__always_inline__, __nodebug__))
+_InterlockedXor8(char volatile *_Value, char _Mask) {
+  return __atomic_xor_fetch(_Value, _Mask, 0);
+}
+static __inline__ short __attribute__((__always_inline__, __nodebug__))
+_InterlockedXor16(short volatile *_Value, short _Mask) {
+  return __atomic_xor_fetch(_Value, _Mask, 0);
+}
+static __inline__ long __attribute__((__always_inline__, __nodebug__))
+_InterlockedXor(long volatile *_Value, long _Mask) {
+  return __atomic_xor_fetch(_Value, _Mask, 0);
+}
+#ifdef __x86_64__
+static __inline__ __int64 __attribute__((__always_inline__, __nodebug__))
+_InterlockedXor64(__int64 volatile *_Value, __int64 _Mask) {
+  return __atomic_xor_fetch(_Value, _Mask, 0);
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Exchange
+\*----------------------------------------------------------------------------*/
+static __inline__ char __attribute__((__always_inline__, __nodebug__))
+_InterlockedExchange8(char volatile *_Target, char _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, 0);
+  return _Value;
+}
+static __inline__ short __attribute__((__always_inline__, __nodebug__))
+_InterlockedExchange16(short volatile *_Target, short _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, 0);
+  return _Value;
+}
+#ifdef __x86_64__
+static __inline__ __int64 __attribute__((__always_inline__, __nodebug__))
+_InterlockedExchange64(__int64 volatile *_Target, __int64 _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, 0);
+  return _Value;
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Compare Exchange
+\*----------------------------------------------------------------------------*/
+static __inline__ char __attribute__((__always_inline__, __nodebug__))
+_InterlockedCompareExchange8(char volatile *_Destination,
+                             char _Exchange, char _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0, 0, 0);
+  return _Comparand;
+}
+static __inline__ short __attribute__((__always_inline__, __nodebug__))
+_InterlockedCompareExchange16(short volatile *_Destination,
+                              short _Exchange, short _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0, 0, 0);
+  return _Comparand;
+}
+static __inline__ __int64 __attribute__((__always_inline__, __nodebug__))
+_InterlockedCompareExchange64(__int64 volatile *_Destination,
+                              __int64 _Exchange, __int64 _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0, 0, 0);
+  return _Comparand;
+}
+/*----------------------------------------------------------------------------*\
+|* Barriers
+\*----------------------------------------------------------------------------*/
+#if defined(__i386__) || defined(__x86_64__)
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead")))
+_ReadWriteBarrier(void) {
+  __asm__ volatile ("" : : : "memory");
+}
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead")))
+_ReadBarrier(void) {
+  __asm__ volatile ("" : : : "memory");
+}
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead")))
+_WriteBarrier(void) {
+  __asm__ volatile ("" : : : "memory");
+}
+#endif
+#ifdef __x86_64__
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+__faststorefence(void) {
+  __asm__ volatile("lock orq $0, (%%rsp)" : : : "memory");
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* readfs, readgs
+|* (Pointers in address space #256 and #257 are relative to the GS and FS
+|* segment registers, respectively.)
+\*----------------------------------------------------------------------------*/
+#define __ptr_to_addr_space(__addr_space_nbr, __type, __offset)              \
+    ((volatile __type __attribute__((__address_space__(__addr_space_nbr)))*) \
+    (__offset))
+
+#ifdef __i386__
+static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
+__readfsbyte(unsigned long __offset) {
+  return *__ptr_to_addr_space(257, unsigned char, __offset);
+}
+static __inline__ unsigned __int64 __attribute__((__always_inline__, __nodebug__))
+__readfsqword(unsigned long __offset) {
+  return *__ptr_to_addr_space(257, unsigned __int64, __offset);
+}
+static __inline__ unsigned short __attribute__((__always_inline__, __nodebug__))
+__readfsword(unsigned long __offset) {
+  return *__ptr_to_addr_space(257, unsigned short, __offset);
+}
+#endif
+#ifdef __x86_64__
+static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
+__readgsbyte(unsigned long __offset) {
+  return *__ptr_to_addr_space(256, unsigned char, __offset);
+}
+static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
+__readgsdword(unsigned long __offset) {
+  return *__ptr_to_addr_space(256, unsigned long, __offset);
+}
+static __inline__ unsigned __int64 __attribute__((__always_inline__, __nodebug__))
+__readgsqword(unsigned long __offset) {
+  return *__ptr_to_addr_space(256, unsigned __int64, __offset);
+}
+static __inline__ unsigned short __attribute__((__always_inline__, __nodebug__))
+__readgsword(unsigned long __offset) {
+  return *__ptr_to_addr_space(256, unsigned short, __offset);
+}
+#endif
+#undef __ptr_to_addr_space
+/*----------------------------------------------------------------------------*\
+|* movs, stos
+\*----------------------------------------------------------------------------*/
+#if defined(__i386__) || defined(__x86_64__)
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+__movsb(unsigned char *__dst, unsigned char const *__src, size_t __n) {
+  __asm__("rep movsb" : : "D"(__dst), "S"(__src), "c"(__n)
+                        : "%edi", "%esi", "%ecx");
+}
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+__movsd(unsigned long *__dst, unsigned long const *__src, size_t __n) {
+  __asm__("rep movsl" : : "D"(__dst), "S"(__src), "c"(__n)
+                        : "%edi", "%esi", "%ecx");
+}
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+__movsw(unsigned short *__dst, unsigned short const *__src, size_t __n) {
+  __asm__("rep movsh" : : "D"(__dst), "S"(__src), "c"(__n)
+                        : "%edi", "%esi", "%ecx");
+}
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+__stosb(unsigned char *__dst, unsigned char __x, size_t __n) {
+  __asm__("rep stosb" : : "D"(__dst), "a"(__x), "c"(__n)
+                        : "%edi", "%ecx");
+}
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+__stosd(unsigned long *__dst, unsigned long __x, size_t __n) {
+  __asm__("rep stosl" : : "D"(__dst), "a"(__x), "c"(__n)
+                        : "%edi", "%ecx");
+}
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+__stosw(unsigned short *__dst, unsigned short __x, size_t __n) {
+  __asm__("rep stosh" : : "D"(__dst), "a"(__x), "c"(__n)
+                        : "%edi", "%ecx");
+}
+#endif
+#ifdef __x86_64__
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+__movsq(unsigned long long *__dst, unsigned long long const *__src, size_t __n) {
+  __asm__("rep movsq" : : "D"(__dst), "S"(__src), "c"(__n)
+                        : "%edi", "%esi", "%ecx");
+}
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+__stosq(unsigned __int64 *__dst, unsigned __int64 __x, size_t __n) {
+  __asm__("rep stosq" : : "D"(__dst), "a"(__x), "c"(__n)
+                        : "%edi", "%ecx");
+}
+#endif
+
+/*----------------------------------------------------------------------------*\
+|* Misc
+\*----------------------------------------------------------------------------*/
+static __inline__ void * __attribute__((__always_inline__, __nodebug__))
+_AddressOfReturnAddress(void) {
+  return (void*)((char*)__builtin_frame_address(0) + sizeof(void*));
+}
+static __inline__ void * __attribute__((__always_inline__, __nodebug__))
+_ReturnAddress(void) {
+  return __builtin_return_address(0);
+}
+#if defined(__i386__) || defined(__x86_64__)
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+__cpuid(int __info[4], int __level) {
+  __asm__ ("cpuid" : "=a"(__info[0]), "=b" (__info[1]), "=c"(__info[2]), "=d"(__info[3])
+                   : "a"(__level));
+}
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+__cpuidex(int __info[4], int __level, int __ecx) {
+  __asm__ ("cpuid" : "=a"(__info[0]), "=b" (__info[1]), "=c"(__info[2]), "=d"(__info[3])
+                   : "a"(__level), "c"(__ecx));
+}
+static __inline__ unsigned __int64 __cdecl __attribute__((__always_inline__, __nodebug__))
+_xgetbv(unsigned int __xcr_no) {
+  unsigned int __eax, __edx;
+  __asm__ ("xgetbv" : "=a" (__eax), "=d" (__edx) : "c" (__xcr_no));
+  return ((unsigned __int64)__edx << 32) | __eax;
+}
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+__halt(void) {
+  __asm__ volatile ("hlt");
+}
+#endif
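+
+/*
+ * A minimal usage sketch for __cpuid: leaf 0 returns the 12-byte vendor
+ * string in EBX, EDX, ECX (info[1], info[3], info[2]).  The helper name
+ * below is hypothetical.
+ */
+#if 0 /* example only */
+static void __example_read_vendor(char __vendor[13]) {
+  int __info[4];
+  __cpuid(__info, 0);
+  __builtin_memcpy(__vendor + 0, &__info[1], 4);  /* EBX */
+  __builtin_memcpy(__vendor + 4, &__info[3], 4);  /* EDX */
+  __builtin_memcpy(__vendor + 8, &__info[2], 4);  /* ECX */
+  __vendor[12] = '\0';
+}
+#endif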
+
+/*----------------------------------------------------------------------------*\
+|* Privileged intrinsics
+\*----------------------------------------------------------------------------*/
+#if defined(__i386__) || defined(__x86_64__)
+static __inline__ unsigned __int64 __attribute__((__always_inline__, __nodebug__))
+__readmsr(unsigned long __register) {
+  // Loads the contents of a 64-bit model specific register (MSR) specified in
+  // the ECX register into registers EDX:EAX. The EDX register is loaded with
+  // the high-order 32 bits of the MSR and the EAX register is loaded with the
+  // low-order 32 bits. If less than 64 bits are implemented in the MSR being
+  // read, the values returned to EDX:EAX in unimplemented bit locations are
+  // undefined.
+  unsigned long __edx;
+  unsigned long __eax;
+  __asm__ ("rdmsr" : "=d"(__edx), "=a"(__eax) : "c"(__register));
+  return (((unsigned __int64)__edx) << 32) | (unsigned __int64)__eax;
+}
+
+static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
+__readcr3(void) {
+  unsigned long __cr3_val;
+  __asm__ __volatile__ ("mov %%cr3, %0" : "=q"(__cr3_val) : : "memory");
+  return __cr3_val;
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+__writecr3(unsigned int __cr3_val) {
+  __asm__ ("mov %0, %%cr3" : : "q"(__cr3_val) : "memory");
+}
+#endif
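+
+/*
+ * A minimal usage sketch: __readmsr executes RDMSR, which is only legal at
+ * ring 0, so callers are typically kernel or driver code; e.g. reading
+ * IA32_EFER (MSR 0xC0000080).  The constant and helper names below are
+ * hypothetical.
+ */
+#if 0 /* example only */
+#define __EXAMPLE_IA32_EFER 0xC0000080UL
+static unsigned __int64 __example_read_efer(void) {
+  return __readmsr(__EXAMPLE_IA32_EFER);  /* EDX:EAX combined into 64 bits */
+}
+#endif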
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __INTRIN_H */
+#endif /* _MSC_VER */
diff --git a/23.0.3/clang-include/LICENSE.TXT b/23.0.3/clang-include/LICENSE.TXT
new file mode 100644
index 0000000..fc4afae
--- /dev/null
+++ b/23.0.3/clang-include/LICENSE.TXT
@@ -0,0 +1,63 @@
+==============================================================================
+LLVM Release License
+==============================================================================
+University of Illinois/NCSA
+Open Source License
+
+Copyright (c) 2007-2015 University of Illinois at Urbana-Champaign.
+All rights reserved.
+
+Developed by:
+
+    LLVM Team
+
+    University of Illinois at Urbana-Champaign
+
+    http://llvm.org
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal with
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimers.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+      this list of conditions and the following disclaimers in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the names of the LLVM Team, University of Illinois at
+      Urbana-Champaign, nor the names of its contributors may be used to
+      endorse or promote products derived from this Software without specific
+      prior written permission.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+SOFTWARE.
+
+==============================================================================
+The LLVM software contains code written by third parties.  Such software will
+have its own individual LICENSE.TXT file in the directory in which it appears.
+This file will describe the copyrights, license, and restrictions which apply
+to that code.
+
+The disclaimer of warranty in the University of Illinois Open Source License
+applies to all code in the LLVM Distribution, and nothing in any of the
+other licenses gives permission to use the names of the LLVM Team or the
+University of Illinois to endorse or promote products derived from this
+Software.
+
+The following pieces of software have additional or alternate copyrights,
+licenses, and/or restrictions:
+
+Program             Directory
+-------             ---------
+<none yet>
+
diff --git a/23.0.3/clang-include/adxintrin.h b/23.0.3/clang-include/adxintrin.h
new file mode 100644
index 0000000..9db8bcb
--- /dev/null
+++ b/23.0.3/clang-include/adxintrin.h
@@ -0,0 +1,83 @@
+/*===---- adxintrin.h - ADX intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <adxintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __ADXINTRIN_H
+#define __ADXINTRIN_H
+
+/* Intrinsics that are available only when __ADX__ is defined */
+#ifdef __ADX__
+static __inline unsigned char __attribute__((__always_inline__, __nodebug__))
+_addcarryx_u32(unsigned char __cf, unsigned int __x, unsigned int __y,
+               unsigned int *__p)
+{
+  return __builtin_ia32_addcarryx_u32(__cf, __x, __y, __p);
+}
+
+#ifdef __x86_64__
+static __inline unsigned char __attribute__((__always_inline__, __nodebug__))
+_addcarryx_u64(unsigned char __cf, unsigned long long __x,
+               unsigned long long __y, unsigned long long  *__p)
+{
+  return __builtin_ia32_addcarryx_u64(__cf, __x, __y, __p);
+}
+#endif
+#endif
+
+/* Intrinsics that are also available when __ADX__ is not defined */
+static __inline unsigned char __attribute__((__always_inline__, __nodebug__))
+_addcarry_u32(unsigned char __cf, unsigned int __x, unsigned int __y,
+              unsigned int *__p)
+{
+  return __builtin_ia32_addcarry_u32(__cf, __x, __y, __p);
+}
+
+#ifdef __x86_64__
+static __inline unsigned char __attribute__((__always_inline__, __nodebug__))
+_addcarry_u64(unsigned char __cf, unsigned long long __x,
+              unsigned long long __y, unsigned long long  *__p)
+{
+  return __builtin_ia32_addcarry_u64(__cf, __x, __y, __p);
+}
+#endif
+
+static __inline unsigned char __attribute__((__always_inline__, __nodebug__))
+_subborrow_u32(unsigned char __cf, unsigned int __x, unsigned int __y,
+              unsigned int *__p)
+{
+  return __builtin_ia32_subborrow_u32(__cf, __x, __y, __p);
+}
+
+#ifdef __x86_64__
+static __inline unsigned char __attribute__((__always_inline__, __nodebug__))
+_subborrow_u64(unsigned char __cf, unsigned long long __x,
+               unsigned long long __y, unsigned long long  *__p)
+{
+  return __builtin_ia32_subborrow_u64(__cf, __x, __y, __p);
+}
+#endif
+
+#endif /* __ADXINTRIN_H */
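+
+/*
+ * A minimal usage sketch: _addcarry_u32 takes a carry-in and returns the
+ * carry-out, so wider additions can be built by chaining it across 32-bit
+ * limbs.  The helper below (hypothetical name) adds two 64-bit values held
+ * as low/high limb pairs.
+ */
+#if 0 /* example only */
+static unsigned char __example_add64(unsigned int __alo, unsigned int __ahi,
+                                     unsigned int __blo, unsigned int __bhi,
+                                     unsigned int *__lo, unsigned int *__hi) {
+  unsigned char __cf = _addcarry_u32(0, __alo, __blo, __lo);  /* low limbs  */
+  return _addcarry_u32(__cf, __ahi, __bhi, __hi);             /* + carry-in */
+}
+#endif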
diff --git a/23.0.3/clang-include/altivec.h b/23.0.3/clang-include/altivec.h
new file mode 100644
index 0000000..252bf36
--- /dev/null
+++ b/23.0.3/clang-include/altivec.h
@@ -0,0 +1,13566 @@
+/*===---- altivec.h - Standard header for AltiVec vector intrinsics -------===*\
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+\*===----------------------------------------------------------------------===*/
+
+#ifndef __ALTIVEC_H
+#define __ALTIVEC_H
+
+#ifndef __ALTIVEC__
+#error "AltiVec support not enabled"
+#endif
+
+/* constants for mapping CR6 bits to predicate result. */
+
+#define __CR6_EQ     0
+#define __CR6_EQ_REV 1
+#define __CR6_LT     2
+#define __CR6_LT_REV 3
+
+#define __ATTRS_o_ai __attribute__((__overloadable__, __always_inline__))
+
+static vector signed char __ATTRS_o_ai
+vec_perm(vector signed char __a, vector signed char __b, vector unsigned char __c);
+
+static vector unsigned char __ATTRS_o_ai
+vec_perm(vector unsigned char __a,
+         vector unsigned char __b,
+         vector unsigned char __c);
+
+static vector bool char __ATTRS_o_ai
+vec_perm(vector bool char __a, vector bool char __b, vector unsigned char __c);
+
+static vector short __ATTRS_o_ai
+vec_perm(vector short __a, vector short __b, vector unsigned char __c);
+
+static vector unsigned short __ATTRS_o_ai
+vec_perm(vector unsigned short __a,
+         vector unsigned short __b,
+         vector unsigned char __c);
+
+static vector bool short __ATTRS_o_ai
+vec_perm(vector bool short __a, vector bool short __b, vector unsigned char __c);
+
+static vector pixel __ATTRS_o_ai
+vec_perm(vector pixel __a, vector pixel __b, vector unsigned char __c);
+
+static vector int __ATTRS_o_ai
+vec_perm(vector int __a, vector int __b, vector unsigned char __c);
+
+static vector unsigned int __ATTRS_o_ai
+vec_perm(vector unsigned int __a, vector unsigned int __b, vector unsigned char __c);
+
+static vector bool int __ATTRS_o_ai
+vec_perm(vector bool int __a, vector bool int __b, vector unsigned char __c);
+
+static vector float __ATTRS_o_ai
+vec_perm(vector float __a, vector float __b, vector unsigned char __c);
+
+static vector unsigned char __ATTRS_o_ai
+vec_xor(vector unsigned char __a, vector unsigned char __b);
+
+/* vec_abs */
+
+#define __builtin_altivec_abs_v16qi vec_abs
+#define __builtin_altivec_abs_v8hi  vec_abs
+#define __builtin_altivec_abs_v4si  vec_abs
+
+static vector signed char __ATTRS_o_ai
+vec_abs(vector signed char __a)
+{
+  return __builtin_altivec_vmaxsb(__a, -__a);
+}
+
+static vector signed short __ATTRS_o_ai
+vec_abs(vector signed short __a)
+{
+  return __builtin_altivec_vmaxsh(__a, -__a);
+}
+
+static vector signed int __ATTRS_o_ai
+vec_abs(vector signed int __a)
+{
+  return __builtin_altivec_vmaxsw(__a, -__a);
+}
+
+static vector float __ATTRS_o_ai
+vec_abs(vector float __a)
+{
+  vector unsigned int __res = (vector unsigned int)__a
+                            & (vector unsigned int)(0x7FFFFFFF);
+  return (vector float)__res;
+}
+
+/* vec_abss */
+
+#define __builtin_altivec_abss_v16qi vec_abss
+#define __builtin_altivec_abss_v8hi  vec_abss
+#define __builtin_altivec_abss_v4si  vec_abss
+
+static vector signed char __ATTRS_o_ai
+vec_abss(vector signed char __a)
+{
+  return __builtin_altivec_vmaxsb
+           (__a, __builtin_altivec_vsubsbs((vector signed char)(0), __a));
+}
+
+static vector signed short __ATTRS_o_ai
+vec_abss(vector signed short __a)
+{
+  return __builtin_altivec_vmaxsh
+           (__a, __builtin_altivec_vsubshs((vector signed short)(0), __a));
+}
+
+static vector signed int __ATTRS_o_ai
+vec_abss(vector signed int __a)
+{
+  return __builtin_altivec_vmaxsw
+           (__a, __builtin_altivec_vsubsws((vector signed int)(0), __a));
+}
+
+/* vec_add */
+
+static vector signed char __ATTRS_o_ai
+vec_add(vector signed char __a, vector signed char __b)
+{
+  return __a + __b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_add(vector bool char __a, vector signed char __b)
+{
+  return (vector signed char)__a + __b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_add(vector signed char __a, vector bool char __b)
+{
+  return __a + (vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_add(vector unsigned char __a, vector unsigned char __b)
+{
+  return __a + __b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_add(vector bool char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)__a + __b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_add(vector unsigned char __a, vector bool char __b)
+{
+  return __a + (vector unsigned char)__b;
+}
+
+static vector short __ATTRS_o_ai
+vec_add(vector short __a, vector short __b)
+{
+  return __a + __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_add(vector bool short __a, vector short __b)
+{
+  return (vector short)__a + __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_add(vector short __a, vector bool short __b)
+{
+  return __a + (vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_add(vector unsigned short __a, vector unsigned short __b)
+{
+  return __a + __b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_add(vector bool short __a, vector unsigned short __b)
+{
+  return (vector unsigned short)__a + __b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_add(vector unsigned short __a, vector bool short __b)
+{
+  return __a + (vector unsigned short)__b;
+}
+
+static vector int __ATTRS_o_ai
+vec_add(vector int __a, vector int __b)
+{
+  return __a + __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_add(vector bool int __a, vector int __b)
+{
+  return (vector int)__a + __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_add(vector int __a, vector bool int __b)
+{
+  return __a + (vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_add(vector unsigned int __a, vector unsigned int __b)
+{
+  return __a + __b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_add(vector bool int __a, vector unsigned int __b)
+{
+  return (vector unsigned int)__a + __b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_add(vector unsigned int __a, vector bool int __b)
+{
+  return __a + (vector unsigned int)__b;
+}
+
+static vector float __ATTRS_o_ai
+vec_add(vector float __a, vector float __b)
+{
+  return __a + __b;
+}
+
+/* vec_vaddubm */
+
+#define __builtin_altivec_vaddubm vec_vaddubm
+
+static vector signed char __ATTRS_o_ai
+vec_vaddubm(vector signed char __a, vector signed char __b)
+{
+  return __a + __b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vaddubm(vector bool char __a, vector signed char __b)
+{
+  return (vector signed char)__a + __b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vaddubm(vector signed char __a, vector bool char __b)
+{
+  return __a + (vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vaddubm(vector unsigned char __a, vector unsigned char __b)
+{
+  return __a + __b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vaddubm(vector bool char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)__a + __b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vaddubm(vector unsigned char __a, vector bool char __b)
+{
+  return __a + (vector unsigned char)__b;
+}
+
+/* vec_vadduhm */
+
+#define __builtin_altivec_vadduhm vec_vadduhm
+
+static vector short __ATTRS_o_ai
+vec_vadduhm(vector short __a, vector short __b)
+{
+  return __a + __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_vadduhm(vector bool short __a, vector short __b)
+{
+  return (vector short)__a + __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_vadduhm(vector short __a, vector bool short __b)
+{
+  return __a + (vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vadduhm(vector unsigned short __a, vector unsigned short __b)
+{
+  return __a + __b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vadduhm(vector bool short __a, vector unsigned short __b)
+{
+  return (vector unsigned short)__a + __b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vadduhm(vector unsigned short __a, vector bool short __b)
+{
+  return __a + (vector unsigned short)__b;
+}
+
+/* vec_vadduwm */
+
+#define __builtin_altivec_vadduwm vec_vadduwm
+
+static vector int __ATTRS_o_ai
+vec_vadduwm(vector int __a, vector int __b)
+{
+  return __a + __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_vadduwm(vector bool int __a, vector int __b)
+{
+  return (vector int)__a + __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_vadduwm(vector int __a, vector bool int __b)
+{
+  return __a + (vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vadduwm(vector unsigned int __a, vector unsigned int __b)
+{
+  return __a + __b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vadduwm(vector bool int __a, vector unsigned int __b)
+{
+  return (vector unsigned int)__a + __b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vadduwm(vector unsigned int __a, vector bool int __b)
+{
+  return __a + (vector unsigned int)__b;
+}
+
+/* vec_vaddfp */
+
+#define __builtin_altivec_vaddfp  vec_vaddfp
+
+static vector float __attribute__((__always_inline__))
+vec_vaddfp(vector float __a, vector float __b)
+{
+  return __a + __b;
+}
+
+/* vec_addc */
+
+static vector unsigned int __attribute__((__always_inline__))
+vec_addc(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vaddcuw(__a, __b);
+}
+
+/* vec_vaddcuw */
+
+static vector unsigned int __attribute__((__always_inline__))
+vec_vaddcuw(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vaddcuw(__a, __b);
+}
+
+/* vec_adds */
+
+static vector signed char __ATTRS_o_ai
+vec_adds(vector signed char __a, vector signed char __b)
+{
+  return __builtin_altivec_vaddsbs(__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_adds(vector bool char __a, vector signed char __b)
+{
+  return __builtin_altivec_vaddsbs((vector signed char)__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_adds(vector signed char __a, vector bool char __b)
+{
+  return __builtin_altivec_vaddsbs(__a, (vector signed char)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_adds(vector unsigned char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vaddubs(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_adds(vector bool char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vaddubs((vector unsigned char)__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_adds(vector unsigned char __a, vector bool char __b)
+{
+  return __builtin_altivec_vaddubs(__a, (vector unsigned char)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_adds(vector short __a, vector short __b)
+{
+  return __builtin_altivec_vaddshs(__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_adds(vector bool short __a, vector short __b)
+{
+  return __builtin_altivec_vaddshs((vector short)__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_adds(vector short __a, vector bool short __b)
+{
+  return __builtin_altivec_vaddshs(__a, (vector short)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_adds(vector unsigned short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vadduhs(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_adds(vector bool short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vadduhs((vector unsigned short)__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_adds(vector unsigned short __a, vector bool short __b)
+{
+  return __builtin_altivec_vadduhs(__a, (vector unsigned short)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_adds(vector int __a, vector int __b)
+{
+  return __builtin_altivec_vaddsws(__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_adds(vector bool int __a, vector int __b)
+{
+  return __builtin_altivec_vaddsws((vector int)__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_adds(vector int __a, vector bool int __b)
+{
+  return __builtin_altivec_vaddsws(__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_adds(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vadduws(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_adds(vector bool int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vadduws((vector unsigned int)__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_adds(vector unsigned int __a, vector bool int __b)
+{
+  return __builtin_altivec_vadduws(__a, (vector unsigned int)__b);
+}
+
+/* vec_vaddsbs */
+
+static vector signed char __ATTRS_o_ai
+vec_vaddsbs(vector signed char __a, vector signed char __b)
+{
+  return __builtin_altivec_vaddsbs(__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vaddsbs(vector bool char __a, vector signed char __b)
+{
+  return __builtin_altivec_vaddsbs((vector signed char)__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vaddsbs(vector signed char __a, vector bool char __b)
+{
+  return __builtin_altivec_vaddsbs(__a, (vector signed char)__b);
+}
+
+/* vec_vaddubs */
+
+static vector unsigned char __ATTRS_o_ai
+vec_vaddubs(vector unsigned char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vaddubs(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vaddubs(vector bool char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vaddubs((vector unsigned char)__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vaddubs(vector unsigned char __a, vector bool char __b)
+{
+  return __builtin_altivec_vaddubs(__a, (vector unsigned char)__b);
+}
+
+/* vec_vaddshs */
+
+static vector short __ATTRS_o_ai
+vec_vaddshs(vector short __a, vector short __b)
+{
+  return __builtin_altivec_vaddshs(__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_vaddshs(vector bool short __a, vector short __b)
+{
+  return __builtin_altivec_vaddshs((vector short)__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_vaddshs(vector short __a, vector bool short __b)
+{
+  return __builtin_altivec_vaddshs(__a, (vector short)__b);
+}
+
+/* vec_vadduhs */
+
+static vector unsigned short __ATTRS_o_ai
+vec_vadduhs(vector unsigned short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vadduhs(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vadduhs(vector bool short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vadduhs((vector unsigned short)__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vadduhs(vector unsigned short __a, vector bool short __b)
+{
+  return __builtin_altivec_vadduhs(__a, (vector unsigned short)__b);
+}
+
+/* vec_vaddsws */
+
+static vector int __ATTRS_o_ai
+vec_vaddsws(vector int __a, vector int __b)
+{
+  return __builtin_altivec_vaddsws(__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_vaddsws(vector bool int __a, vector int __b)
+{
+  return __builtin_altivec_vaddsws((vector int)__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_vaddsws(vector int __a, vector bool int __b)
+{
+  return __builtin_altivec_vaddsws(__a, (vector int)__b);
+}
+
+/* vec_vadduws */
+
+static vector unsigned int __ATTRS_o_ai
+vec_vadduws(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vadduws(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vadduws(vector bool int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vadduws((vector unsigned int)__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vadduws(vector unsigned int __a, vector bool int __b)
+{
+  return __builtin_altivec_vadduws(__a, (vector unsigned int)__b);
+}
+
+/* vec_and */
+
+#define __builtin_altivec_vand vec_and
+
+static vector signed char __ATTRS_o_ai
+vec_and(vector signed char __a, vector signed char __b)
+{
+  return __a & __b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_and(vector bool char __a, vector signed char __b)
+{
+  return (vector signed char)__a & __b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_and(vector signed char __a, vector bool char __b)
+{
+  return __a & (vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_and(vector unsigned char __a, vector unsigned char __b)
+{
+  return __a & __b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_and(vector bool char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)__a & __b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_and(vector unsigned char __a, vector bool char __b)
+{
+  return __a & (vector unsigned char)__b;
+}
+
+static vector bool char __ATTRS_o_ai
+vec_and(vector bool char __a, vector bool char __b)
+{
+  return __a & __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_and(vector short __a, vector short __b)
+{
+  return __a & __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_and(vector bool short __a, vector short __b)
+{
+  return (vector short)__a & __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_and(vector short __a, vector bool short __b)
+{
+  return __a & (vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_and(vector unsigned short __a, vector unsigned short __b)
+{
+  return __a & __b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_and(vector bool short __a, vector unsigned short __b)
+{
+  return (vector unsigned short)__a & __b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_and(vector unsigned short __a, vector bool short __b)
+{
+  return __a & (vector unsigned short)__b;
+}
+
+static vector bool short __ATTRS_o_ai
+vec_and(vector bool short __a, vector bool short __b)
+{
+  return __a & __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_and(vector int __a, vector int __b)
+{
+  return __a & __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_and(vector bool int __a, vector int __b)
+{
+  return (vector int)__a & __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_and(vector int __a, vector bool int __b)
+{
+  return __a & (vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_and(vector unsigned int __a, vector unsigned int __b)
+{
+  return __a & __b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_and(vector bool int __a, vector unsigned int __b)
+{
+  return (vector unsigned int)__a & __b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_and(vector unsigned int __a, vector bool int __b)
+{
+  return __a & (vector unsigned int)__b;
+}
+
+static vector bool int __ATTRS_o_ai
+vec_and(vector bool int __a, vector bool int __b)
+{
+  return __a & __b;
+}
+
+static vector float __ATTRS_o_ai
+vec_and(vector float __a, vector float __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a & (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai
+vec_and(vector bool int __a, vector float __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a & (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai
+vec_and(vector float __a, vector bool int __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a & (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+/* vec_vand */
+
+static vector signed char __ATTRS_o_ai
+vec_vand(vector signed char __a, vector signed char __b)
+{
+  return __a & __b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vand(vector bool char __a, vector signed char __b)
+{
+  return (vector signed char)__a & __b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vand(vector signed char __a, vector bool char __b)
+{
+  return __a & (vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vand(vector unsigned char __a, vector unsigned char __b)
+{
+  return __a & __b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vand(vector bool char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)__a & __b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vand(vector unsigned char __a, vector bool char __b)
+{
+  return __a & (vector unsigned char)__b;
+}
+
+static vector bool char __ATTRS_o_ai
+vec_vand(vector bool char __a, vector bool char __b)
+{
+  return __a & __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_vand(vector short __a, vector short __b)
+{
+  return __a & __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_vand(vector bool short __a, vector short __b)
+{
+  return (vector short)__a & __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_vand(vector short __a, vector bool short __b)
+{
+  return __a & (vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vand(vector unsigned short __a, vector unsigned short __b)
+{
+  return __a & __b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vand(vector bool short __a, vector unsigned short __b)
+{
+  return (vector unsigned short)__a & __b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vand(vector unsigned short __a, vector bool short __b)
+{
+  return __a & (vector unsigned short)__b;
+}
+
+static vector bool short __ATTRS_o_ai
+vec_vand(vector bool short __a, vector bool short __b)
+{
+  return __a & __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_vand(vector int __a, vector int __b)
+{
+  return __a & __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_vand(vector bool int __a, vector int __b)
+{
+  return (vector int)__a & __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_vand(vector int __a, vector bool int __b)
+{
+  return __a & (vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vand(vector unsigned int __a, vector unsigned int __b)
+{
+  return __a & __b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vand(vector bool int __a, vector unsigned int __b)
+{
+  return (vector unsigned int)__a & __b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vand(vector unsigned int __a, vector bool int __b)
+{
+  return __a & (vector unsigned int)__b;
+}
+
+static vector bool int __ATTRS_o_ai
+vec_vand(vector bool int __a, vector bool int __b)
+{
+  return __a & __b;
+}
+
+static vector float __ATTRS_o_ai
+vec_vand(vector float __a, vector float __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a & (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai
+vec_vand(vector bool int __a, vector float __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a & (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai
+vec_vand(vector float __a, vector bool int __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a & (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+/* vec_andc */
+
+#define __builtin_altivec_vandc vec_andc
+
+static vector signed char __ATTRS_o_ai
+vec_andc(vector signed char __a, vector signed char __b)
+{
+  return __a & ~__b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_andc(vector bool char __a, vector signed char __b)
+{
+  return (vector signed char)__a & ~__b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_andc(vector signed char __a, vector bool char __b)
+{
+  return __a & ~(vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_andc(vector unsigned char __a, vector unsigned char __b)
+{
+  return __a & ~__b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_andc(vector bool char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)__a & ~__b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_andc(vector unsigned char __a, vector bool char __b)
+{
+  return __a & ~(vector unsigned char)__b;
+}
+
+static vector bool char __ATTRS_o_ai
+vec_andc(vector bool char __a, vector bool char __b)
+{
+  return __a & ~__b;
+}
+
+static vector short __ATTRS_o_ai
+vec_andc(vector short __a, vector short __b)
+{
+  return __a & ~__b;
+}
+
+static vector short __ATTRS_o_ai
+vec_andc(vector bool short __a, vector short __b)
+{
+  return (vector short)__a & ~__b;
+}
+
+static vector short __ATTRS_o_ai
+vec_andc(vector short __a, vector bool short __b)
+{
+  return __a & ~(vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_andc(vector unsigned short __a, vector unsigned short __b)
+{
+  return __a & ~__b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_andc(vector bool short __a, vector unsigned short __b)
+{
+  return (vector unsigned short)__a & ~__b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_andc(vector unsigned short __a, vector bool short __b)
+{
+  return __a & ~(vector unsigned short)__b;
+}
+
+static vector bool short __ATTRS_o_ai
+vec_andc(vector bool short __a, vector bool short __b)
+{
+  return __a & ~__b;
+}
+
+static vector int __ATTRS_o_ai
+vec_andc(vector int __a, vector int __b)
+{
+  return __a & ~__b;
+}
+
+static vector int __ATTRS_o_ai
+vec_andc(vector bool int __a, vector int __b)
+{
+  return (vector int)__a & ~__b;
+}
+
+static vector int __ATTRS_o_ai
+vec_andc(vector int __a, vector bool int __b)
+{
+  return __a & ~(vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_andc(vector unsigned int __a, vector unsigned int __b)
+{
+  return __a & ~__b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_andc(vector bool int __a, vector unsigned int __b)
+{
+  return (vector unsigned int)__a & ~__b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_andc(vector unsigned int __a, vector bool int __b)
+{
+  return __a & ~(vector unsigned int)__b;
+}
+
+static vector bool int __ATTRS_o_ai
+vec_andc(vector bool int __a, vector bool int __b)
+{
+  return __a & ~__b;
+}
+
+static vector float __ATTRS_o_ai
+vec_andc(vector float __a, vector float __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a & ~(vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai
+vec_andc(vector bool int __a, vector float __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a & ~(vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai
+vec_andc(vector float __a, vector bool int __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a & ~(vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+/* vec_vandc */
+
+static vector signed char __ATTRS_o_ai
+vec_vandc(vector signed char __a, vector signed char __b)
+{
+  return __a & ~__b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vandc(vector bool char __a, vector signed char __b)
+{
+  return (vector signed char)__a & ~__b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vandc(vector signed char __a, vector bool char __b)
+{
+  return __a & ~(vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vandc(vector unsigned char __a, vector unsigned char __b)
+{
+  return __a & ~__b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vandc(vector bool char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)__a & ~__b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vandc(vector unsigned char __a, vector bool char __b)
+{
+  return __a & ~(vector unsigned char)__b;
+}
+
+static vector bool char __ATTRS_o_ai
+vec_vandc(vector bool char __a, vector bool char __b)
+{
+  return __a & ~__b;
+}
+
+static vector short __ATTRS_o_ai
+vec_vandc(vector short __a, vector short __b)
+{
+  return __a & ~__b;
+}
+
+static vector short __ATTRS_o_ai
+vec_vandc(vector bool short __a, vector short __b)
+{
+  return (vector short)__a & ~__b;
+}
+
+static vector short __ATTRS_o_ai
+vec_vandc(vector short __a, vector bool short __b)
+{
+  return __a & ~(vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vandc(vector unsigned short __a, vector unsigned short __b)
+{
+  return __a & ~__b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vandc(vector bool short __a, vector unsigned short __b)
+{
+  return (vector unsigned short)__a & ~__b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vandc(vector unsigned short __a, vector bool short __b)
+{
+  return __a & ~(vector unsigned short)__b;
+}
+
+static vector bool short __ATTRS_o_ai
+vec_vandc(vector bool short __a, vector bool short __b)
+{
+  return __a & ~__b;
+}
+
+static vector int __ATTRS_o_ai
+vec_vandc(vector int __a, vector int __b)
+{
+  return __a & ~__b;
+}
+
+static vector int __ATTRS_o_ai
+vec_vandc(vector bool int __a, vector int __b)
+{
+  return (vector int)__a & ~__b;
+}
+
+static vector int __ATTRS_o_ai
+vec_vandc(vector int __a, vector bool int __b)
+{
+  return __a & ~(vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vandc(vector unsigned int __a, vector unsigned int __b)
+{
+  return __a & ~__b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vandc(vector bool int __a, vector unsigned int __b)
+{
+  return (vector unsigned int)__a & ~__b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vandc(vector unsigned int __a, vector bool int __b)
+{
+  return __a & ~(vector unsigned int)__b;
+}
+
+static vector bool int __ATTRS_o_ai
+vec_vandc(vector bool int __a, vector bool int __b)
+{
+  return __a & ~__b;
+}
+
+static vector float __ATTRS_o_ai
+vec_vandc(vector float __a, vector float __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a & ~(vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai
+vec_vandc(vector bool int __a, vector float __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a & ~(vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai
+vec_vandc(vector float __a, vector bool int __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a & ~(vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+/* vec_avg */
+
+static vector signed char __ATTRS_o_ai
+vec_avg(vector signed char __a, vector signed char __b)
+{
+  return __builtin_altivec_vavgsb(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_avg(vector unsigned char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vavgub(__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_avg(vector short __a, vector short __b)
+{
+  return __builtin_altivec_vavgsh(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_avg(vector unsigned short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vavguh(__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_avg(vector int __a, vector int __b)
+{
+  return __builtin_altivec_vavgsw(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_avg(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vavguw(__a, __b);
+}
+
+/* vec_vavgsb */
+
+static vector signed char __attribute__((__always_inline__))
+vec_vavgsb(vector signed char __a, vector signed char __b)
+{
+  return __builtin_altivec_vavgsb(__a, __b);
+}
+
+/* vec_vavgub */
+
+static vector unsigned char __attribute__((__always_inline__))
+vec_vavgub(vector unsigned char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vavgub(__a, __b);
+}
+
+/* vec_vavgsh */
+
+static vector short __attribute__((__always_inline__))
+vec_vavgsh(vector short __a, vector short __b)
+{
+  return __builtin_altivec_vavgsh(__a, __b);
+}
+
+/* vec_vavguh */
+
+static vector unsigned short __attribute__((__always_inline__))
+vec_vavguh(vector unsigned short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vavguh(__a, __b);
+}
+
+/* vec_vavgsw */
+
+static vector int __attribute__((__always_inline__))
+vec_vavgsw(vector int __a, vector int __b)
+{
+  return __builtin_altivec_vavgsw(__a, __b);
+}
+
+/* vec_vavguw */
+
+static vector unsigned int __attribute__((__always_inline__))
+vec_vavguw(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vavguw(__a, __b);
+}
+
+/* vec_ceil */
+
+static vector float __attribute__((__always_inline__))
+vec_ceil(vector float __a)
+{
+  return __builtin_altivec_vrfip(__a);
+}
+
+/* vec_vrfip */
+
+static vector float __attribute__((__always_inline__))
+vec_vrfip(vector float __a)
+{
+  return __builtin_altivec_vrfip(__a);
+}
+
+/* vec_cmpb */
+
+static vector int __attribute__((__always_inline__))
+vec_cmpb(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpbfp(__a, __b);
+}
+
+/* vec_vcmpbfp */
+
+static vector int __attribute__((__always_inline__))
+vec_vcmpbfp(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpbfp(__a, __b);
+}
+
+/* vec_cmpeq */
+
+static vector bool char __ATTRS_o_ai
+vec_cmpeq(vector signed char __a, vector signed char __b)
+{
+  return (vector bool char)
+    __builtin_altivec_vcmpequb((vector char)__a, (vector char)__b);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_cmpeq(vector unsigned char __a, vector unsigned char __b)
+{
+  return (vector bool char)
+    __builtin_altivec_vcmpequb((vector char)__a, (vector char)__b);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_cmpeq(vector short __a, vector short __b)
+{
+  return (vector bool short)__builtin_altivec_vcmpequh(__a, __b);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_cmpeq(vector unsigned short __a, vector unsigned short __b)
+{
+  return (vector bool short)
+    __builtin_altivec_vcmpequh((vector short)__a, (vector short)__b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_cmpeq(vector int __a, vector int __b)
+{
+  return (vector bool int)__builtin_altivec_vcmpequw(__a, __b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_cmpeq(vector unsigned int __a, vector unsigned int __b)
+{
+  return (vector bool int)
+    __builtin_altivec_vcmpequw((vector int)__a, (vector int)__b);
+}
+
+#ifdef __POWER8_VECTOR__
+static vector bool long long __ATTRS_o_ai
+vec_cmpeq(vector signed long long __a, vector signed long long __b) 
+{
+  return (vector bool long long) __builtin_altivec_vcmpequd(__a, __b);
+}
+
+static vector bool long long __ATTRS_o_ai
+vec_cmpeq(vector unsigned long long __a, vector unsigned long long __b) 
+{
+  return (vector bool long long) 
+    __builtin_altivec_vcmpequd((vector long long)__a, (vector long long) __b);
+}
+#endif
+
+static vector bool int __ATTRS_o_ai
+vec_cmpeq(vector float __a, vector float __b)
+{
+  return (vector bool int)__builtin_altivec_vcmpeqfp(__a, __b);
+}
+
+/* vec_cmpge */
+
+static vector bool int __attribute__((__always_inline__))
+vec_cmpge(vector float __a, vector float __b)
+{
+  return (vector bool int)__builtin_altivec_vcmpgefp(__a, __b);
+}
+
+/* vec_vcmpgefp */
+
+static vector bool int __attribute__((__always_inline__))
+vec_vcmpgefp(vector float __a, vector float __b)
+{
+  return (vector bool int)__builtin_altivec_vcmpgefp(__a, __b);
+}
+
+/* vec_cmpgt */
+
+static vector bool char __ATTRS_o_ai
+vec_cmpgt(vector signed char __a, vector signed char __b)
+{
+  return (vector bool char)__builtin_altivec_vcmpgtsb(__a, __b);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_cmpgt(vector unsigned char __a, vector unsigned char __b)
+{
+  return (vector bool char)__builtin_altivec_vcmpgtub(__a, __b);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_cmpgt(vector short __a, vector short __b)
+{
+  return (vector bool short)__builtin_altivec_vcmpgtsh(__a, __b);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_cmpgt(vector unsigned short __a, vector unsigned short __b)
+{
+  return (vector bool short)__builtin_altivec_vcmpgtuh(__a, __b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_cmpgt(vector int __a, vector int __b)
+{
+  return (vector bool int)__builtin_altivec_vcmpgtsw(__a, __b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_cmpgt(vector unsigned int __a, vector unsigned int __b)
+{
+  return (vector bool int)__builtin_altivec_vcmpgtuw(__a, __b);
+}
+
+#ifdef __POWER8_VECTOR__
+static vector bool long long __ATTRS_o_ai
+vec_cmpgt(vector signed long long __a, vector signed long long __b)
+{
+  return (vector bool long long)__builtin_altivec_vcmpgtsd(__a, __b);
+}
+
+static vector bool long long __ATTRS_o_ai
+vec_cmpgt(vector unsigned long long __a, vector unsigned long long __b)
+{
+  return (vector bool long long)__builtin_altivec_vcmpgtud(__a, __b);
+}
+#endif
+
+static vector bool int __ATTRS_o_ai
+vec_cmpgt(vector float __a, vector float __b)
+{
+  return (vector bool int)__builtin_altivec_vcmpgtfp(__a, __b);
+}
+
+/* vec_vcmpgtsb */
+
+static vector bool char __attribute__((__always_inline__))
+vec_vcmpgtsb(vector signed char __a, vector signed char __b)
+{
+  return (vector bool char)__builtin_altivec_vcmpgtsb(__a, __b);
+}
+
+/* vec_vcmpgtub */
+
+static vector bool char __attribute__((__always_inline__))
+vec_vcmpgtub(vector unsigned char __a, vector unsigned char __b)
+{
+  return (vector bool char)__builtin_altivec_vcmpgtub(__a, __b);
+}
+
+/* vec_vcmpgtsh */
+
+static vector bool short __attribute__((__always_inline__))
+vec_vcmpgtsh(vector short __a, vector short __b)
+{
+  return (vector bool short)__builtin_altivec_vcmpgtsh(__a, __b);
+}
+
+/* vec_vcmpgtuh */
+
+static vector bool short __attribute__((__always_inline__))
+vec_vcmpgtuh(vector unsigned short __a, vector unsigned short __b)
+{
+  return (vector bool short)__builtin_altivec_vcmpgtuh(__a, __b);
+}
+
+/* vec_vcmpgtsw */
+
+static vector bool int __attribute__((__always_inline__))
+vec_vcmpgtsw(vector int __a, vector int __b)
+{
+  return (vector bool int)__builtin_altivec_vcmpgtsw(__a, __b);
+}
+
+/* vec_vcmpgtuw */
+
+static vector bool int __attribute__((__always_inline__))
+vec_vcmpgtuw(vector unsigned int __a, vector unsigned int __b)
+{
+  return (vector bool int)__builtin_altivec_vcmpgtuw(__a, __b);
+}
+
+/* vec_vcmpgtfp */
+
+static vector bool int __attribute__((__always_inline__))
+vec_vcmpgtfp(vector float __a, vector float __b)
+{
+  return (vector bool int)__builtin_altivec_vcmpgtfp(__a, __b);
+}
+
+/* vec_cmple */
+
+static vector bool int __attribute__((__always_inline__))
+vec_cmple(vector float __a, vector float __b)
+{
+  return (vector bool int)__builtin_altivec_vcmpgefp(__b, __a);
+}
+
+/* vec_cmplt */
+
+static vector bool char __ATTRS_o_ai
+vec_cmplt(vector signed char __a, vector signed char __b)
+{
+  return (vector bool char)__builtin_altivec_vcmpgtsb(__b, __a);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_cmplt(vector unsigned char __a, vector unsigned char __b)
+{
+  return (vector bool char)__builtin_altivec_vcmpgtub(__b, __a);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_cmplt(vector short __a, vector short __b)
+{
+  return (vector bool short)__builtin_altivec_vcmpgtsh(__b, __a);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_cmplt(vector unsigned short __a, vector unsigned short __b)
+{
+  return (vector bool short)__builtin_altivec_vcmpgtuh(__b, __a);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_cmplt(vector int __a, vector int __b)
+{
+  return (vector bool int)__builtin_altivec_vcmpgtsw(__b, __a);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_cmplt(vector unsigned int __a, vector unsigned int __b)
+{
+  return (vector bool int)__builtin_altivec_vcmpgtuw(__b, __a);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_cmplt(vector float __a, vector float __b)
+{
+  return (vector bool int)__builtin_altivec_vcmpgtfp(__b, __a);
+}
+
+/* vec_ctf */
+
+static vector float __ATTRS_o_ai
+vec_ctf(vector int __a, int __b)
+{
+  return __builtin_altivec_vcfsx(__a, __b);
+}
+
+static vector float __ATTRS_o_ai
+vec_ctf(vector unsigned int __a, int __b)
+{
+  return __builtin_altivec_vcfux((vector int)__a, __b);
+}
+
+/* vec_vcfsx */
+
+static vector float __attribute__((__always_inline__))
+vec_vcfsx(vector int __a, int __b)
+{
+  return __builtin_altivec_vcfsx(__a, __b);
+}
+
+/* vec_vcfux */
+
+static vector float __attribute__((__always_inline__))
+vec_vcfux(vector unsigned int __a, int __b)
+{
+  return __builtin_altivec_vcfux((vector int)__a, __b);
+}
+
+/* vec_cts */
+
+static vector int __attribute__((__always_inline__))
+vec_cts(vector float __a, int __b)
+{
+  return __builtin_altivec_vctsxs(__a, __b);
+}
+
+/* vec_vctsxs */
+
+static vector int __attribute__((__always_inline__))
+vec_vctsxs(vector float __a, int __b)
+{
+  return __builtin_altivec_vctsxs(__a, __b);
+}
+
+/* vec_ctu */
+
+static vector unsigned int __attribute__((__always_inline__))
+vec_ctu(vector float __a, int __b)
+{
+  return __builtin_altivec_vctuxs(__a, __b);
+}
+
+/* vec_vctuxs */
+
+static vector unsigned int __attribute__((__always_inline__))
+vec_vctuxs(vector float __a, int __b)
+{
+  return __builtin_altivec_vctuxs(__a, __b);
+}
+
+/* vec_div */
+#ifdef __VSX__
+static vector float __ATTRS_o_ai
+vec_div(vector float __a, vector float __b)
+{
+  return __builtin_vsx_xvdivsp(__a, __b);
+}
+
+static vector double __ATTRS_o_ai
+vec_div(vector double __a, vector double __b)
+{
+  return __builtin_vsx_xvdivdp(__a, __b);
+}
+#endif
+
+/* vec_dss */
+
+static void __attribute__((__always_inline__))
+vec_dss(int __a)
+{
+  __builtin_altivec_dss(__a);
+}
+
+/* vec_dssall */
+
+static void __attribute__((__always_inline__))
+vec_dssall(void)
+{
+  __builtin_altivec_dssall();
+}
+
+/* vec_dst */
+
+static void __attribute__((__always_inline__))
+vec_dst(const void *__a, int __b, int __c)
+{
+  __builtin_altivec_dst(__a, __b, __c);
+}
+
+/* vec_dstst */
+
+static void __attribute__((__always_inline__))
+vec_dstst(const void *__a, int __b, int __c)
+{
+  __builtin_altivec_dstst(__a, __b, __c);
+}
+
+/* vec_dststt */
+
+static void __attribute__((__always_inline__))
+vec_dststt(const void *__a, int __b, int __c)
+{
+  __builtin_altivec_dststt(__a, __b, __c);
+}
+
+/* vec_dstt */
+
+static void __attribute__((__always_inline__))
+vec_dstt(const void *__a, int __b, int __c)
+{
+  __builtin_altivec_dstt(__a, __b, __c);
+}
+
+/* vec_expte */
+
+static vector float __attribute__((__always_inline__))
+vec_expte(vector float __a)
+{
+  return __builtin_altivec_vexptefp(__a);
+}
+
+/* vec_vexptefp */
+
+static vector float __attribute__((__always_inline__))
+vec_vexptefp(vector float __a)
+{
+  return __builtin_altivec_vexptefp(__a);
+}
+
+/* vec_floor */
+
+static vector float __attribute__((__always_inline__))
+vec_floor(vector float __a)
+{
+  return __builtin_altivec_vrfim(__a);
+}
+
+/* vec_vrfim */
+
+static vector float __attribute__((__always_inline__))
+vec_vrfim(vector float __a)
+{
+  return __builtin_altivec_vrfim(__a);
+}
+
+/* vec_ld */
+
+static vector signed char __ATTRS_o_ai
+vec_ld(int __a, const vector signed char *__b)
+{
+  return (vector signed char)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_ld(int __a, const signed char *__b)
+{
+  return (vector signed char)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_ld(int __a, const vector unsigned char *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_ld(int __a, const unsigned char *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_ld(int __a, const vector bool char *__b)
+{
+  return (vector bool char)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_ld(int __a, const vector short *__b)
+{
+  return (vector short)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_ld(int __a, const short *__b)
+{
+  return (vector short)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_ld(int __a, const vector unsigned short *__b)
+{
+  return (vector unsigned short)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_ld(int __a, const unsigned short *__b)
+{
+  return (vector unsigned short)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_ld(int __a, const vector bool short *__b)
+{
+  return (vector bool short)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_ld(int __a, const vector pixel *__b)
+{
+  return (vector pixel)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_ld(int __a, const vector int *__b)
+{
+  return (vector int)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_ld(int __a, const int *__b)
+{
+  return (vector int)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_ld(int __a, const vector unsigned int *__b)
+{
+  return (vector unsigned int)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_ld(int __a, const unsigned int *__b)
+{
+  return (vector unsigned int)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_ld(int __a, const vector bool int *__b)
+{
+  return (vector bool int)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector float __ATTRS_o_ai
+vec_ld(int __a, const vector float *__b)
+{
+  return (vector float)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector float __ATTRS_o_ai
+vec_ld(int __a, const float *__b)
+{
+  return (vector float)__builtin_altivec_lvx(__a, __b);
+}
+
+/* vec_lvx */
+
+static vector signed char __ATTRS_o_ai
+vec_lvx(int __a, const vector signed char *__b)
+{
+  return (vector signed char)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_lvx(int __a, const signed char *__b)
+{
+  return (vector signed char)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvx(int __a, const vector unsigned char *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvx(int __a, const unsigned char *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_lvx(int __a, const vector bool char *__b)
+{
+  return (vector bool char)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_lvx(int __a, const vector short *__b)
+{
+  return (vector short)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_lvx(int __a, const short *__b)
+{
+  return (vector short)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_lvx(int __a, const vector unsigned short *__b)
+{
+  return (vector unsigned short)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_lvx(int __a, const unsigned short *__b)
+{
+  return (vector unsigned short)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_lvx(int __a, const vector bool short *__b)
+{
+  return (vector bool short)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_lvx(int __a, const vector pixel *__b)
+{
+  return (vector pixel)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_lvx(int __a, const vector int *__b)
+{
+  return (vector int)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_lvx(int __a, const int *__b)
+{
+  return (vector int)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_lvx(int __a, const vector unsigned int *__b)
+{
+  return (vector unsigned int)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_lvx(int __a, const unsigned int *__b)
+{
+  return (vector unsigned int)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_lvx(int __a, const vector bool int *__b)
+{
+  return (vector bool int)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector float __ATTRS_o_ai
+vec_lvx(int __a, const vector float *__b)
+{
+  return (vector float)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector float __ATTRS_o_ai
+vec_lvx(int __a, const float *__b)
+{
+  return (vector float)__builtin_altivec_lvx(__a, __b);
+}
+
+/* vec_lde */
+
+static vector signed char __ATTRS_o_ai
+vec_lde(int __a, const signed char *__b)
+{
+  return (vector signed char)__builtin_altivec_lvebx(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lde(int __a, const unsigned char *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvebx(__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_lde(int __a, const short *__b)
+{
+  return (vector short)__builtin_altivec_lvehx(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_lde(int __a, const unsigned short *__b)
+{
+  return (vector unsigned short)__builtin_altivec_lvehx(__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_lde(int __a, const int *__b)
+{
+  return (vector int)__builtin_altivec_lvewx(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_lde(int __a, const unsigned int *__b)
+{
+  return (vector unsigned int)__builtin_altivec_lvewx(__a, __b);
+}
+
+static vector float __ATTRS_o_ai
+vec_lde(int __a, const float *__b)
+{
+  return (vector float)__builtin_altivec_lvewx(__a, __b);
+}
+
+/* vec_lvebx */
+
+static vector signed char __ATTRS_o_ai
+vec_lvebx(int __a, const signed char *__b)
+{
+  return (vector signed char)__builtin_altivec_lvebx(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvebx(int __a, const unsigned char *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvebx(__a, __b);
+}
+
+/* vec_lvehx */
+
+static vector short __ATTRS_o_ai
+vec_lvehx(int __a, const short *__b)
+{
+  return (vector short)__builtin_altivec_lvehx(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_lvehx(int __a, const unsigned short *__b)
+{
+  return (vector unsigned short)__builtin_altivec_lvehx(__a, __b);
+}
+
+/* vec_lvewx */
+
+static vector int __ATTRS_o_ai
+vec_lvewx(int __a, const int *__b)
+{
+  return (vector int)__builtin_altivec_lvewx(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_lvewx(int __a, const unsigned int *__b)
+{
+  return (vector unsigned int)__builtin_altivec_lvewx(__a, __b);
+}
+
+static vector float __ATTRS_o_ai
+vec_lvewx(int __a, const float *__b)
+{
+  return (vector float)__builtin_altivec_lvewx(__a, __b);
+}
+
+/* vec_ldl */
+
+static vector signed char __ATTRS_o_ai
+vec_ldl(int __a, const vector signed char *__b)
+{
+  return (vector signed char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_ldl(int __a, const signed char *__b)
+{
+  return (vector signed char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_ldl(int __a, const vector unsigned char *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_ldl(int __a, const unsigned char *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_ldl(int __a, const vector bool char *__b)
+{
+  return (vector bool char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_ldl(int __a, const vector short *__b)
+{
+  return (vector short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_ldl(int __a, const short *__b)
+{
+  return (vector short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_ldl(int __a, const vector unsigned short *__b)
+{
+  return (vector unsigned short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_ldl(int __a, const unsigned short *__b)
+{
+  return (vector unsigned short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_ldl(int __a, const vector bool short *__b)
+{
+  return (vector bool short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_ldl(int __a, const vector pixel *__b)
+{
+  return (vector pixel)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_ldl(int __a, const vector int *__b)
+{
+  return (vector int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_ldl(int __a, const int *__b)
+{
+  return (vector int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_ldl(int __a, const vector unsigned int *__b)
+{
+  return (vector unsigned int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_ldl(int __a, const unsigned int *__b)
+{
+  return (vector unsigned int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_ldl(int __a, const vector bool int *__b)
+{
+  return (vector bool int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector float __ATTRS_o_ai
+vec_ldl(int __a, const vector float *__b)
+{
+  return (vector float)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector float __ATTRS_o_ai
+vec_ldl(int __a, const float *__b)
+{
+  return (vector float)__builtin_altivec_lvxl(__a, __b);
+}
+
+/* vec_lvxl */
+
+static vector signed char __ATTRS_o_ai
+vec_lvxl(int __a, const vector signed char *__b)
+{
+  return (vector signed char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_lvxl(int __a, const signed char *__b)
+{
+  return (vector signed char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvxl(int __a, const vector unsigned char *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvxl(int __a, const unsigned char *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_lvxl(int __a, const vector bool char *__b)
+{
+  return (vector bool char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_lvxl(int __a, const vector short *__b)
+{
+  return (vector short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_lvxl(int __a, const short *__b)
+{
+  return (vector short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_lvxl(int __a, const vector unsigned short *__b)
+{
+  return (vector unsigned short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_lvxl(int __a, const unsigned short *__b)
+{
+  return (vector unsigned short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_lvxl(int __a, const vector bool short *__b)
+{
+  return (vector bool short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_lvxl(int __a, const vector pixel *__b)
+{
+  return (vector pixel)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_lvxl(int __a, const vector int *__b)
+{
+  return (vector int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_lvxl(int __a, const int *__b)
+{
+  return (vector int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_lvxl(int __a, const vector unsigned int *__b)
+{
+  return (vector unsigned int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_lvxl(int __a, const unsigned int *__b)
+{
+  return (vector unsigned int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_lvxl(int __a, const vector bool int *__b)
+{
+  return (vector bool int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector float __ATTRS_o_ai
+vec_lvxl(int __a, const vector float *__b)
+{
+  return (vector float)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector float __ATTRS_o_ai
+vec_lvxl(int __a, const float *__b)
+{
+  return (vector float)__builtin_altivec_lvxl(__a, __b);
+}
+
+/* vec_loge */
+
+static vector float __attribute__((__always_inline__))
+vec_loge(vector float __a)
+{
+  return __builtin_altivec_vlogefp(__a);
+}
+
+/* vec_vlogefp */
+
+static vector float __attribute__((__always_inline__))
+vec_vlogefp(vector float __a)
+{
+  return __builtin_altivec_vlogefp(__a);
+}
+
+/* vec_lvsl */
+
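+/* Illustrative sketch (hypothetical pointer __p, result __v): the
+ * deprecated little endian overloads below suggest plain assignment in
+ * place of the classic unaligned-load idiom,
+ *
+ *   __v = vec_perm(vec_ld(0, __p), vec_ld(15, __p), vec_lvsl(0, __p));
+ *
+ * i.e. something like
+ *
+ *   vector unsigned char __v = *(vector unsigned char *)__p;
+ *
+ * letting the compiler emit the unaligned access directly.
+ */
+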
+#ifdef __LITTLE_ENDIAN__
+static vector unsigned char __ATTRS_o_ai
+__attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores")))
+vec_lvsl(int __a, const signed char *__b)
+{
+  vector unsigned char mask = 
+    (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+  vector unsigned char reverse = {15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static vector unsigned char __ATTRS_o_ai
+vec_lvsl(int __a, const signed char *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+static vector unsigned char __ATTRS_o_ai
+__attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores")))
+vec_lvsl(int __a, const unsigned char *__b)
+{
+  vector unsigned char mask =
+    (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+  vector unsigned char reverse = {15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static vector unsigned char __ATTRS_o_ai
+vec_lvsl(int __a, const unsigned char *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+static vector unsigned char __ATTRS_o_ai
+__attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores")))
+vec_lvsl(int __a, const short *__b)
+{
+  vector unsigned char mask =
+    (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+  vector unsigned char reverse = {15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static vector unsigned char __ATTRS_o_ai
+vec_lvsl(int __a, const short *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+static vector unsigned char __ATTRS_o_ai
+__attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores")))
+vec_lvsl(int __a, const unsigned short *__b)
+{
+  vector unsigned char mask =
+    (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+  vector unsigned char reverse = {15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static vector unsigned char __ATTRS_o_ai
+vec_lvsl(int __a, const unsigned short *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+static vector unsigned char __ATTRS_o_ai
+__attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores")))
+vec_lvsl(int __a, const int *__b)
+{
+  vector unsigned char mask =
+    (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+  vector unsigned char reverse = {15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static vector unsigned char __ATTRS_o_ai
+vec_lvsl(int __a, const int *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+static vector unsigned char __ATTRS_o_ai
+__attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores")))
+vec_lvsl(int __a, const unsigned int *__b)
+{
+  vector unsigned char mask =
+    (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+  vector unsigned char reverse = {15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static vector unsigned char __ATTRS_o_ai
+vec_lvsl(int __a, const unsigned int *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+static vector unsigned char __ATTRS_o_ai
+__attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores")))
+vec_lvsl(int __a, const float *__b)
+{
+  vector unsigned char mask =
+    (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+  vector unsigned char reverse = {15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static vector unsigned char __ATTRS_o_ai
+vec_lvsl(int __a, const float *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+}
+#endif
+
+/* vec_lvsr */
+
+#ifdef __LITTLE_ENDIAN__
+static vector unsigned char __ATTRS_o_ai
+__attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores")))
+vec_lvsr(int __a, const signed char *__b)
+{
+  vector unsigned char mask =
+    (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+  vector unsigned char reverse = {15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static vector unsigned char __ATTRS_o_ai
+vec_lvsr(int __a, const signed char *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+static vector unsigned char __ATTRS_o_ai
+__attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores")))
+vec_lvsr(int __a, const unsigned char *__b)
+{
+  vector unsigned char mask =
+    (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+  vector unsigned char reverse = {15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static vector unsigned char __ATTRS_o_ai
+vec_lvsr(int __a, const unsigned char *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+static vector unsigned char __ATTRS_o_ai
+__attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores")))
+vec_lvsr(int __a, const short *__b)
+{
+  vector unsigned char mask =
+    (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+  vector unsigned char reverse = {15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static vector unsigned char __ATTRS_o_ai
+vec_lvsr(int __a, const short *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+static vector unsigned char __ATTRS_o_ai
+__attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores")))
+vec_lvsr(int __a, const unsigned short *__b)
+{
+  vector unsigned char mask =
+    (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+  vector unsigned char reverse = {15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static vector unsigned char __ATTRS_o_ai
+vec_lvsr(int __a, const unsigned short *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+static vector unsigned char __ATTRS_o_ai
+__attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores")))
+vec_lvsr(int __a, const int *__b)
+{
+  vector unsigned char mask =
+    (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+  vector unsigned char reverse = {15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static vector unsigned char __ATTRS_o_ai
+vec_lvsr(int __a, const int *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+static vector unsigned char __ATTRS_o_ai
+__attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores")))
+vec_lvsr(int __a, const unsigned int *__b)
+{
+  vector unsigned char mask =
+    (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+  vector unsigned char reverse = {15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static vector unsigned char __ATTRS_o_ai
+vec_lvsr(int __a, const unsigned int *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+static vector unsigned char __ATTRS_o_ai
+__attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores")))
+vec_lvsr(int __a, const float *__b)
+{
+  vector unsigned char mask =
+    (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+  vector unsigned char reverse = {15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static vector unsigned char __ATTRS_o_ai
+vec_lvsr(int __a, const float *__b)
+{
+  return (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+}
+#endif
+
+/* vec_madd */
+
+static vector float __attribute__((__always_inline__))
+vec_madd(vector float __a, vector float __b, vector float __c)
+{
+  return __builtin_altivec_vmaddfp(__a, __b, __c);
+}
+
+/* vec_vmaddfp */
+
+static vector float __attribute__((__always_inline__))
+vec_vmaddfp(vector float __a, vector float __b, vector float __c)
+{
+  return __builtin_altivec_vmaddfp(__a, __b, __c);
+}
+
+/* vec_madds */
+
+static vector signed short __attribute__((__always_inline__))
+vec_madds(vector signed short __a, vector signed short __b, vector signed short __c)
+{
+  return __builtin_altivec_vmhaddshs(__a, __b, __c);
+}
+
+/* vec_vmhaddshs */
+static vector signed short __attribute__((__always_inline__))
+vec_vmhaddshs(vector signed short __a,
+              vector signed short __b,
+              vector signed short __c)
+{
+  return __builtin_altivec_vmhaddshs(__a, __b, __c);
+}
+
+/* vec_max */
+
+static vector signed char __ATTRS_o_ai
+vec_max(vector signed char __a, vector signed char __b)
+{
+  return __builtin_altivec_vmaxsb(__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_max(vector bool char __a, vector signed char __b)
+{
+  return __builtin_altivec_vmaxsb((vector signed char)__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_max(vector signed char __a, vector bool char __b)
+{
+  return __builtin_altivec_vmaxsb(__a, (vector signed char)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_max(vector unsigned char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vmaxub(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_max(vector bool char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vmaxub((vector unsigned char)__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_max(vector unsigned char __a, vector bool char __b)
+{
+  return __builtin_altivec_vmaxub(__a, (vector unsigned char)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_max(vector short __a, vector short __b)
+{
+  return __builtin_altivec_vmaxsh(__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_max(vector bool short __a, vector short __b)
+{
+  return __builtin_altivec_vmaxsh((vector short)__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_max(vector short __a, vector bool short __b)
+{
+  return __builtin_altivec_vmaxsh(__a, (vector short)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_max(vector unsigned short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vmaxuh(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_max(vector bool short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vmaxuh((vector unsigned short)__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_max(vector unsigned short __a, vector bool short __b)
+{
+  return __builtin_altivec_vmaxuh(__a, (vector unsigned short)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_max(vector int __a, vector int __b)
+{
+  return __builtin_altivec_vmaxsw(__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_max(vector bool int __a, vector int __b)
+{
+  return __builtin_altivec_vmaxsw((vector int)__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_max(vector int __a, vector bool int __b)
+{
+  return __builtin_altivec_vmaxsw(__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_max(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vmaxuw(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_max(vector bool int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vmaxuw((vector unsigned int)__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_max(vector unsigned int __a, vector bool int __b)
+{
+  return __builtin_altivec_vmaxuw(__a, (vector unsigned int)__b);
+}
+
+#ifdef __POWER8_VECTOR__
+static vector signed long long __ATTRS_o_ai
+vec_max(vector signed long long __a, vector signed long long __b) 
+{
+  return __builtin_altivec_vmaxsd(__a, __b);
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_max(vector unsigned long long __a, vector unsigned long long __b)
+{
+  return __builtin_altivec_vmaxud(__a, __b);
+}
+#endif
+
+static vector float __ATTRS_o_ai
+vec_max(vector float __a, vector float __b)
+{
+#ifdef __VSX__
+  return __builtin_vsx_xvmaxsp(__a, __b);
+#else
+  return __builtin_altivec_vmaxfp(__a, __b);
+#endif
+}
+
+#ifdef __VSX__
+static vector double __ATTRS_o_ai
+vec_max(vector double __a, vector double __b)
+{
+  return __builtin_vsx_xvmaxdp(__a, __b);
+}
+#endif
+
+/* vec_vmaxsb */
+
+static vector signed char __ATTRS_o_ai
+vec_vmaxsb(vector signed char __a, vector signed char __b)
+{
+  return __builtin_altivec_vmaxsb(__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vmaxsb(vector bool char __a, vector signed char __b)
+{
+  return __builtin_altivec_vmaxsb((vector signed char)__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vmaxsb(vector signed char __a, vector bool char __b)
+{
+  return __builtin_altivec_vmaxsb(__a, (vector signed char)__b);
+}
+
+/* vec_vmaxub */
+
+static vector unsigned char __ATTRS_o_ai
+vec_vmaxub(vector unsigned char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vmaxub(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vmaxub(vector bool char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vmaxub((vector unsigned char)__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vmaxub(vector unsigned char __a, vector bool char __b)
+{
+  return __builtin_altivec_vmaxub(__a, (vector unsigned char)__b);
+}
+
+/* vec_vmaxsh */
+
+static vector short __ATTRS_o_ai
+vec_vmaxsh(vector short __a, vector short __b)
+{
+  return __builtin_altivec_vmaxsh(__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_vmaxsh(vector bool short __a, vector short __b)
+{
+  return __builtin_altivec_vmaxsh((vector short)__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_vmaxsh(vector short __a, vector bool short __b)
+{
+  return __builtin_altivec_vmaxsh(__a, (vector short)__b);
+}
+
+/* vec_vmaxuh */
+
+static vector unsigned short __ATTRS_o_ai
+vec_vmaxuh(vector unsigned short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vmaxuh(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vmaxuh(vector bool short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vmaxuh((vector unsigned short)__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vmaxuh(vector unsigned short __a, vector bool short __b)
+{
+  return __builtin_altivec_vmaxuh(__a, (vector unsigned short)__b);
+}
+
+/* vec_vmaxsw */
+
+static vector int __ATTRS_o_ai
+vec_vmaxsw(vector int __a, vector int __b)
+{
+  return __builtin_altivec_vmaxsw(__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_vmaxsw(vector bool int __a, vector int __b)
+{
+  return __builtin_altivec_vmaxsw((vector int)__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_vmaxsw(vector int __a, vector bool int __b)
+{
+  return __builtin_altivec_vmaxsw(__a, (vector int)__b);
+}
+
+/* vec_vmaxuw */
+
+static vector unsigned int __ATTRS_o_ai
+vec_vmaxuw(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vmaxuw(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vmaxuw(vector bool int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vmaxuw((vector unsigned int)__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vmaxuw(vector unsigned int __a, vector bool int __b)
+{
+  return __builtin_altivec_vmaxuw(__a, (vector unsigned int)__b);
+}
+
+/* vec_vmaxfp */
+
+static vector float __attribute__((__always_inline__))
+vec_vmaxfp(vector float __a, vector float __b)
+{
+#ifdef __VSX__
+  return __builtin_vsx_xvmaxsp(__a, __b);
+#else
+  return __builtin_altivec_vmaxfp(__a, __b);
+#endif
+}
+
+/* vec_mergeh */
+
+static vector signed char __ATTRS_o_ai
+vec_mergeh(vector signed char __a, vector signed char __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x10, 0x01, 0x11, 0x02, 0x12, 0x03, 0x13, 
+     0x04, 0x14, 0x05, 0x15, 0x06, 0x16, 0x07, 0x17));
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_mergeh(vector unsigned char __a, vector unsigned char __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x10, 0x01, 0x11, 0x02, 0x12, 0x03, 0x13, 
+     0x04, 0x14, 0x05, 0x15, 0x06, 0x16, 0x07, 0x17));
+}
+
+static vector bool char __ATTRS_o_ai
+vec_mergeh(vector bool char __a, vector bool char __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x10, 0x01, 0x11, 0x02, 0x12, 0x03, 0x13, 
+     0x04, 0x14, 0x05, 0x15, 0x06, 0x16, 0x07, 0x17));
+}
+
+static vector short __ATTRS_o_ai
+vec_mergeh(vector short __a, vector short __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13,
+     0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17));
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_mergeh(vector unsigned short __a, vector unsigned short __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13,
+     0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17));
+}
+
+static vector bool short __ATTRS_o_ai
+vec_mergeh(vector bool short __a, vector bool short __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13,
+     0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17));
+}
+
+static vector pixel __ATTRS_o_ai
+vec_mergeh(vector pixel __a, vector pixel __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13,
+     0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17));
+}
+
+static vector int __ATTRS_o_ai
+vec_mergeh(vector int __a, vector int __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+     0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17));
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_mergeh(vector unsigned int __a, vector unsigned int __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+     0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17));
+}
+
+static vector bool int __ATTRS_o_ai
+vec_mergeh(vector bool int __a, vector bool int __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+     0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17));
+}
+
+static vector float __ATTRS_o_ai
+vec_mergeh(vector float __a, vector float __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+     0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17));
+}
+
+/* vec_vmrghb */
+
+#define __builtin_altivec_vmrghb vec_vmrghb
+
+static vector signed char __ATTRS_o_ai
+vec_vmrghb(vector signed char __a, vector signed char __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x10, 0x01, 0x11, 0x02, 0x12, 0x03, 0x13, 
+     0x04, 0x14, 0x05, 0x15, 0x06, 0x16, 0x07, 0x17));
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vmrghb(vector unsigned char __a, vector unsigned char __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x10, 0x01, 0x11, 0x02, 0x12, 0x03, 0x13, 
+     0x04, 0x14, 0x05, 0x15, 0x06, 0x16, 0x07, 0x17));
+}
+
+static vector bool char __ATTRS_o_ai
+vec_vmrghb(vector bool char __a, vector bool char __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x10, 0x01, 0x11, 0x02, 0x12, 0x03, 0x13, 
+     0x04, 0x14, 0x05, 0x15, 0x06, 0x16, 0x07, 0x17));
+}
+
+/* vec_vmrghh */
+
+#define __builtin_altivec_vmrghh vec_vmrghh
+
+static vector short __ATTRS_o_ai
+vec_vmrghh(vector short __a, vector short __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13,
+     0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17));
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vmrghh(vector unsigned short __a, vector unsigned short __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13,
+     0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17));
+}
+
+static vector bool short __ATTRS_o_ai
+vec_vmrghh(vector bool short __a, vector bool short __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13,
+     0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17));
+}
+
+static vector pixel __ATTRS_o_ai
+vec_vmrghh(vector pixel __a, vector pixel __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13,
+     0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17));
+}
+
+/* vec_vmrghw */
+
+#define __builtin_altivec_vmrghw vec_vmrghw
+
+static vector int __ATTRS_o_ai
+vec_vmrghw(vector int __a, vector int __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+     0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17));
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vmrghw(vector unsigned int __a, vector unsigned int __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+     0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17));
+}
+
+static vector bool int __ATTRS_o_ai
+vec_vmrghw(vector bool int __a, vector bool int __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+     0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17));
+}
+
+static vector float __ATTRS_o_ai
+vec_vmrghw(vector float __a, vector float __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+     0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17));
+}
+
+/* vec_mergel */
+
+static vector signed char __ATTRS_o_ai
+vec_mergel(vector signed char __a, vector signed char __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x08, 0x18, 0x09, 0x19, 0x0A, 0x1A, 0x0B, 0x1B, 
+     0x0C, 0x1C, 0x0D, 0x1D, 0x0E, 0x1E, 0x0F, 0x1F));
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_mergel(vector unsigned char __a, vector unsigned char __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x08, 0x18, 0x09, 0x19, 0x0A, 0x1A, 0x0B, 0x1B, 
+     0x0C, 0x1C, 0x0D, 0x1D, 0x0E, 0x1E, 0x0F, 0x1F));
+}
+
+static vector bool char __ATTRS_o_ai
+vec_mergel(vector bool char __a, vector bool char __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x08, 0x18, 0x09, 0x19, 0x0A, 0x1A, 0x0B, 0x1B, 
+     0x0C, 0x1C, 0x0D, 0x1D, 0x0E, 0x1E, 0x0F, 0x1F));
+}
+
+static vector short __ATTRS_o_ai
+vec_mergel(vector short __a, vector short __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x08, 0x09, 0x18, 0x19, 0x0A, 0x0B, 0x1A, 0x1B,
+     0x0C, 0x0D, 0x1C, 0x1D, 0x0E, 0x0F, 0x1E, 0x1F));
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_mergel(vector unsigned short __a, vector unsigned short __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x08, 0x09, 0x18, 0x19, 0x0A, 0x0B, 0x1A, 0x1B,
+     0x0C, 0x0D, 0x1C, 0x1D, 0x0E, 0x0F, 0x1E, 0x1F));
+}
+
+static vector bool short __ATTRS_o_ai
+vec_mergel(vector bool short __a, vector bool short __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x08, 0x09, 0x18, 0x19, 0x0A, 0x0B, 0x1A, 0x1B,
+     0x0C, 0x0D, 0x1C, 0x1D, 0x0E, 0x0F, 0x1E, 0x1F));
+}
+
+static vector pixel __ATTRS_o_ai
+vec_mergel(vector pixel __a, vector pixel __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x08, 0x09, 0x18, 0x19, 0x0A, 0x0B, 0x1A, 0x1B,
+     0x0C, 0x0D, 0x1C, 0x1D, 0x0E, 0x0F, 0x1E, 0x1F));
+}
+
+static vector int __ATTRS_o_ai
+vec_mergel(vector int __a, vector int __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B,
+     0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_mergel(vector unsigned int __a, vector unsigned int __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B,
+     0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+static vector bool int __ATTRS_o_ai
+vec_mergel(vector bool int __a, vector bool int __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B,
+     0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+static vector float __ATTRS_o_ai
+vec_mergel(vector float __a, vector float __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B,
+     0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+/* vec_vmrglb */
+
+#define __builtin_altivec_vmrglb vec_vmrglb
+
+static vector signed char __ATTRS_o_ai
+vec_vmrglb(vector signed char __a, vector signed char __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x08, 0x18, 0x09, 0x19, 0x0A, 0x1A, 0x0B, 0x1B, 
+     0x0C, 0x1C, 0x0D, 0x1D, 0x0E, 0x1E, 0x0F, 0x1F));
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vmrglb(vector unsigned char __a, vector unsigned char __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x08, 0x18, 0x09, 0x19, 0x0A, 0x1A, 0x0B, 0x1B, 
+     0x0C, 0x1C, 0x0D, 0x1D, 0x0E, 0x1E, 0x0F, 0x1F));
+}
+
+static vector bool char __ATTRS_o_ai
+vec_vmrglb(vector bool char __a, vector bool char __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x08, 0x18, 0x09, 0x19, 0x0A, 0x1A, 0x0B, 0x1B, 
+     0x0C, 0x1C, 0x0D, 0x1D, 0x0E, 0x1E, 0x0F, 0x1F));
+}
+
+/* vec_vmrglh */
+
+#define __builtin_altivec_vmrglh vec_vmrglh
+
+static vector short __ATTRS_o_ai
+vec_vmrglh(vector short __a, vector short __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x08, 0x09, 0x18, 0x19, 0x0A, 0x0B, 0x1A, 0x1B,
+     0x0C, 0x0D, 0x1C, 0x1D, 0x0E, 0x0F, 0x1E, 0x1F));
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vmrglh(vector unsigned short __a, vector unsigned short __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x08, 0x09, 0x18, 0x19, 0x0A, 0x0B, 0x1A, 0x1B,
+     0x0C, 0x0D, 0x1C, 0x1D, 0x0E, 0x0F, 0x1E, 0x1F));
+}
+
+static vector bool short __ATTRS_o_ai
+vec_vmrglh(vector bool short __a, vector bool short __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x08, 0x09, 0x18, 0x19, 0x0A, 0x0B, 0x1A, 0x1B,
+     0x0C, 0x0D, 0x1C, 0x1D, 0x0E, 0x0F, 0x1E, 0x1F));
+}
+
+static vector pixel __ATTRS_o_ai
+vec_vmrglh(vector pixel __a, vector pixel __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x08, 0x09, 0x18, 0x19, 0x0A, 0x0B, 0x1A, 0x1B,
+     0x0C, 0x0D, 0x1C, 0x1D, 0x0E, 0x0F, 0x1E, 0x1F));
+}
+
+/* vec_vmrglw */
+
+#define __builtin_altivec_vmrglw vec_vmrglw
+
+static vector int __ATTRS_o_ai
+vec_vmrglw(vector int __a, vector int __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B,
+     0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vmrglw(vector unsigned int __a, vector unsigned int __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B,
+     0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+static vector bool int __ATTRS_o_ai
+vec_vmrglw(vector bool int __a, vector bool int __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B,
+     0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+static vector float __ATTRS_o_ai
+vec_vmrglw(vector float __a, vector float __b)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B,
+     0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+/* vec_mfvscr */
+
+static vector unsigned short __attribute__((__always_inline__))
+vec_mfvscr(void)
+{
+  return __builtin_altivec_mfvscr();
+}
+
+/* vec_min */
+
+static vector signed char __ATTRS_o_ai
+vec_min(vector signed char __a, vector signed char __b)
+{
+  return __builtin_altivec_vminsb(__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_min(vector bool char __a, vector signed char __b)
+{
+  return __builtin_altivec_vminsb((vector signed char)__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_min(vector signed char __a, vector bool char __b)
+{
+  return __builtin_altivec_vminsb(__a, (vector signed char)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_min(vector unsigned char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vminub(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_min(vector bool char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vminub((vector unsigned char)__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_min(vector unsigned char __a, vector bool char __b)
+{
+  return __builtin_altivec_vminub(__a, (vector unsigned char)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_min(vector short __a, vector short __b)
+{
+  return __builtin_altivec_vminsh(__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_min(vector bool short __a, vector short __b)
+{
+  return __builtin_altivec_vminsh((vector short)__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_min(vector short __a, vector bool short __b)
+{
+  return __builtin_altivec_vminsh(__a, (vector short)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_min(vector unsigned short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vminuh(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_min(vector bool short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vminuh((vector unsigned short)__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_min(vector unsigned short __a, vector bool short __b)
+{
+  return __builtin_altivec_vminuh(__a, (vector unsigned short)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_min(vector int __a, vector int __b)
+{
+  return __builtin_altivec_vminsw(__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_min(vector bool int __a, vector int __b)
+{
+  return __builtin_altivec_vminsw((vector int)__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_min(vector int __a, vector bool int __b)
+{
+  return __builtin_altivec_vminsw(__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_min(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vminuw(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_min(vector bool int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vminuw((vector unsigned int)__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_min(vector unsigned int __a, vector bool int __b)
+{
+  return __builtin_altivec_vminuw(__a, (vector unsigned int)__b);
+}
+
+#ifdef __POWER8_VECTOR__
+static vector signed long long __ATTRS_o_ai
+vec_min(vector signed long long __a, vector signed long long __b)
+{
+  return __builtin_altivec_vminsd(__a, __b);
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_min(vector unsigned long long __a, vector unsigned long long __b)
+{
+  return __builtin_altivec_vminud(__a, __b);
+}
+#endif
+
+static vector float __ATTRS_o_ai
+vec_min(vector float __a, vector float __b)
+{
+#ifdef __VSX__
+  return __builtin_vsx_xvminsp(__a, __b);
+#else
+  return __builtin_altivec_vminfp(__a, __b);
+#endif
+}
+
+#ifdef __VSX__
+static vector double __ATTRS_o_ai
+vec_min(vector double __a, vector double __b)
+{
+  return __builtin_vsx_xvmindp(__a, __b);
+}
+#endif
+
+/* vec_vminsb */
+
+static vector signed char __ATTRS_o_ai
+vec_vminsb(vector signed char __a, vector signed char __b)
+{
+  return __builtin_altivec_vminsb(__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vminsb(vector bool char __a, vector signed char __b)
+{
+  return __builtin_altivec_vminsb((vector signed char)__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vminsb(vector signed char __a, vector bool char __b)
+{
+  return __builtin_altivec_vminsb(__a, (vector signed char)__b);
+}
+
+/* vec_vminub */
+
+static vector unsigned char __ATTRS_o_ai
+vec_vminub(vector unsigned char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vminub(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vminub(vector bool char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vminub((vector unsigned char)__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vminub(vector unsigned char __a, vector bool char __b)
+{
+  return __builtin_altivec_vminub(__a, (vector unsigned char)__b);
+}
+
+/* vec_vminsh */
+
+static vector short __ATTRS_o_ai
+vec_vminsh(vector short __a, vector short __b)
+{
+  return __builtin_altivec_vminsh(__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_vminsh(vector bool short __a, vector short __b)
+{
+  return __builtin_altivec_vminsh((vector short)__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_vminsh(vector short __a, vector bool short __b)
+{
+  return __builtin_altivec_vminsh(__a, (vector short)__b);
+}
+
+/* vec_vminuh */
+
+static vector unsigned short __ATTRS_o_ai
+vec_vminuh(vector unsigned short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vminuh(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vminuh(vector bool short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vminuh((vector unsigned short)__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vminuh(vector unsigned short __a, vector bool short __b)
+{
+  return __builtin_altivec_vminuh(__a, (vector unsigned short)__b);
+}
+
+/* vec_vminsw */
+
+static vector int __ATTRS_o_ai
+vec_vminsw(vector int __a, vector int __b)
+{
+  return __builtin_altivec_vminsw(__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_vminsw(vector bool int __a, vector int __b)
+{
+  return __builtin_altivec_vminsw((vector int)__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_vminsw(vector int __a, vector bool int __b)
+{
+  return __builtin_altivec_vminsw(__a, (vector int)__b);
+}
+
+/* vec_vminuw */
+
+static vector unsigned int __ATTRS_o_ai
+vec_vminuw(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vminuw(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vminuw(vector bool int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vminuw((vector unsigned int)__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vminuw(vector unsigned int __a, vector bool int __b)
+{
+  return __builtin_altivec_vminuw(__a, (vector unsigned int)__b);
+}
+
+/* vec_vminfp */
+
+static vector float __attribute__((__always_inline__))
+vec_vminfp(vector float __a, vector float __b)
+{
+#ifdef __VSX__
+  return __builtin_vsx_xvminsp(__a, __b);
+#else
+  return __builtin_altivec_vminfp(__a, __b);
+#endif
+}
+
+/* vec_mladd */
+
+#define __builtin_altivec_vmladduhm vec_mladd
+
+static vector short __ATTRS_o_ai
+vec_mladd(vector short __a, vector short __b, vector short __c)
+{
+  return __a * __b + __c;
+}
+
+static vector short __ATTRS_o_ai
+vec_mladd(vector short __a, vector unsigned short __b, vector unsigned short __c)
+{
+  return __a * (vector short)__b + (vector short)__c;
+}
+
+static vector short __ATTRS_o_ai
+vec_mladd(vector unsigned short __a, vector short __b, vector short __c)
+{
+  return (vector short)__a * __b + __c;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_mladd(vector unsigned short __a,
+          vector unsigned short __b,
+          vector unsigned short __c)
+{
+  return __a * __b + __c;
+}
+
+/* vec_vmladduhm */
+
+static vector short __ATTRS_o_ai
+vec_vmladduhm(vector short __a, vector short __b, vector short __c)
+{
+  return __a * __b + __c;
+}
+
+static vector short __ATTRS_o_ai
+vec_vmladduhm(vector short __a, vector unsigned short __b, vector unsigned short __c)
+{
+  return __a * (vector short)__b + (vector short)__c;
+}
+
+static vector short __ATTRS_o_ai
+vec_vmladduhm(vector unsigned short __a, vector short __b, vector short __c)
+{
+  return (vector short)__a * __b + __c;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vmladduhm(vector unsigned short __a,
+              vector unsigned short __b,
+              vector unsigned short __c)
+{
+  return __a * __b + __c;
+}
+
+/* vec_mradds */
+
+static vector short __attribute__((__always_inline__))
+vec_mradds(vector short __a, vector short __b, vector short __c)
+{
+  return __builtin_altivec_vmhraddshs(__a, __b, __c);
+}
+
+/* vec_vmhraddshs */
+
+static vector short __attribute__((__always_inline__))
+vec_vmhraddshs(vector short __a, vector short __b, vector short __c)
+{
+  return __builtin_altivec_vmhraddshs(__a, __b, __c);
+}
+
+/* vec_msum */
+
+static vector int __ATTRS_o_ai
+vec_msum(vector signed char __a, vector unsigned char __b, vector int __c)
+{
+  return __builtin_altivec_vmsummbm(__a, __b, __c);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_msum(vector unsigned char __a, vector unsigned char __b, vector unsigned int __c)
+{
+  return __builtin_altivec_vmsumubm(__a, __b, __c);
+}
+
+static vector int __ATTRS_o_ai
+vec_msum(vector short __a, vector short __b, vector int __c)
+{
+  return __builtin_altivec_vmsumshm(__a, __b, __c);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_msum(vector unsigned short __a,
+         vector unsigned short __b,
+         vector unsigned int __c)
+{
+  return __builtin_altivec_vmsumuhm(__a, __b, __c);
+}
+
+/* vec_vmsummbm */
+
+static vector int __attribute__((__always_inline__))
+vec_vmsummbm(vector signed char __a, vector unsigned char __b, vector int __c)
+{
+  return __builtin_altivec_vmsummbm(__a, __b, __c);
+}
+
+/* vec_vmsumubm */
+
+static vector unsigned int __attribute__((__always_inline__))
+vec_vmsumubm(vector unsigned char __a,
+             vector unsigned char __b,
+             vector unsigned int __c)
+{
+  return __builtin_altivec_vmsumubm(__a, __b, __c);
+}
+
+/* vec_vmsumshm */
+
+static vector int __attribute__((__always_inline__))
+vec_vmsumshm(vector short __a, vector short __b, vector int __c)
+{
+  return __builtin_altivec_vmsumshm(__a, __b, __c);
+}
+
+/* vec_vmsumuhm */
+
+static vector unsigned int __attribute__((__always_inline__))
+vec_vmsumuhm(vector unsigned short __a,
+             vector unsigned short __b,
+             vector unsigned int __c)
+{
+  return __builtin_altivec_vmsumuhm(__a, __b, __c);
+}
+
+/* vec_msums */
+
+static vector int __ATTRS_o_ai
+vec_msums(vector short __a, vector short __b, vector int __c)
+{
+  return __builtin_altivec_vmsumshs(__a, __b, __c);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_msums(vector unsigned short __a,
+          vector unsigned short __b,
+          vector unsigned int __c)
+{
+  return __builtin_altivec_vmsumuhs(__a, __b, __c);
+}
+
+/* vec_vmsumshs */
+
+static vector int __attribute__((__always_inline__))
+vec_vmsumshs(vector short __a, vector short __b, vector int __c)
+{
+  return __builtin_altivec_vmsumshs(__a, __b, __c);
+}
+
+/* vec_vmsumuhs */
+
+static vector unsigned int __attribute__((__always_inline__))
+vec_vmsumuhs(vector unsigned short __a,
+             vector unsigned short __b,
+             vector unsigned int __c)
+{
+  return __builtin_altivec_vmsumuhs(__a, __b, __c);
+}
+
+/* vec_mtvscr */
+
+static void __ATTRS_o_ai
+vec_mtvscr(vector signed char __a)
+{
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static void __ATTRS_o_ai
+vec_mtvscr(vector unsigned char __a)
+{
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static void __ATTRS_o_ai
+vec_mtvscr(vector bool char __a)
+{
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static void __ATTRS_o_ai
+vec_mtvscr(vector short __a)
+{
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static void __ATTRS_o_ai
+vec_mtvscr(vector unsigned short __a)
+{
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static void __ATTRS_o_ai
+vec_mtvscr(vector bool short __a)
+{
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static void __ATTRS_o_ai
+vec_mtvscr(vector pixel __a)
+{
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static void __ATTRS_o_ai
+vec_mtvscr(vector int __a)
+{
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static void __ATTRS_o_ai
+vec_mtvscr(vector unsigned int __a)
+{
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static void __ATTRS_o_ai
+vec_mtvscr(vector bool int __a)
+{
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static void __ATTRS_o_ai
+vec_mtvscr(vector float __a)
+{
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+/* The vmulos* and vmules* instructions have a big endian bias, so
+   we must reverse the meaning of "even" and "odd" for little endian.  */
+
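+/* Illustrative sketch (hypothetical operands __va, __vb of type
+ * vector signed char): on a little endian target,
+ *
+ *   vector short __e = vec_mule(__va, __vb);
+ *
+ * expands below to __builtin_altivec_vmulosb, so "even" still refers to
+ * the even-numbered elements as the program sees them, even though the
+ * underlying instruction numbers lanes from the big endian end.
+ */
+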
+/* vec_mule */
+
+static vector short __ATTRS_o_ai
+vec_mule(vector signed char __a, vector signed char __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulosb(__a, __b);
+#else
+  return __builtin_altivec_vmulesb(__a, __b);
+#endif
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_mule(vector unsigned char __a, vector unsigned char __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmuloub(__a, __b);
+#else
+  return __builtin_altivec_vmuleub(__a, __b);
+#endif
+}
+
+static vector int __ATTRS_o_ai
+vec_mule(vector short __a, vector short __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulosh(__a, __b);
+#else
+  return __builtin_altivec_vmulesh(__a, __b);
+#endif
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_mule(vector unsigned short __a, vector unsigned short __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulouh(__a, __b);
+#else
+  return __builtin_altivec_vmuleuh(__a, __b);
+#endif
+}
+
+#ifdef __POWER8_VECTOR__
+static vector signed long long __ATTRS_o_ai
+vec_mule(vector signed int __a, vector signed int __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulosw(__a, __b);
+#else
+  return __builtin_altivec_vmulesw(__a, __b);
+#endif
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_mule(vector unsigned int __a, vector unsigned int __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulouw(__a, __b);
+#else
+  return __builtin_altivec_vmuleuw(__a, __b);
+#endif
+}
+#endif
+
+/* vec_vmulesb */
+
+static vector short __attribute__((__always_inline__))
+vec_vmulesb(vector signed char __a, vector signed char __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulosb(__a, __b);
+#else
+  return __builtin_altivec_vmulesb(__a, __b);
+#endif
+}
+
+/* vec_vmuleub */
+
+static vector unsigned short __attribute__((__always_inline__))
+vec_vmuleub(vector unsigned char __a, vector unsigned char __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmuloub(__a, __b);
+#else
+  return __builtin_altivec_vmuleub(__a, __b);
+#endif
+}
+
+/* vec_vmulesh */
+
+static vector int __attribute__((__always_inline__))
+vec_vmulesh(vector short __a, vector short __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulosh(__a, __b);
+#else
+  return __builtin_altivec_vmulesh(__a, __b);
+#endif
+}
+
+/* vec_vmuleuh */
+
+static vector unsigned int __attribute__((__always_inline__))
+vec_vmuleuh(vector unsigned short __a, vector unsigned short __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulouh(__a, __b);
+#else
+  return __builtin_altivec_vmuleuh(__a, __b);
+#endif
+}
+
+/* vec_mulo */
+
+static vector short __ATTRS_o_ai
+vec_mulo(vector signed char __a, vector signed char __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulesb(__a, __b);
+#else
+  return __builtin_altivec_vmulosb(__a, __b);
+#endif
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_mulo(vector unsigned char __a, vector unsigned char __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmuleub(__a, __b);
+#else
+  return __builtin_altivec_vmuloub(__a, __b);
+#endif
+}
+
+static vector int __ATTRS_o_ai
+vec_mulo(vector short __a, vector short __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulesh(__a, __b);
+#else
+  return __builtin_altivec_vmulosh(__a, __b);
+#endif
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_mulo(vector unsigned short __a, vector unsigned short __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmuleuh(__a, __b);
+#else
+  return __builtin_altivec_vmulouh(__a, __b);
+#endif
+}
+
+#ifdef __POWER8_VECTOR__
+static vector signed long long __ATTRS_o_ai
+vec_mulo(vector signed int __a, vector signed int __b) 
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulesw(__a, __b);
+#else
+  return __builtin_altivec_vmulosw(__a, __b);
+#endif
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_mulo(vector unsigned int __a, vector unsigned int __b) 
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmuleuw(__a, __b);
+#else
+  return __builtin_altivec_vmulouw(__a, __b);
+#endif
+}
+#endif
+
+/* vec_vmulosb */
+
+static vector short __attribute__((__always_inline__))
+vec_vmulosb(vector signed char __a, vector signed char __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulesb(__a, __b);
+#else
+  return __builtin_altivec_vmulosb(__a, __b);
+#endif
+}
+
+/* vec_vmuloub */
+
+static vector unsigned short __attribute__((__always_inline__))
+vec_vmuloub(vector unsigned char __a, vector unsigned char __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmuleub(__a, __b);
+#else
+  return __builtin_altivec_vmuloub(__a, __b);
+#endif
+}
+
+/* vec_vmulosh */
+
+static vector int __attribute__((__always_inline__))
+vec_vmulosh(vector short __a, vector short __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulesh(__a, __b);
+#else
+  return __builtin_altivec_vmulosh(__a, __b);
+#endif
+}
+
+/* vec_vmulouh */
+
+static vector unsigned int __attribute__((__always_inline__))
+vec_vmulouh(vector unsigned short __a, vector unsigned short __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmuleuh(__a, __b);
+#else
+  return __builtin_altivec_vmulouh(__a, __b);
+#endif
+}
+
+/* vec_nmsub */
+
+static vector float __attribute__((__always_inline__))
+vec_nmsub(vector float __a, vector float __b, vector float __c)
+{
+  return __builtin_altivec_vnmsubfp(__a, __b, __c);
+}
+
+/* vec_vnmsubfp */
+
+static vector float __attribute__((__always_inline__))
+vec_vnmsubfp(vector float __a, vector float __b, vector float __c)
+{
+  return __builtin_altivec_vnmsubfp(__a, __b, __c);
+}
+
+/* vec_nor */
+
+#define __builtin_altivec_vnor vec_nor
+
+static vector signed char __ATTRS_o_ai
+vec_nor(vector signed char __a, vector signed char __b)
+{
+  return ~(__a | __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_nor(vector unsigned char __a, vector unsigned char __b)
+{
+  return ~(__a | __b);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_nor(vector bool char __a, vector bool char __b)
+{
+  return ~(__a | __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_nor(vector short __a, vector short __b)
+{
+  return ~(__a | __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_nor(vector unsigned short __a, vector unsigned short __b)
+{
+  return ~(__a | __b);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_nor(vector bool short __a, vector bool short __b)
+{
+  return ~(__a | __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_nor(vector int __a, vector int __b)
+{
+  return ~(__a | __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_nor(vector unsigned int __a, vector unsigned int __b)
+{
+  return ~(__a | __b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_nor(vector bool int __a, vector bool int __b)
+{
+  return ~(__a | __b);
+}
+
+static vector float __ATTRS_o_ai
+vec_nor(vector float __a, vector float __b)
+{
+  vector unsigned int __res = ~((vector unsigned int)__a | (vector unsigned int)__b);
+  return (vector float)__res;
+}
+
+/* vec_vnor */
+
+static vector signed char __ATTRS_o_ai
+vec_vnor(vector signed char __a, vector signed char __b)
+{
+  return ~(__a | __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vnor(vector unsigned char __a, vector unsigned char __b)
+{
+  return ~(__a | __b);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_vnor(vector bool char __a, vector bool char __b)
+{
+  return ~(__a | __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_vnor(vector short __a, vector short __b)
+{
+  return ~(__a | __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vnor(vector unsigned short __a, vector unsigned short __b)
+{
+  return ~(__a | __b);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_vnor(vector bool short __a, vector bool short __b)
+{
+  return ~(__a | __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_vnor(vector int __a, vector int __b)
+{
+  return ~(__a | __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vnor(vector unsigned int __a, vector unsigned int __b)
+{
+  return ~(__a | __b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_vnor(vector bool int __a, vector bool int __b)
+{
+  return ~(__a | __b);
+}
+
+static vector float __ATTRS_o_ai
+vec_vnor(vector float __a, vector float __b)
+{
+  vector unsigned int __res = ~((vector unsigned int)__a | (vector unsigned int)__b);
+  return (vector float)__res;
+}
+
+/* vec_or */
+
+#define __builtin_altivec_vor vec_or
+
+static vector signed char __ATTRS_o_ai
+vec_or(vector signed char __a, vector signed char __b)
+{
+  return __a | __b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_or(vector bool char __a, vector signed char __b)
+{
+  return (vector signed char)__a | __b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_or(vector signed char __a, vector bool char __b)
+{
+  return __a | (vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_or(vector unsigned char __a, vector unsigned char __b)
+{
+  return __a | __b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_or(vector bool char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)__a | __b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_or(vector unsigned char __a, vector bool char __b)
+{
+  return __a | (vector unsigned char)__b;
+}
+
+static vector bool char __ATTRS_o_ai
+vec_or(vector bool char __a, vector bool char __b)
+{
+  return __a | __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_or(vector short __a, vector short __b)
+{
+  return __a | __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_or(vector bool short __a, vector short __b)
+{
+  return (vector short)__a | __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_or(vector short __a, vector bool short __b)
+{
+  return __a | (vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_or(vector unsigned short __a, vector unsigned short __b)
+{
+  return __a | __b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_or(vector bool short __a, vector unsigned short __b)
+{
+  return (vector unsigned short)__a | __b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_or(vector unsigned short __a, vector bool short __b)
+{
+  return __a | (vector unsigned short)__b;
+}
+
+static vector bool short __ATTRS_o_ai
+vec_or(vector bool short __a, vector bool short __b)
+{
+  return __a | __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_or(vector int __a, vector int __b)
+{
+  return __a | __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_or(vector bool int __a, vector int __b)
+{
+  return (vector int)__a | __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_or(vector int __a, vector bool int __b)
+{
+  return __a | (vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_or(vector unsigned int __a, vector unsigned int __b)
+{
+  return __a | __b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_or(vector bool int __a, vector unsigned int __b)
+{
+  return (vector unsigned int)__a | __b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_or(vector unsigned int __a, vector bool int __b)
+{
+  return __a | (vector unsigned int)__b;
+}
+
+static vector bool int __ATTRS_o_ai
+vec_or(vector bool int __a, vector bool int __b)
+{
+  return __a | __b;
+}
+
+static vector float __ATTRS_o_ai
+vec_or(vector float __a, vector float __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a | (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai
+vec_or(vector bool int __a, vector float __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a | (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai
+vec_or(vector float __a, vector bool int __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a | (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+/* vec_vor */
+
+static vector signed char __ATTRS_o_ai
+vec_vor(vector signed char __a, vector signed char __b)
+{
+  return __a | __b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vor(vector bool char __a, vector signed char __b)
+{
+  return (vector signed char)__a | __b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vor(vector signed char __a, vector bool char __b)
+{
+  return __a | (vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vor(vector unsigned char __a, vector unsigned char __b)
+{
+  return __a | __b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vor(vector bool char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)__a | __b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vor(vector unsigned char __a, vector bool char __b)
+{
+  return __a | (vector unsigned char)__b;
+}
+
+static vector bool char __ATTRS_o_ai
+vec_vor(vector bool char __a, vector bool char __b)
+{
+  return __a | __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_vor(vector short __a, vector short __b)
+{
+  return __a | __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_vor(vector bool short __a, vector short __b)
+{
+  return (vector short)__a | __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_vor(vector short __a, vector bool short __b)
+{
+  return __a | (vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vor(vector unsigned short __a, vector unsigned short __b)
+{
+  return __a | __b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vor(vector bool short __a, vector unsigned short __b)
+{
+  return (vector unsigned short)__a | __b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vor(vector unsigned short __a, vector bool short __b)
+{
+  return __a | (vector unsigned short)__b;
+}
+
+static vector bool short __ATTRS_o_ai
+vec_vor(vector bool short __a, vector bool short __b)
+{
+  return __a | __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_vor(vector int __a, vector int __b)
+{
+  return __a | __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_vor(vector bool int __a, vector int __b)
+{
+  return (vector int)__a | __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_vor(vector int __a, vector bool int __b)
+{
+  return __a | (vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vor(vector unsigned int __a, vector unsigned int __b)
+{
+  return __a | __b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vor(vector bool int __a, vector unsigned int __b)
+{
+  return (vector unsigned int)__a | __b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vor(vector unsigned int __a, vector bool int __b)
+{
+  return __a | (vector unsigned int)__b;
+}
+
+static vector bool int __ATTRS_o_ai
+vec_vor(vector bool int __a, vector bool int __b)
+{
+  return __a | __b;
+}
+
+static vector float __ATTRS_o_ai
+vec_vor(vector float __a, vector float __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a | (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai
+vec_vor(vector bool int __a, vector float __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a | (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai
+vec_vor(vector float __a, vector bool int __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a | (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+/* vec_pack */
+
+/* The various vector pack instructions have a big-endian bias, so for
+   little endian we must handle reversed element numbering.  */
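+/* For example, packing halfwords to bytes keeps the low-order byte of each
+   halfword: in big-endian element numbering that byte sits at offset 2*i+1,
+   while in little endian it sits at offset 2*i, which is why the permute
+   control vectors in the two branches below differ.  */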
+
+static vector signed char __ATTRS_o_ai
+vec_pack(vector signed short __a, vector signed short __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector signed char)vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E,
+     0x10, 0x12, 0x14, 0x16, 0x18, 0x1A, 0x1C, 0x1E));
+#else
+  return (vector signed char)vec_perm(__a, __b, (vector unsigned char)
+    (0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F,
+     0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F));
+#endif
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_pack(vector unsigned short __a, vector unsigned short __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector unsigned char)vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E,
+     0x10, 0x12, 0x14, 0x16, 0x18, 0x1A, 0x1C, 0x1E));
+#else
+  return (vector unsigned char)vec_perm(__a, __b, (vector unsigned char)
+    (0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F,
+     0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F));
+#endif
+}
+
+static vector bool char __ATTRS_o_ai
+vec_pack(vector bool short __a, vector bool short __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool char)vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E,
+     0x10, 0x12, 0x14, 0x16, 0x18, 0x1A, 0x1C, 0x1E));
+#else
+  return (vector bool char)vec_perm(__a, __b, (vector unsigned char)
+    (0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F,
+     0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F));
+#endif
+}
+
+static vector short __ATTRS_o_ai
+vec_pack(vector int __a, vector int __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector short)vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0C, 0x0D,
+     0x10, 0x11, 0x14, 0x15, 0x18, 0x19, 0x1C, 0x1D));
+#else
+  return (vector short)vec_perm(__a, __b, (vector unsigned char)
+    (0x02, 0x03, 0x06, 0x07, 0x0A, 0x0B, 0x0E, 0x0F,
+     0x12, 0x13, 0x16, 0x17, 0x1A, 0x1B, 0x1E, 0x1F));
+#endif
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_pack(vector unsigned int __a, vector unsigned int __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector unsigned short)vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0C, 0x0D,
+     0x10, 0x11, 0x14, 0x15, 0x18, 0x19, 0x1C, 0x1D));
+#else
+  return (vector unsigned short)vec_perm(__a, __b, (vector unsigned char)
+    (0x02, 0x03, 0x06, 0x07, 0x0A, 0x0B, 0x0E, 0x0F,
+     0x12, 0x13, 0x16, 0x17, 0x1A, 0x1B, 0x1E, 0x1F));
+#endif
+}
+
+static vector bool short __ATTRS_o_ai
+vec_pack(vector bool int __a, vector bool int __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool short)vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0C, 0x0D,
+     0x10, 0x11, 0x14, 0x15, 0x18, 0x19, 0x1C, 0x1D));
+#else
+  return (vector bool short)vec_perm(__a, __b, (vector unsigned char)
+    (0x02, 0x03, 0x06, 0x07, 0x0A, 0x0B, 0x0E, 0x0F,
+     0x12, 0x13, 0x16, 0x17, 0x1A, 0x1B, 0x1E, 0x1F));
+#endif
+}
+
+/* vec_vpkuhum */
+
+#define __builtin_altivec_vpkuhum vec_vpkuhum
+
+static vector signed char __ATTRS_o_ai
+vec_vpkuhum(vector signed short __a, vector signed short __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector signed char)vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E,
+     0x10, 0x12, 0x14, 0x16, 0x18, 0x1A, 0x1C, 0x1E));
+#else
+  return (vector signed char)vec_perm(__a, __b, (vector unsigned char)
+    (0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F,
+     0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F));
+#endif
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vpkuhum(vector unsigned short __a, vector unsigned short __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector unsigned char)vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E,
+     0x10, 0x12, 0x14, 0x16, 0x18, 0x1A, 0x1C, 0x1E));
+#else
+  return (vector unsigned char)vec_perm(__a, __b, (vector unsigned char)
+    (0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F,
+     0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F));
+#endif
+}
+
+static vector bool char __ATTRS_o_ai
+vec_vpkuhum(vector bool short __a, vector bool short __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool char)vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E,
+     0x10, 0x12, 0x14, 0x16, 0x18, 0x1A, 0x1C, 0x1E));
+#else
+  return (vector bool char)vec_perm(__a, __b, (vector unsigned char)
+    (0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F,
+     0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F));
+#endif
+}
+
+/* vec_vpkuwum */
+
+#define __builtin_altivec_vpkuwum vec_vpkuwum
+
+static vector short __ATTRS_o_ai
+vec_vpkuwum(vector int __a, vector int __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector short)vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0C, 0x0D,
+     0x10, 0x11, 0x14, 0x15, 0x18, 0x19, 0x1C, 0x1D));
+#else
+  return (vector short)vec_perm(__a, __b, (vector unsigned char)
+    (0x02, 0x03, 0x06, 0x07, 0x0A, 0x0B, 0x0E, 0x0F,
+     0x12, 0x13, 0x16, 0x17, 0x1A, 0x1B, 0x1E, 0x1F));
+#endif
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vpkuwum(vector unsigned int __a, vector unsigned int __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector unsigned short)vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0C, 0x0D,
+     0x10, 0x11, 0x14, 0x15, 0x18, 0x19, 0x1C, 0x1D));
+#else
+  return (vector unsigned short)vec_perm(__a, __b, (vector unsigned char)
+    (0x02, 0x03, 0x06, 0x07, 0x0A, 0x0B, 0x0E, 0x0F,
+     0x12, 0x13, 0x16, 0x17, 0x1A, 0x1B, 0x1E, 0x1F));
+#endif
+}
+
+static vector bool short __ATTRS_o_ai
+vec_vpkuwum(vector bool int __a, vector bool int __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool short)vec_perm(__a, __b, (vector unsigned char)
+    (0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0C, 0x0D,
+     0x10, 0x11, 0x14, 0x15, 0x18, 0x19, 0x1C, 0x1D));
+#else
+  return (vector bool short)vec_perm(__a, __b, (vector unsigned char)
+    (0x02, 0x03, 0x06, 0x07, 0x0A, 0x0B, 0x0E, 0x0F,
+     0x12, 0x13, 0x16, 0x17, 0x1A, 0x1B, 0x1E, 0x1F));
+#endif
+}
+
+/* vec_packpx */
+
+static vector pixel __attribute__((__always_inline__))
+vec_packpx(vector unsigned int __a, vector unsigned int __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector pixel)__builtin_altivec_vpkpx(__b, __a);
+#else
+  return (vector pixel)__builtin_altivec_vpkpx(__a, __b);
+#endif
+}
+
+/* vec_vpkpx */
+
+static vector pixel __attribute__((__always_inline__))
+vec_vpkpx(vector unsigned int __a, vector unsigned int __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector pixel)__builtin_altivec_vpkpx(__b, __a);
+#else
+  return (vector pixel)__builtin_altivec_vpkpx(__a, __b);
+#endif
+}
+
+/* vec_packs */
+
+static vector signed char __ATTRS_o_ai
+vec_packs(vector short __a, vector short __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkshss(__b, __a);
+#else
+  return __builtin_altivec_vpkshss(__a, __b);
+#endif
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_packs(vector unsigned short __a, vector unsigned short __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkuhus(__b, __a);
+#else
+  return __builtin_altivec_vpkuhus(__a, __b);
+#endif
+}
+
+static vector signed short __ATTRS_o_ai
+vec_packs(vector int __a, vector int __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkswss(__b, __a);
+#else
+  return __builtin_altivec_vpkswss(__a, __b);
+#endif
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_packs(vector unsigned int __a, vector unsigned int __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkuwus(__b, __a);
+#else
+  return __builtin_altivec_vpkuwus(__a, __b);
+#endif
+}
+
+/* vec_vpkshss */
+
+static vector signed char __attribute__((__always_inline__))
+vec_vpkshss(vector short __a, vector short __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkshss(__b, __a);
+#else
+  return __builtin_altivec_vpkshss(__a, __b);
+#endif
+}
+
+/* vec_vpkuhus */
+
+static vector unsigned char __attribute__((__always_inline__))
+vec_vpkuhus(vector unsigned short __a, vector unsigned short __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkuhus(__b, __a);
+#else
+  return __builtin_altivec_vpkuhus(__a, __b);
+#endif
+}
+
+/* vec_vpkswss */
+
+static vector signed short __attribute__((__always_inline__))
+vec_vpkswss(vector int __a, vector int __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkswss(__b, __a);
+#else
+  return __builtin_altivec_vpkswss(__a, __b);
+#endif
+}
+
+/* vec_vpkuwus */
+
+static vector unsigned short __attribute__((__always_inline__))
+vec_vpkuwus(vector unsigned int __a, vector unsigned int __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkuwus(__b, __a);
+#else
+  return __builtin_altivec_vpkuwus(__a, __b);
+#endif
+}
+
+/* vec_packsu */
+
+static vector unsigned char __ATTRS_o_ai
+vec_packsu(vector short __a, vector short __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkshus(__b, __a);
+#else
+  return __builtin_altivec_vpkshus(__a, __b);
+#endif
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_packsu(vector unsigned short __a, vector unsigned short __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkuhus(__b, __a);
+#else
+  return __builtin_altivec_vpkuhus(__a, __b);
+#endif
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_packsu(vector int __a, vector int __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkswus(__b, __a);
+#else
+  return __builtin_altivec_vpkswus(__a, __b);
+#endif
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_packsu(vector unsigned int __a, vector unsigned int __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkuwus(__b, __a);
+#else
+  return __builtin_altivec_vpkuwus(__a, __b);
+#endif
+}
+
+/* vec_vpkshus */
+
+static vector unsigned char __ATTRS_o_ai
+vec_vpkshus(vector short __a, vector short __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkshus(__b, __a);
+#else
+  return __builtin_altivec_vpkshus(__a, __b);
+#endif
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vpkshus(vector unsigned short __a, vector unsigned short __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkuhus(__b, __a);
+#else
+  return __builtin_altivec_vpkuhus(__a, __b);
+#endif
+}
+
+/* vec_vpkswus */
+
+static vector unsigned short __ATTRS_o_ai
+vec_vpkswus(vector int __a, vector int __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkswus(__b, __a);
+#else
+  return __builtin_altivec_vpkswus(__a, __b);
+#endif
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vpkswus(vector unsigned int __a, vector unsigned int __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkuwus(__b, __a);
+#else
+  return __builtin_altivec_vpkuwus(__a, __b);
+#endif
+}
+
+/* vec_perm */
+
+// The vperm instruction is defined architecturally with a big-endian bias.
+// For little endian, we swap the input operands and invert the permute
+// control vector.  Only the rightmost 5 bits matter, so we could use
+// a vector of all 31s instead of all 255s to perform the inversion.
+// However, when the PCV is not a constant, using 255 has an advantage
+// in that the vec_xor can be recognized as a vec_nor (and for P8 and
+// later, possibly a vec_nand).
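+//
+// For example, a permute-control byte of 0x00 selects element 0 of __a in
+// the programmer's (little endian) view.  Complementing it gives 0xFF, whose
+// low 5 bits are 31; with the operands swapped, byte 31 of the hardware's
+// big-endian concatenation {__b, __a} is precisely __a's element 0.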
+
+static vector signed char __ATTRS_o_ai
+vec_perm(vector signed char __a, vector signed char __b, vector unsigned char __c)
+{
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255,255,255,255,255,255,255,255,
+                              255,255,255,255,255,255,255,255};
+  __d = vec_xor(__c, __d);
+  return (vector signed char)
+           __builtin_altivec_vperm_4si((vector int)__b, (vector int)__a, __d);
+#else
+  return (vector signed char)
+           __builtin_altivec_vperm_4si((vector int)__a, (vector int)__b, __c);
+#endif
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_perm(vector unsigned char __a,
+         vector unsigned char __b,
+         vector unsigned char __c)
+{
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255,255,255,255,255,255,255,255,
+                              255,255,255,255,255,255,255,255};
+  __d = vec_xor(__c, __d);
+  return (vector unsigned char)
+           __builtin_altivec_vperm_4si((vector int)__b, (vector int)__a, __d);
+#else
+  return (vector unsigned char)
+           __builtin_altivec_vperm_4si((vector int)__a, (vector int)__b, __c);
+#endif
+}
+
+static vector bool char __ATTRS_o_ai
+vec_perm(vector bool char __a, vector bool char __b, vector unsigned char __c)
+{
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255,255,255,255,255,255,255,255,
+                              255,255,255,255,255,255,255,255};
+  __d = vec_xor(__c, __d);
+  return (vector bool char)
+           __builtin_altivec_vperm_4si((vector int)__b, (vector int)__a, __d);
+#else
+  return (vector bool char)
+           __builtin_altivec_vperm_4si((vector int)__a, (vector int)__b, __c);
+#endif
+}
+
+static vector short __ATTRS_o_ai
+vec_perm(vector short __a, vector short __b, vector unsigned char __c)
+{
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255,255,255,255,255,255,255,255,
+                              255,255,255,255,255,255,255,255};
+  __d = vec_xor(__c, __d);
+  return (vector short)
+           __builtin_altivec_vperm_4si((vector int)__b, (vector int)__a, __d);
+#else
+  return (vector short)
+           __builtin_altivec_vperm_4si((vector int)__a, (vector int)__b, __c);
+#endif
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_perm(vector unsigned short __a,
+         vector unsigned short __b,
+         vector unsigned char __c)
+{
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255,255,255,255,255,255,255,255,
+                              255,255,255,255,255,255,255,255};
+  __d = vec_xor(__c, __d);
+  return (vector unsigned short)
+           __builtin_altivec_vperm_4si((vector int)__b, (vector int)__a, __d);
+#else
+  return (vector unsigned short)
+           __builtin_altivec_vperm_4si((vector int)__a, (vector int)__b, __c);
+#endif
+}
+
+static vector bool short __ATTRS_o_ai
+vec_perm(vector bool short __a, vector bool short __b, vector unsigned char __c)
+{
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255,255,255,255,255,255,255,255,
+                              255,255,255,255,255,255,255,255};
+  __d = vec_xor(__c, __d);
+  return (vector bool short)
+           __builtin_altivec_vperm_4si((vector int)__b, (vector int)__a, __d);
+#else
+  return (vector bool short)
+           __builtin_altivec_vperm_4si((vector int)__a, (vector int)__b, __c);
+#endif
+}
+
+static vector pixel __ATTRS_o_ai
+vec_perm(vector pixel __a, vector pixel __b, vector unsigned char __c)
+{
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255,255,255,255,255,255,255,255,
+                              255,255,255,255,255,255,255,255};
+  __d = vec_xor(__c, __d);
+  return (vector pixel)
+           __builtin_altivec_vperm_4si((vector int)__b, (vector int)__a, __d);
+#else
+  return (vector pixel)
+           __builtin_altivec_vperm_4si((vector int)__a, (vector int)__b, __c);
+#endif
+}
+
+static vector int __ATTRS_o_ai
+vec_perm(vector int __a, vector int __b, vector unsigned char __c)
+{
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255,255,255,255,255,255,255,255,
+                              255,255,255,255,255,255,255,255};
+  __d = vec_xor(__c, __d);
+  return (vector int)__builtin_altivec_vperm_4si(__b, __a, __d);
+#else
+  return (vector int)__builtin_altivec_vperm_4si(__a, __b, __c);
+#endif
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_perm(vector unsigned int __a, vector unsigned int __b, vector unsigned char __c)
+{
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255,255,255,255,255,255,255,255,
+                              255,255,255,255,255,255,255,255};
+  __d = vec_xor(__c, __d);
+  return (vector unsigned int)
+           __builtin_altivec_vperm_4si((vector int)__b, (vector int)__a, __d);
+#else
+  return (vector unsigned int)
+           __builtin_altivec_vperm_4si((vector int)__a, (vector int)__b, __c);
+#endif
+}
+
+static vector bool int __ATTRS_o_ai
+vec_perm(vector bool int __a, vector bool int __b, vector unsigned char __c)
+{
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255,255,255,255,255,255,255,255,
+                              255,255,255,255,255,255,255,255};
+  __d = vec_xor(__c, __d);
+  return (vector bool int)
+           __builtin_altivec_vperm_4si((vector int)__b, (vector int)__a, __d);
+#else
+  return (vector bool int)
+           __builtin_altivec_vperm_4si((vector int)__a, (vector int)__b, __c);
+#endif
+}
+
+static vector float __ATTRS_o_ai
+vec_perm(vector float __a, vector float __b, vector unsigned char __c)
+{
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255,255,255,255,255,255,255,255,
+                              255,255,255,255,255,255,255,255};
+  __d = vec_xor(__c, __d);
+  return (vector float)
+           __builtin_altivec_vperm_4si((vector int)__b, (vector int)__a, __d);
+#else
+  return (vector float)
+           __builtin_altivec_vperm_4si((vector int)__a, (vector int)__b, __c);
+#endif
+}
+
+#ifdef __VSX__
+static vector long long __ATTRS_o_ai
+vec_perm(vector long long __a, vector long long __b, vector unsigned char __c)
+{
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255,255,255,255,255,255,255,255,
+                              255,255,255,255,255,255,255,255};
+  __d = vec_xor(__c, __d);
+  return (vector long long)__builtin_altivec_vperm_4si(__b, __a, __d);
+#else
+  return (vector long long)__builtin_altivec_vperm_4si(__a, __b, __c);
+#endif
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_perm(vector unsigned long long __a, vector unsigned long long __b,
+         vector unsigned char __c)
+{
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255,255,255,255,255,255,255,255,
+                              255,255,255,255,255,255,255,255};
+  __d = vec_xor(__c, __d);
+  return (vector unsigned long long)
+           __builtin_altivec_vperm_4si((vector int)__b, (vector int)__a, __d);
+#else
+  return (vector unsigned long long)
+           __builtin_altivec_vperm_4si((vector int)__a, (vector int)__b, __c);
+#endif
+}
+
+static vector double __ATTRS_o_ai
+vec_perm(vector double __a, vector double __b, vector unsigned char __c)
+{
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255,255,255,255,255,255,255,255,
+                              255,255,255,255,255,255,255,255};
+  __d = vec_xor(__c, __d);
+  return (vector double)
+           __builtin_altivec_vperm_4si((vector int)__b, (vector int)__a, __d);
+#else
+  return (vector double)
+           __builtin_altivec_vperm_4si((vector int)__a, (vector int)__b, __c);
+#endif
+}
+#endif
+
+/* vec_vperm */
+
+static vector signed char __ATTRS_o_ai
+vec_vperm(vector signed char __a, vector signed char __b, vector unsigned char __c)
+{
+  return vec_perm(__a, __b, __c);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vperm(vector unsigned char __a,
+          vector unsigned char __b,
+          vector unsigned char __c)
+{
+  return vec_perm(__a, __b, __c);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_vperm(vector bool char __a, vector bool char __b, vector unsigned char __c)
+{
+  return vec_perm(__a, __b, __c);
+}
+
+static vector short __ATTRS_o_ai
+vec_vperm(vector short __a, vector short __b, vector unsigned char __c)
+{
+  return vec_perm(__a, __b, __c);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vperm(vector unsigned short __a,
+          vector unsigned short __b,
+          vector unsigned char __c)
+{
+  return vec_perm(__a, __b, __c);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_vperm(vector bool short __a, vector bool short __b, vector unsigned char __c)
+{
+  return vec_perm(__a, __b, __c);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_vperm(vector pixel __a, vector pixel __b, vector unsigned char __c)
+{
+  return vec_perm(__a, __b, __c);
+}
+
+static vector int __ATTRS_o_ai
+vec_vperm(vector int __a, vector int __b, vector unsigned char __c)
+{
+  return vec_perm(__a, __b, __c);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vperm(vector unsigned int __a, vector unsigned int __b, vector unsigned char __c)
+{
+  return vec_perm(__a, __b, __c);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_vperm(vector bool int __a, vector bool int __b, vector unsigned char __c)
+{
+  return vec_perm(__a, __b, __c);
+}
+
+static vector float __ATTRS_o_ai
+vec_vperm(vector float __a, vector float __b, vector unsigned char __c)
+{
+  return vec_perm(__a, __b, __c);
+}
+
+#ifdef __VSX__
+static vector long long __ATTRS_o_ai
+vec_vperm(vector long long __a, vector long long __b, vector unsigned char __c)
+{
+  return vec_perm(__a, __b, __c);
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_vperm(vector unsigned long long __a, vector unsigned long long __b,
+          vector unsigned char __c)
+{
+  return vec_perm(__a, __b, __c);
+}
+
+static vector double __ATTRS_o_ai
+vec_vperm(vector double __a, vector double __b, vector unsigned char __c)
+{
+  return vec_perm(__a, __b, __c);
+}
+#endif
+
+/* vec_re */
+
+static vector float __attribute__((__always_inline__))
+vec_re(vector float __a)
+{
+  return __builtin_altivec_vrefp(__a);
+}
+
+/* vec_vrefp */
+
+static vector float __attribute__((__always_inline__))
+vec_vrefp(vector float __a)
+{
+  return __builtin_altivec_vrefp(__a);
+}
+
+/* vec_rl */
+
+static vector signed char __ATTRS_o_ai
+vec_rl(vector signed char __a, vector unsigned char __b)
+{
+  return (vector signed char)__builtin_altivec_vrlb((vector char)__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_rl(vector unsigned char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)__builtin_altivec_vrlb((vector char)__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_rl(vector short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vrlh(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_rl(vector unsigned short __a, vector unsigned short __b)
+{
+  return (vector unsigned short)__builtin_altivec_vrlh((vector short)__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_rl(vector int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vrlw(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_rl(vector unsigned int __a, vector unsigned int __b)
+{
+  return (vector unsigned int)__builtin_altivec_vrlw((vector int)__a, __b);
+}
+
+#ifdef __POWER8_VECTOR__
+static vector signed long long __ATTRS_o_ai
+vec_rl(vector signed long long __a, vector unsigned long long __b)
+{
+  return __builtin_altivec_vrld(__a, __b);
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_rl(vector unsigned long long __a, vector unsigned long long __b)
+{
+  return __builtin_altivec_vrld(__a, __b);
+}
+#endif
+
+/* vec_vrlb */
+
+static vector signed char __ATTRS_o_ai
+vec_vrlb(vector signed char __a, vector unsigned char __b)
+{
+  return (vector signed char)__builtin_altivec_vrlb((vector char)__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vrlb(vector unsigned char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)__builtin_altivec_vrlb((vector char)__a, __b);
+}
+
+/* vec_vrlh */
+
+static vector short __ATTRS_o_ai
+vec_vrlh(vector short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vrlh(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vrlh(vector unsigned short __a, vector unsigned short __b)
+{
+  return (vector unsigned short)__builtin_altivec_vrlh((vector short)__a, __b);
+}
+
+/* vec_vrlw */
+
+static vector int __ATTRS_o_ai
+vec_vrlw(vector int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vrlw(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vrlw(vector unsigned int __a, vector unsigned int __b)
+{
+  return (vector unsigned int)__builtin_altivec_vrlw((vector int)__a, __b);
+}
+
+/* vec_round */
+
+static vector float __attribute__((__always_inline__))
+vec_round(vector float __a)
+{
+  return __builtin_altivec_vrfin(__a);
+}
+
+/* vec_vrfin */
+
+static vector float __attribute__((__always_inline__))
+vec_vrfin(vector float __a)
+{
+  return __builtin_altivec_vrfin(__a);
+}
+
+/* vec_rsqrte */
+
+static vector float __attribute__((__always_inline__))
+vec_rsqrte(vector float __a)
+{
+  return __builtin_altivec_vrsqrtefp(__a);
+}
+
+/* vec_vrsqrtefp */
+
+static vector float __attribute__((__always_inline__))
+vec_vrsqrtefp(vector float __a)
+{
+  return __builtin_altivec_vrsqrtefp(__a);
+}
+
+/* vec_sel */
+
+#define __builtin_altivec_vsel_4si vec_sel
+
+static vector signed char __ATTRS_o_ai
+vec_sel(vector signed char __a, vector signed char __b, vector unsigned char __c)
+{
+  return (__a & ~(vector signed char)__c) | (__b & (vector signed char)__c);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_sel(vector signed char __a, vector signed char __b, vector bool char __c)
+{
+  return (__a & ~(vector signed char)__c) | (__b & (vector signed char)__c);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_sel(vector unsigned char __a, vector unsigned char __b, vector unsigned char __c)
+{
+  return (__a & ~__c) | (__b & __c);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_sel(vector unsigned char __a, vector unsigned char __b, vector bool char __c)
+{
+  return (__a & ~(vector unsigned char)__c) | (__b & (vector unsigned char)__c);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_sel(vector bool char __a, vector bool char __b, vector unsigned char __c)
+{
+  return (__a & ~(vector bool char)__c) | (__b & (vector bool char)__c);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_sel(vector bool char __a, vector bool char __b, vector bool char __c)
+{
+  return (__a & ~__c) | (__b & __c);
+}
+
+static vector short __ATTRS_o_ai
+vec_sel(vector short __a, vector short __b, vector unsigned short __c)
+{
+  return (__a & ~(vector short)__c) | (__b & (vector short)__c);
+}
+
+static vector short __ATTRS_o_ai
+vec_sel(vector short __a, vector short __b, vector bool short __c)
+{
+  return (__a & ~(vector short)__c) | (__b & (vector short)__c);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_sel(vector unsigned short __a,
+        vector unsigned short __b,
+        vector unsigned short __c)
+{
+  return (__a & ~__c) | (__b & __c);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_sel(vector unsigned short __a, vector unsigned short __b, vector bool short __c)
+{
+  return (__a & ~(vector unsigned short)__c) | (__b & (vector unsigned short)__c);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_sel(vector bool short __a, vector bool short __b, vector unsigned short __c)
+{
+  return (__a & ~(vector bool short)__c) | (__b & (vector bool short)__c);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_sel(vector bool short __a, vector bool short __b, vector bool short __c)
+{
+  return (__a & ~__c) | (__b & __c);
+}
+
+static vector int __ATTRS_o_ai
+vec_sel(vector int __a, vector int __b, vector unsigned int __c)
+{
+  return (__a & ~(vector int)__c) | (__b & (vector int)__c);
+}
+
+static vector int __ATTRS_o_ai
+vec_sel(vector int __a, vector int __b, vector bool int __c)
+{
+  return (__a & ~(vector int)__c) | (__b & (vector int)__c);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_sel(vector unsigned int __a, vector unsigned int __b, vector unsigned int __c)
+{
+  return (__a & ~__c) | (__b & __c);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_sel(vector unsigned int __a, vector unsigned int __b, vector bool int __c)
+{
+  return (__a & ~(vector unsigned int)__c) | (__b & (vector unsigned int)__c);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_sel(vector bool int __a, vector bool int __b, vector unsigned int __c)
+{
+  return (__a & ~(vector bool int)__c) | (__b & (vector bool int)__c);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_sel(vector bool int __a, vector bool int __b, vector bool int __c)
+{
+  return (__a & ~__c) | (__b & __c);
+}
+
+static vector float __ATTRS_o_ai
+vec_sel(vector float __a, vector float __b, vector unsigned int __c)
+{
+  vector int __res = ((vector int)__a & ~(vector int)__c)
+                   | ((vector int)__b & (vector int)__c);
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai
+vec_sel(vector float __a, vector float __b, vector bool int __c)
+{
+  vector int __res = ((vector int)__a & ~(vector int)__c)
+                   | ((vector int)__b & (vector int)__c);
+  return (vector float)__res;
+}
+
+/* vec_vsel */
+
+static vector signed char __ATTRS_o_ai
+vec_vsel(vector signed char __a, vector signed char __b, vector unsigned char __c)
+{
+  return (__a & ~(vector signed char)__c) | (__b & (vector signed char)__c);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vsel(vector signed char __a, vector signed char __b, vector bool char __c)
+{
+  return (__a & ~(vector signed char)__c) | (__b & (vector signed char)__c);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vsel(vector unsigned char __a, vector unsigned char __b, vector unsigned char __c)
+{
+  return (__a & ~__c) | (__b & __c);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vsel(vector unsigned char __a, vector unsigned char __b, vector bool char __c)
+{
+  return (__a & ~(vector unsigned char)__c) | (__b & (vector unsigned char)__c);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_vsel(vector bool char __a, vector bool char __b, vector unsigned char __c)
+{
+  return (__a & ~(vector bool char)__c) | (__b & (vector bool char)__c);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_vsel(vector bool char __a, vector bool char __b, vector bool char __c)
+{
+  return (__a & ~__c) | (__b & __c);
+}
+
+static vector short __ATTRS_o_ai
+vec_vsel(vector short __a, vector short __b, vector unsigned short __c)
+{
+  return (__a & ~(vector short)__c) | (__b & (vector short)__c);
+}
+
+static vector short __ATTRS_o_ai
+vec_vsel(vector short __a, vector short __b, vector bool short __c)
+{
+  return (__a & ~(vector short)__c) | (__b & (vector short)__c);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vsel(vector unsigned short __a,
+         vector unsigned short __b,
+         vector unsigned short __c)
+{
+  return (__a & ~__c) | (__b & __c);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vsel(vector unsigned short __a, vector unsigned short __b, vector bool short __c)
+{
+  return (__a & ~(vector unsigned short)__c) | (__b & (vector unsigned short)__c);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_vsel(vector bool short __a, vector bool short __b, vector unsigned short __c)
+{
+  return (__a & ~(vector bool short)__c) | (__b & (vector bool short)__c);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_vsel(vector bool short __a, vector bool short __b, vector bool short __c)
+{
+  return (__a & ~__c) | (__b & __c);
+}
+
+static vector int __ATTRS_o_ai
+vec_vsel(vector int __a, vector int __b, vector unsigned int __c)
+{
+  return (__a & ~(vector int)__c) | (__b & (vector int)__c);
+}
+
+static vector int __ATTRS_o_ai
+vec_vsel(vector int __a, vector int __b, vector bool int __c)
+{
+  return (__a & ~(vector int)__c) | (__b & (vector int)__c);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vsel(vector unsigned int __a, vector unsigned int __b, vector unsigned int __c)
+{
+  return (__a & ~__c) | (__b & __c);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vsel(vector unsigned int __a, vector unsigned int __b, vector bool int __c)
+{
+  return (__a & ~(vector unsigned int)__c) | (__b & (vector unsigned int)__c);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_vsel(vector bool int __a, vector bool int __b, vector unsigned int __c)
+{
+  return (__a & ~(vector bool int)__c) | (__b & (vector bool int)__c);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_vsel(vector bool int __a, vector bool int __b, vector bool int __c)
+{
+  return (__a & ~__c) | (__b & __c);
+}
+
+static vector float __ATTRS_o_ai
+vec_vsel(vector float __a, vector float __b, vector unsigned int __c)
+{
+  vector int __res = ((vector int)__a & ~(vector int)__c)
+                   | ((vector int)__b & (vector int)__c);
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai
+vec_vsel(vector float __a, vector float __b, vector bool int __c)
+{
+  vector int __res = ((vector int)__a & ~(vector int)__c)
+                   | ((vector int)__b & (vector int)__c);
+  return (vector float)__res;
+}
+
+/* vec_sl */
+
+static vector signed char __ATTRS_o_ai
+vec_sl(vector signed char __a, vector unsigned char __b)
+{
+  return __a << (vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_sl(vector unsigned char __a, vector unsigned char __b)
+{
+  return __a << __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_sl(vector short __a, vector unsigned short __b)
+{
+  return __a << (vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_sl(vector unsigned short __a, vector unsigned short __b)
+{
+  return __a << __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_sl(vector int __a, vector unsigned int __b)
+{
+  return __a << (vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_sl(vector unsigned int __a, vector unsigned int __b)
+{
+  return __a << __b;
+}
+
+#ifdef __POWER8_VECTOR__
+static vector signed long long __ATTRS_o_ai
+vec_sl(vector signed long long __a, vector unsigned long long __b)
+{
+  return __a << (vector long long)__b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_sl(vector unsigned long long __a, vector unsigned long long __b)
+{
+  return __a << __b;
+}
+#endif
+
+/* vec_vslb */
+
+#define __builtin_altivec_vslb vec_vslb
+
+static vector signed char __ATTRS_o_ai
+vec_vslb(vector signed char __a, vector unsigned char __b)
+{
+  return vec_sl(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vslb(vector unsigned char __a, vector unsigned char __b)
+{
+  return vec_sl(__a, __b);
+}
+
+/* vec_vslh */
+
+#define __builtin_altivec_vslh vec_vslh
+
+static vector short __ATTRS_o_ai
+vec_vslh(vector short __a, vector unsigned short __b)
+{
+  return vec_sl(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vslh(vector unsigned short __a, vector unsigned short __b)
+{
+  return vec_sl(__a, __b);
+}
+
+/* vec_vslw */
+
+#define __builtin_altivec_vslw vec_vslw
+
+static vector int __ATTRS_o_ai
+vec_vslw(vector int __a, vector unsigned int __b)
+{
+  return vec_sl(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vslw(vector unsigned int __a, vector unsigned int __b)
+{
+  return vec_sl(__a, __b);
+}
+
+/* vec_sld */
+
+#define __builtin_altivec_vsldoi_4si vec_sld
+
+static vector signed char __ATTRS_o_ai
+vec_sld(vector signed char __a, vector signed char __b, unsigned char __c)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c+1, __c+2,  __c+3,  __c+4,  __c+5,  __c+6,  __c+7,
+     __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15));
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_sld(vector unsigned char __a, vector unsigned char __b, unsigned char __c)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c+1, __c+2,  __c+3,  __c+4,  __c+5,  __c+6,  __c+7,
+     __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15));
+}
+
+static vector short __ATTRS_o_ai
+vec_sld(vector short __a, vector short __b, unsigned char __c)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c+1, __c+2,  __c+3,  __c+4,  __c+5,  __c+6,  __c+7,
+     __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15));
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_sld(vector unsigned short __a, vector unsigned short __b, unsigned char __c)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c+1, __c+2,  __c+3,  __c+4,  __c+5,  __c+6,  __c+7,
+     __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15));
+}
+
+static vector pixel __ATTRS_o_ai
+vec_sld(vector pixel __a, vector pixel __b, unsigned char __c)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c+1, __c+2,  __c+3,  __c+4,  __c+5,  __c+6,  __c+7,
+     __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15));
+}
+
+static vector int __ATTRS_o_ai
+vec_sld(vector int __a, vector int __b, unsigned char __c)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c+1, __c+2,  __c+3,  __c+4,  __c+5,  __c+6,  __c+7,
+     __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15));
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_sld(vector unsigned int __a, vector unsigned int __b, unsigned char __c)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c+1, __c+2,  __c+3,  __c+4,  __c+5,  __c+6,  __c+7,
+     __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15));
+}
+
+static vector float __ATTRS_o_ai
+vec_sld(vector float __a, vector float __b, unsigned char __c)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c+1, __c+2,  __c+3,  __c+4,  __c+5,  __c+6,  __c+7,
+     __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15));
+}
+
+/* vec_vsldoi */
+
+static vector signed char __ATTRS_o_ai
+vec_vsldoi(vector signed char __a, vector signed char __b, unsigned char __c)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c+1, __c+2,  __c+3,  __c+4,  __c+5,  __c+6,  __c+7,
+     __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15));
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vsldoi(vector unsigned char __a, vector unsigned char __b, unsigned char __c)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c+1, __c+2,  __c+3,  __c+4,  __c+5,  __c+6,  __c+7,
+     __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15));
+}
+
+static vector short __ATTRS_o_ai
+vec_vsldoi(vector short __a, vector short __b, unsigned char __c)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c+1, __c+2,  __c+3,  __c+4,  __c+5,  __c+6,  __c+7,
+     __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15));
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vsldoi(vector unsigned short __a, vector unsigned short __b, unsigned char __c)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c+1, __c+2,  __c+3,  __c+4,  __c+5,  __c+6,  __c+7,
+     __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15));
+}
+
+static vector pixel __ATTRS_o_ai
+vec_vsldoi(vector pixel __a, vector pixel __b, unsigned char __c)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c+1, __c+2,  __c+3,  __c+4,  __c+5,  __c+6,  __c+7,
+     __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15));
+}
+
+static vector int __ATTRS_o_ai
+vec_vsldoi(vector int __a, vector int __b, unsigned char __c)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c+1, __c+2,  __c+3,  __c+4,  __c+5,  __c+6,  __c+7,
+     __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15));
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vsldoi(vector unsigned int __a, vector unsigned int __b, unsigned char __c)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c+1, __c+2,  __c+3,  __c+4,  __c+5,  __c+6,  __c+7,
+     __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15));
+}
+
+static vector float __ATTRS_o_ai
+vec_vsldoi(vector float __a, vector float __b, unsigned char __c)
+{
+  return vec_perm(__a, __b, (vector unsigned char)
+    (__c,   __c+1, __c+2,  __c+3,  __c+4,  __c+5,  __c+6,  __c+7,
+     __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15));
+}
+
+/* vec_sll */
+
+static vector signed char __ATTRS_o_ai
+vec_sll(vector signed char __a, vector unsigned char __b)
+{
+  return (vector signed char)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_sll(vector signed char __a, vector unsigned short __b)
+{
+  return (vector signed char)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_sll(vector signed char __a, vector unsigned int __b)
+{
+  return (vector signed char)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_sll(vector unsigned char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_sll(vector unsigned char __a, vector unsigned short __b)
+{
+  return (vector unsigned char)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_sll(vector unsigned char __a, vector unsigned int __b)
+{
+  return (vector unsigned char)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_sll(vector bool char __a, vector unsigned char __b)
+{
+  return (vector bool char)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_sll(vector bool char __a, vector unsigned short __b)
+{
+  return (vector bool char)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_sll(vector bool char __a, vector unsigned int __b)
+{
+  return (vector bool char)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_sll(vector short __a, vector unsigned char __b)
+{
+  return (vector short)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_sll(vector short __a, vector unsigned short __b)
+{
+  return (vector short)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_sll(vector short __a, vector unsigned int __b)
+{
+  return (vector short)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_sll(vector unsigned short __a, vector unsigned char __b)
+{
+  return (vector unsigned short)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_sll(vector unsigned short __a, vector unsigned short __b)
+{
+  return (vector unsigned short)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_sll(vector unsigned short __a, vector unsigned int __b)
+{
+  return (vector unsigned short)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_sll(vector bool short __a, vector unsigned char __b)
+{
+  return (vector bool short)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_sll(vector bool short __a, vector unsigned short __b)
+{
+  return (vector bool short)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_sll(vector bool short __a, vector unsigned int __b)
+{
+  return (vector bool short)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_sll(vector pixel __a, vector unsigned char __b)
+{
+  return (vector pixel)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_sll(vector pixel __a, vector unsigned short __b)
+{
+  return (vector pixel)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_sll(vector pixel __a, vector unsigned int __b)
+{
+  return (vector pixel)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_sll(vector int __a, vector unsigned char __b)
+{
+  return (vector int)__builtin_altivec_vsl(__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_sll(vector int __a, vector unsigned short __b)
+{
+  return (vector int)__builtin_altivec_vsl(__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_sll(vector int __a, vector unsigned int __b)
+{
+  return (vector int)__builtin_altivec_vsl(__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_sll(vector unsigned int __a, vector unsigned char __b)
+{
+  return (vector unsigned int)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_sll(vector unsigned int __a, vector unsigned short __b)
+{
+  return (vector unsigned int)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_sll(vector unsigned int __a, vector unsigned int __b)
+{
+  return (vector unsigned int)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_sll(vector bool int __a, vector unsigned char __b)
+{
+  return (vector bool int)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_sll(vector bool int __a, vector unsigned short __b)
+{
+  return (vector bool int)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_sll(vector bool int __a, vector unsigned int __b)
+{
+  return (vector bool int)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+/* vec_vsl */
+
+static vector signed char __ATTRS_o_ai
+vec_vsl(vector signed char __a, vector unsigned char __b)
+{
+  return (vector signed char)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vsl(vector signed char __a, vector unsigned short __b)
+{
+  return (vector signed char)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vsl(vector signed char __a, vector unsigned int __b)
+{
+  return (vector signed char)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vsl(vector unsigned char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vsl(vector unsigned char __a, vector unsigned short __b)
+{
+  return (vector unsigned char)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vsl(vector unsigned char __a, vector unsigned int __b)
+{
+  return (vector unsigned char)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_vsl(vector bool char __a, vector unsigned char __b)
+{
+  return (vector bool char)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_vsl(vector bool char __a, vector unsigned short __b)
+{
+  return (vector bool char)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_vsl(vector bool char __a, vector unsigned int __b)
+{
+  return (vector bool char)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_vsl(vector short __a, vector unsigned char __b)
+{
+  return (vector short)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_vsl(vector short __a, vector unsigned short __b)
+{
+  return (vector short)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_vsl(vector short __a, vector unsigned int __b)
+{
+  return (vector short)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vsl(vector unsigned short __a, vector unsigned char __b)
+{
+  return (vector unsigned short)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vsl(vector unsigned short __a, vector unsigned short __b)
+{
+  return (vector unsigned short)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vsl(vector unsigned short __a, vector unsigned int __b)
+{
+  return (vector unsigned short)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_vsl(vector bool short __a, vector unsigned char __b)
+{
+  return (vector bool short)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_vsl(vector bool short __a, vector unsigned short __b)
+{
+  return (vector bool short)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_vsl(vector bool short __a, vector unsigned int __b)
+{
+  return (vector bool short)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_vsl(vector pixel __a, vector unsigned char __b)
+{
+  return (vector pixel)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_vsl(vector pixel __a, vector unsigned short __b)
+{
+  return (vector pixel)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_vsl(vector pixel __a, vector unsigned int __b)
+{
+  return (vector pixel)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_vsl(vector int __a, vector unsigned char __b)
+{
+  return (vector int)__builtin_altivec_vsl(__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_vsl(vector int __a, vector unsigned short __b)
+{
+  return (vector int)__builtin_altivec_vsl(__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_vsl(vector int __a, vector unsigned int __b)
+{
+  return (vector int)__builtin_altivec_vsl(__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vsl(vector unsigned int __a, vector unsigned char __b)
+{
+  return (vector unsigned int)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vsl(vector unsigned int __a, vector unsigned short __b)
+{
+  return (vector unsigned int)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vsl(vector unsigned int __a, vector unsigned int __b)
+{
+  return (vector unsigned int)
+           __builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_vsl(vector bool int __a, vector unsigned char __b)
+{
+  return (vector bool int)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_vsl(vector bool int __a, vector unsigned short __b)
+{
+  return (vector bool int)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_vsl(vector bool int __a, vector unsigned int __b)
+{
+  return (vector bool int)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+/* vec_slo */
+
+static vector signed char __ATTRS_o_ai
+vec_slo(vector signed char __a, vector signed char __b)
+{
+  return (vector signed char)
+           __builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_slo(vector signed char __a, vector unsigned char __b)
+{
+  return (vector signed char)
+           __builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_slo(vector unsigned char __a, vector signed char __b)
+{
+  return (vector unsigned char)
+           __builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_slo(vector unsigned char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)
+           __builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_slo(vector short __a, vector signed char __b)
+{
+  return (vector short)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_slo(vector short __a, vector unsigned char __b)
+{
+  return (vector short)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_slo(vector unsigned short __a, vector signed char __b)
+{
+  return (vector unsigned short)
+           __builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_slo(vector unsigned short __a, vector unsigned char __b)
+{
+  return (vector unsigned short)
+           __builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_slo(vector pixel __a, vector signed char __b)
+{
+  return (vector pixel)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_slo(vector pixel __a, vector unsigned char __b)
+{
+  return (vector pixel)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_slo(vector int __a, vector signed char __b)
+{
+  return (vector int)__builtin_altivec_vslo(__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_slo(vector int __a, vector unsigned char __b)
+{
+  return (vector int)__builtin_altivec_vslo(__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_slo(vector unsigned int __a, vector signed char __b)
+{
+  return (vector unsigned int)
+           __builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_slo(vector unsigned int __a, vector unsigned char __b)
+{
+  return (vector unsigned int)
+           __builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector float __ATTRS_o_ai
+vec_slo(vector float __a, vector signed char __b)
+{
+  return (vector float)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector float __ATTRS_o_ai
+vec_slo(vector float __a, vector unsigned char __b)
+{
+  return (vector float)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+/* vec_vslo */
+
+static vector signed char __ATTRS_o_ai
+vec_vslo(vector signed char __a, vector signed char __b)
+{
+  return (vector signed char)
+           __builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vslo(vector signed char __a, vector unsigned char __b)
+{
+  return (vector signed char)
+           __builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vslo(vector unsigned char __a, vector signed char __b)
+{
+  return (vector unsigned char)
+           __builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vslo(vector unsigned char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)
+           __builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_vslo(vector short __a, vector signed char __b)
+{
+  return (vector short)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_vslo(vector short __a, vector unsigned char __b)
+{
+  return (vector short)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vslo(vector unsigned short __a, vector signed char __b)
+{
+  return (vector unsigned short)
+           __builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vslo(vector unsigned short __a, vector unsigned char __b)
+{
+  return (vector unsigned short)
+           __builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_vslo(vector pixel __a, vector signed char __b)
+{
+  return (vector pixel)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_vslo(vector pixel __a, vector unsigned char __b)
+{
+  return (vector pixel)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_vslo(vector int __a, vector signed char __b)
+{
+  return (vector int)__builtin_altivec_vslo(__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_vslo(vector int __a, vector unsigned char __b)
+{
+  return (vector int)__builtin_altivec_vslo(__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vslo(vector unsigned int __a, vector signed char __b)
+{
+  return (vector unsigned int)
+           __builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vslo(vector unsigned int __a, vector unsigned char __b)
+{
+  return (vector unsigned int)
+           __builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector float __ATTRS_o_ai
+vec_vslo(vector float __a, vector signed char __b)
+{
+  return (vector float)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector float __ATTRS_o_ai
+vec_vslo(vector float __a, vector unsigned char __b)
+{
+  return (vector float)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+/* vec_splat */
+
+static vector signed char __ATTRS_o_ai
+vec_splat(vector signed char __a, unsigned char __b)
+{
+  return vec_perm(__a, __a, (vector unsigned char)(__b));
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_splat(vector unsigned char __a, unsigned char __b)
+{
+  return vec_perm(__a, __a, (vector unsigned char)(__b));
+}
+
+static vector bool char __ATTRS_o_ai
+vec_splat(vector bool char __a, unsigned char __b)
+{
+  return vec_perm(__a, __a, (vector unsigned char)(__b));
+}
+
+static vector short __ATTRS_o_ai
+vec_splat(vector short __a, unsigned char __b)
+{ 
+  __b *= 2;
+  unsigned char b1=__b+1;
+  return vec_perm(__a, __a, (vector unsigned char)
+    (__b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1));
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_splat(vector unsigned short __a, unsigned char __b)
+{ 
+  __b *= 2;
+  unsigned char b1=__b+1;
+  return vec_perm(__a, __a, (vector unsigned char)
+    (__b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1));
+}
+
+static vector bool short __ATTRS_o_ai
+vec_splat(vector bool short __a, unsigned char __b)
+{ 
+  __b *= 2;
+  unsigned char b1=__b+1;
+  return vec_perm(__a, __a, (vector unsigned char)
+    (__b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1));
+}
+
+static vector pixel __ATTRS_o_ai
+vec_splat(vector pixel __a, unsigned char __b)
+{ 
+  __b *= 2;
+  unsigned char b1=__b+1;
+  return vec_perm(__a, __a, (vector unsigned char)
+    (__b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1));
+}
+
+static vector int __ATTRS_o_ai
+vec_splat(vector int __a, unsigned char __b)
+{ 
+  __b *= 4;
+  unsigned char b1=__b+1, b2=__b+2, b3=__b+3;
+  return vec_perm(__a, __a, (vector unsigned char)
+    (__b, b1, b2, b3, __b, b1, b2, b3, __b, b1, b2, b3, __b, b1, b2, b3));
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_splat(vector unsigned int __a, unsigned char __b)
+{ 
+  __b *= 4;
+  unsigned char b1=__b+1, b2=__b+2, b3=__b+3;
+  return vec_perm(__a, __a, (vector unsigned char)
+    (__b, b1, b2, b3, __b, b1, b2, b3, __b, b1, b2, b3, __b, b1, b2, b3));
+}
+
+static vector bool int __ATTRS_o_ai
+vec_splat(vector bool int __a, unsigned char __b)
+{ 
+  __b *= 4;
+  unsigned char b1=__b+1, b2=__b+2, b3=__b+3;
+  return vec_perm(__a, __a, (vector unsigned char)
+    (__b, b1, b2, b3, __b, b1, b2, b3, __b, b1, b2, b3, __b, b1, b2, b3));
+}
+
+static vector float __ATTRS_o_ai
+vec_splat(vector float __a, unsigned char __b)
+{ 
+  __b *= 4;
+  unsigned char b1=__b+1, b2=__b+2, b3=__b+3;
+  return vec_perm(__a, __a, (vector unsigned char)
+    (__b, b1, b2, b3, __b, b1, b2, b3, __b, b1, b2, b3, __b, b1, b2, b3));
+}
+
+/* vec_vspltb */
+
+#define __builtin_altivec_vspltb vec_vspltb
+
+static vector signed char __ATTRS_o_ai
+vec_vspltb(vector signed char __a, unsigned char __b)
+{
+  return vec_perm(__a, __a, (vector unsigned char)(__b));
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vspltb(vector unsigned char __a, unsigned char __b)
+{
+  return vec_perm(__a, __a, (vector unsigned char)(__b));
+}
+
+static vector bool char __ATTRS_o_ai
+vec_vspltb(vector bool char __a, unsigned char __b)
+{
+  return vec_perm(__a, __a, (vector unsigned char)(__b));
+}
+
+/* vec_vsplth */
+
+#define __builtin_altivec_vsplth vec_vsplth
+
+static vector short __ATTRS_o_ai
+vec_vsplth(vector short __a, unsigned char __b)
+{
+  __b *= 2;
+  unsigned char b1=__b+1;
+  return vec_perm(__a, __a, (vector unsigned char)
+    (__b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1));
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vsplth(vector unsigned short __a, unsigned char __b)
+{
+  __b *= 2;
+  unsigned char b1=__b+1;
+  return vec_perm(__a, __a, (vector unsigned char)
+    (__b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1));
+}
+
+static vector bool short __ATTRS_o_ai
+vec_vsplth(vector bool short __a, unsigned char __b)
+{
+  __b *= 2;
+  unsigned char b1=__b+1;
+  return vec_perm(__a, __a, (vector unsigned char)
+    (__b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1));
+}
+
+static vector pixel __ATTRS_o_ai
+vec_vsplth(vector pixel __a, unsigned char __b)
+{
+  __b *= 2;
+  unsigned char b1=__b+1;
+  return vec_perm(__a, __a, (vector unsigned char)
+    (__b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1, __b, b1));
+}
+
+/* vec_vspltw */
+
+#define __builtin_altivec_vspltw vec_vspltw
+
+static vector int __ATTRS_o_ai
+vec_vspltw(vector int __a, unsigned char __b)
+{
+  __b *= 4;
+  unsigned char b1=__b+1, b2=__b+2, b3=__b+3;
+  return vec_perm(__a, __a, (vector unsigned char)
+    (__b, b1, b2, b3, __b, b1, b2, b3, __b, b1, b2, b3, __b, b1, b2, b3));
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vspltw(vector unsigned int __a, unsigned char __b)
+{
+  __b *= 4;
+  unsigned char b1=__b+1, b2=__b+2, b3=__b+3;
+  return vec_perm(__a, __a, (vector unsigned char)
+    (__b, b1, b2, b3, __b, b1, b2, b3, __b, b1, b2, b3, __b, b1, b2, b3));
+}
+
+static vector bool int __ATTRS_o_ai
+vec_vspltw(vector bool int __a, unsigned char __b)
+{
+  __b *= 4;
+  unsigned char b1=__b+1, b2=__b+2, b3=__b+3;
+  return vec_perm(__a, __a, (vector unsigned char)
+    (__b, b1, b2, b3, __b, b1, b2, b3, __b, b1, b2, b3, __b, b1, b2, b3));
+}
+
+static vector float __ATTRS_o_ai
+vec_vspltw(vector float __a, unsigned char __b)
+{
+  __b *= 4;
+  unsigned char b1=__b+1, b2=__b+2, b3=__b+3;
+  return vec_perm(__a, __a, (vector unsigned char)
+    (__b, b1, b2, b3, __b, b1, b2, b3, __b, b1, b2, b3, __b, b1, b2, b3));
+}
+
+/* vec_splat_s8 */
+
+#define __builtin_altivec_vspltisb vec_splat_s8
+
+// FIXME: parameter should be treated as 5-bit signed literal
+static vector signed char __ATTRS_o_ai
+vec_splat_s8(signed char __a)
+{
+  return (vector signed char)(__a);
+}
+
+/* vec_vspltisb */
+
+// FIXME: parameter should be treated as 5-bit signed literal
+static vector signed char __ATTRS_o_ai
+vec_vspltisb(signed char __a)
+{
+  return (vector signed char)(__a);
+}
+
+/* vec_splat_s16 */
+
+#define __builtin_altivec_vspltish vec_splat_s16
+
+// FIXME: parameter should be treated as 5-bit signed literal
+static vector short __ATTRS_o_ai
+vec_splat_s16(signed char __a)
+{
+  return (vector short)(__a);
+}
+
+/* vec_vspltish */
+
+// FIXME: parameter should be treated as 5-bit signed literal
+static vector short __ATTRS_o_ai
+vec_vspltish(signed char __a)
+{
+  return (vector short)(__a);
+}
+
+/* vec_splat_s32 */
+
+#define __builtin_altivec_vspltisw vec_splat_s32
+
+// FIXME: parameter should be treated as 5-bit signed literal
+static vector int __ATTRS_o_ai
+vec_splat_s32(signed char __a)
+{
+  return (vector int)(__a);
+}
+
+/* vec_vspltisw */
+
+// FIXME: parameter should be treated as 5-bit signed literal
+static vector int __ATTRS_o_ai
+vec_vspltisw(signed char __a)
+{
+  return (vector int)(__a);
+}
+
+/* vec_splat_u8 */
+
+// FIXME: parameter should be treated as 5-bit signed literal
+static vector unsigned char __ATTRS_o_ai
+vec_splat_u8(unsigned char __a)
+{
+  return (vector unsigned char)(__a);
+}
+
+/* vec_splat_u16 */
+
+// FIXME: parameter should be treated as 5-bit signed literal
+static vector unsigned short __ATTRS_o_ai
+vec_splat_u16(signed char __a)
+{
+  return (vector unsigned short)(__a);
+}
+
+/* vec_splat_u32 */
+
+// FIXME: parameter should be treated as 5-bit signed literal
+static vector unsigned int __ATTRS_o_ai
+vec_splat_u32(signed char __a)
+{
+  return (vector unsigned int)(__a);
+}
+
+/* vec_sr */
+
+static vector signed char __ATTRS_o_ai
+vec_sr(vector signed char __a, vector unsigned char __b)
+{
+  return __a >> (vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_sr(vector unsigned char __a, vector unsigned char __b)
+{
+  return __a >> __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_sr(vector short __a, vector unsigned short __b)
+{
+  return __a >> (vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_sr(vector unsigned short __a, vector unsigned short __b)
+{
+  return __a >> __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_sr(vector int __a, vector unsigned int __b)
+{
+  return __a >> (vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_sr(vector unsigned int __a, vector unsigned int __b)
+{
+  return __a >> __b;
+}
+
+#ifdef __POWER8_VECTOR__
+static vector signed long long __ATTRS_o_ai
+vec_sr(vector signed long long __a, vector unsigned long long __b)
+{
+  return __a >> (vector long long)__b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_sr(vector unsigned long long __a, vector unsigned long long __b)
+{
+  return __a >> __b;
+}
+#endif
+
+/* vec_vsrb */
+
+#define __builtin_altivec_vsrb vec_vsrb
+
+static vector signed char __ATTRS_o_ai
+vec_vsrb(vector signed char __a, vector unsigned char __b)
+{
+  return __a >> (vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vsrb(vector unsigned char __a, vector unsigned char __b)
+{
+  return __a >> __b;
+}
+
+/* vec_vsrh */
+
+#define __builtin_altivec_vsrh vec_vsrh
+
+static vector short __ATTRS_o_ai
+vec_vsrh(vector short __a, vector unsigned short __b)
+{
+  return __a >> (vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vsrh(vector unsigned short __a, vector unsigned short __b)
+{
+  return __a >> __b;
+}
+
+/* vec_vsrw */
+
+#define __builtin_altivec_vsrw vec_vsrw
+
+static vector int __ATTRS_o_ai
+vec_vsrw(vector int __a, vector unsigned int __b)
+{
+  return __a >> (vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vsrw(vector unsigned int __a, vector unsigned int __b)
+{
+  return __a >> __b;
+}
+
+/* vec_sra */
+
+static vector signed char __ATTRS_o_ai
+vec_sra(vector signed char __a, vector unsigned char __b)
+{
+  return (vector signed char)__builtin_altivec_vsrab((vector char)__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_sra(vector unsigned char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)__builtin_altivec_vsrab((vector char)__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_sra(vector short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vsrah(__a, (vector unsigned short)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_sra(vector unsigned short __a, vector unsigned short __b)
+{
+  return (vector unsigned short)__builtin_altivec_vsrah((vector short)__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_sra(vector int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vsraw(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_sra(vector unsigned int __a, vector unsigned int __b)
+{
+  return (vector unsigned int)__builtin_altivec_vsraw((vector int)__a, __b);
+}
+
+#ifdef __POWER8_VECTOR__
+static vector signed long long __ATTRS_o_ai
+vec_sra(vector signed long long __a, vector unsigned long long __b)
+{
+  return __a >> __b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_sra(vector unsigned long long __a, vector unsigned long long __b)
+{
+  return (vector unsigned long long) ( (vector signed long long) __a >> __b);
+}
+#endif
+
+/* vec_vsrab */
+
+static vector signed char __ATTRS_o_ai
+vec_vsrab(vector signed char __a, vector unsigned char __b)
+{
+  return (vector signed char)__builtin_altivec_vsrab((vector char)__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vsrab(vector unsigned char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)__builtin_altivec_vsrab((vector char)__a, __b);
+}
+
+/* vec_vsrah */
+
+static vector short __ATTRS_o_ai
+vec_vsrah(vector short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vsrah(__a, (vector unsigned short)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vsrah(vector unsigned short __a, vector unsigned short __b)
+{
+  return (vector unsigned short)__builtin_altivec_vsrah((vector short)__a, __b);
+}
+
+/* vec_vsraw */
+
+static vector int __ATTRS_o_ai
+vec_vsraw(vector int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vsraw(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vsraw(vector unsigned int __a, vector unsigned int __b)
+{
+  return (vector unsigned int)__builtin_altivec_vsraw((vector int)__a, __b);
+}
+
+/* vec_srl */
+
+static vector signed char __ATTRS_o_ai
+vec_srl(vector signed char __a, vector unsigned char __b)
+{
+  return (vector signed char)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_srl(vector signed char __a, vector unsigned short __b)
+{
+  return (vector signed char)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_srl(vector signed char __a, vector unsigned int __b)
+{
+  return (vector signed char)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_srl(vector unsigned char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_srl(vector unsigned char __a, vector unsigned short __b)
+{
+  return (vector unsigned char)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_srl(vector unsigned char __a, vector unsigned int __b)
+{
+  return (vector unsigned char)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_srl(vector bool char __a, vector unsigned char __b)
+{
+  return (vector bool char)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_srl(vector bool char __a, vector unsigned short __b)
+{
+  return (vector bool char)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_srl(vector bool char __a, vector unsigned int __b)
+{
+  return (vector bool char)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_srl(vector short __a, vector unsigned char __b)
+{
+  return (vector short)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_srl(vector short __a, vector unsigned short __b)
+{
+  return (vector short)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_srl(vector short __a, vector unsigned int __b)
+{
+  return (vector short)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_srl(vector unsigned short __a, vector unsigned char __b)
+{
+  return (vector unsigned short)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_srl(vector unsigned short __a, vector unsigned short __b)
+{
+  return (vector unsigned short)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_srl(vector unsigned short __a, vector unsigned int __b)
+{
+  return (vector unsigned short)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_srl(vector bool short __a, vector unsigned char __b)
+{
+  return (vector bool short)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_srl(vector bool short __a, vector unsigned short __b)
+{
+  return (vector bool short)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_srl(vector bool short __a, vector unsigned int __b)
+{
+  return (vector bool short)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_srl(vector pixel __a, vector unsigned char __b)
+{
+  return (vector pixel)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_srl(vector pixel __a, vector unsigned short __b)
+{
+  return (vector pixel)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_srl(vector pixel __a, vector unsigned int __b)
+{
+  return (vector pixel)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_srl(vector int __a, vector unsigned char __b)
+{
+  return (vector int)__builtin_altivec_vsr(__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_srl(vector int __a, vector unsigned short __b)
+{
+  return (vector int)__builtin_altivec_vsr(__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_srl(vector int __a, vector unsigned int __b)
+{
+  return (vector int)__builtin_altivec_vsr(__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_srl(vector unsigned int __a, vector unsigned char __b)
+{
+  return (vector unsigned int)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_srl(vector unsigned int __a, vector unsigned short __b)
+{
+  return (vector unsigned int)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_srl(vector unsigned int __a, vector unsigned int __b)
+{
+  return (vector unsigned int)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_srl(vector bool int __a, vector unsigned char __b)
+{
+  return (vector bool int)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_srl(vector bool int __a, vector unsigned short __b)
+{
+  return (vector bool int)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_srl(vector bool int __a, vector unsigned int __b)
+{
+  return (vector bool int)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+/* vec_vsr */
+
+static vector signed char __ATTRS_o_ai
+vec_vsr(vector signed char __a, vector unsigned char __b)
+{
+  return (vector signed char)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vsr(vector signed char __a, vector unsigned short __b)
+{
+  return (vector signed char)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vsr(vector signed char __a, vector unsigned int __b)
+{
+  return (vector signed char)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vsr(vector unsigned char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vsr(vector unsigned char __a, vector unsigned short __b)
+{
+  return (vector unsigned char)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vsr(vector unsigned char __a, vector unsigned int __b)
+{
+  return (vector unsigned char)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_vsr(vector bool char __a, vector unsigned char __b)
+{
+  return (vector bool char)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_vsr(vector bool char __a, vector unsigned short __b)
+{
+  return (vector bool char)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_vsr(vector bool char __a, vector unsigned int __b)
+{
+  return (vector bool char)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_vsr(vector short __a, vector unsigned char __b)
+{
+  return (vector short)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_vsr(vector short __a, vector unsigned short __b)
+{
+  return (vector short)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_vsr(vector short __a, vector unsigned int __b)
+{
+  return (vector short)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vsr(vector unsigned short __a, vector unsigned char __b)
+{
+  return (vector unsigned short)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vsr(vector unsigned short __a, vector unsigned short __b)
+{
+  return (vector unsigned short)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vsr(vector unsigned short __a, vector unsigned int __b)
+{
+  return (vector unsigned short)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_vsr(vector bool short __a, vector unsigned char __b)
+{
+  return (vector bool short)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_vsr(vector bool short __a, vector unsigned short __b)
+{
+  return (vector bool short)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_vsr(vector bool short __a, vector unsigned int __b)
+{
+  return (vector bool short)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_vsr(vector pixel __a, vector unsigned char __b)
+{
+  return (vector pixel)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_vsr(vector pixel __a, vector unsigned short __b)
+{
+  return (vector pixel)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_vsr(vector pixel __a, vector unsigned int __b)
+{
+  return (vector pixel)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_vsr(vector int __a, vector unsigned char __b)
+{
+  return (vector int)__builtin_altivec_vsr(__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_vsr(vector int __a, vector unsigned short __b)
+{
+  return (vector int)__builtin_altivec_vsr(__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_vsr(vector int __a, vector unsigned int __b)
+{
+  return (vector int)__builtin_altivec_vsr(__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vsr(vector unsigned int __a, vector unsigned char __b)
+{
+  return (vector unsigned int)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vsr(vector unsigned int __a, vector unsigned short __b)
+{
+  return (vector unsigned int)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vsr(vector unsigned int __a, vector unsigned int __b)
+{
+  return (vector unsigned int)
+           __builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_vsr(vector bool int __a, vector unsigned char __b)
+{
+  return (vector bool int)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_vsr(vector bool int __a, vector unsigned short __b)
+{
+  return (vector bool int)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_vsr(vector bool int __a, vector unsigned int __b)
+{
+  return (vector bool int)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+/* vec_sro */
+
+static vector signed char __ATTRS_o_ai
+vec_sro(vector signed char __a, vector signed char __b)
+{
+  return (vector signed char)
+           __builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_sro(vector signed char __a, vector unsigned char __b)
+{
+  return (vector signed char)
+           __builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_sro(vector unsigned char __a, vector signed char __b)
+{
+  return (vector unsigned char)
+           __builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_sro(vector unsigned char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)
+           __builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_sro(vector short __a, vector signed char __b)
+{
+  return (vector short)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_sro(vector short __a, vector unsigned char __b)
+{
+  return (vector short)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_sro(vector unsigned short __a, vector signed char __b)
+{
+  return (vector unsigned short)
+           __builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_sro(vector unsigned short __a, vector unsigned char __b)
+{
+  return (vector unsigned short)
+           __builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_sro(vector pixel __a, vector signed char __b)
+{
+  return (vector pixel)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_sro(vector pixel __a, vector unsigned char __b)
+{
+  return (vector pixel)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_sro(vector int __a, vector signed char __b)
+{
+  return (vector int)__builtin_altivec_vsro(__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_sro(vector int __a, vector unsigned char __b)
+{
+  return (vector int)__builtin_altivec_vsro(__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_sro(vector unsigned int __a, vector signed char __b)
+{
+  return (vector unsigned int)
+           __builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_sro(vector unsigned int __a, vector unsigned char __b)
+{
+  return (vector unsigned int)
+           __builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector float __ATTRS_o_ai
+vec_sro(vector float __a, vector signed char __b)
+{
+  return (vector float)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector float __ATTRS_o_ai
+vec_sro(vector float __a, vector unsigned char __b)
+{
+  return (vector float)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+/* vec_vsro */
+
+static vector signed char __ATTRS_o_ai
+vec_vsro(vector signed char __a, vector signed char __b)
+{
+  return (vector signed char)
+           __builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vsro(vector signed char __a, vector unsigned char __b)
+{
+  return (vector signed char)
+           __builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vsro(vector unsigned char __a, vector signed char __b)
+{
+  return (vector unsigned char)
+           __builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vsro(vector unsigned char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)
+           __builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_vsro(vector short __a, vector signed char __b)
+{
+  return (vector short)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_vsro(vector short __a, vector unsigned char __b)
+{
+  return (vector short)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vsro(vector unsigned short __a, vector signed char __b)
+{
+  return (vector unsigned short)
+           __builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vsro(vector unsigned short __a, vector unsigned char __b)
+{
+  return (vector unsigned short)
+           __builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_vsro(vector pixel __a, vector signed char __b)
+{
+  return (vector pixel)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai
+vec_vsro(vector pixel __a, vector unsigned char __b)
+{
+  return (vector pixel)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_vsro(vector int __a, vector signed char __b)
+{
+  return (vector int)__builtin_altivec_vsro(__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_vsro(vector int __a, vector unsigned char __b)
+{
+  return (vector int)__builtin_altivec_vsro(__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vsro(vector unsigned int __a, vector signed char __b)
+{
+  return (vector unsigned int)
+           __builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vsro(vector unsigned int __a, vector unsigned char __b)
+{
+  return (vector unsigned int)
+           __builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector float __ATTRS_o_ai
+vec_vsro(vector float __a, vector signed char __b)
+{
+  return (vector float)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector float __ATTRS_o_ai
+vec_vsro(vector float __a, vector unsigned char __b)
+{
+  return (vector float)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+/* vec_st */
+
+static void __ATTRS_o_ai
+vec_st(vector signed char __a, int __b, vector signed char *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector signed char __a, int __b, signed char *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector unsigned char __a, int __b, vector unsigned char *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector unsigned char __a, int __b, unsigned char *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector bool char __a, int __b, signed char *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector bool char __a, int __b, unsigned char *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector bool char __a, int __b, vector bool char *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector short __a, int __b, vector short *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector short __a, int __b, short *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector unsigned short __a, int __b, vector unsigned short *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector unsigned short __a, int __b, unsigned short *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector bool short __a, int __b, short *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector bool short __a, int __b, unsigned short *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector bool short __a, int __b, vector bool short *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector pixel __a, int __b, short *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector pixel __a, int __b, unsigned short *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector pixel __a, int __b, vector pixel *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector int __a, int __b, vector int *__c)
+{
+  __builtin_altivec_stvx(__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector int __a, int __b, int *__c)
+{
+  __builtin_altivec_stvx(__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector unsigned int __a, int __b, vector unsigned int *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector unsigned int __a, int __b, unsigned int *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector bool int __a, int __b, int *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector bool int __a, int __b, unsigned int *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector bool int __a, int __b, vector bool int *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector float __a, int __b, vector float *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_st(vector float __a, int __b, float *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+/* vec_stvx */
+
+static void __ATTRS_o_ai
+vec_stvx(vector signed char __a, int __b, vector signed char *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector signed char __a, int __b, signed char *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector unsigned char __a, int __b, vector unsigned char *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector unsigned char __a, int __b, unsigned char *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector bool char __a, int __b, signed char *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector bool char __a, int __b, unsigned char *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector bool char __a, int __b, vector bool char *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector short __a, int __b, vector short *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector short __a, int __b, short *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector unsigned short __a, int __b, vector unsigned short *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector unsigned short __a, int __b, unsigned short *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector bool short __a, int __b, short *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector bool short __a, int __b, unsigned short *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector bool short __a, int __b, vector bool short *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector pixel __a, int __b, short *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector pixel __a, int __b, unsigned short *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector pixel __a, int __b, vector pixel *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector int __a, int __b, vector int *__c)
+{
+  __builtin_altivec_stvx(__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector int __a, int __b, int *__c)
+{
+  __builtin_altivec_stvx(__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector unsigned int __a, int __b, vector unsigned int *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector unsigned int __a, int __b, unsigned int *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector bool int __a, int __b, int *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector bool int __a, int __b, unsigned int *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector bool int __a, int __b, vector bool int *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector float __a, int __b, vector float *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvx(vector float __a, int __b, float *__c)
+{
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+/* vec_ste */
+
+static void __ATTRS_o_ai
+vec_ste(vector signed char __a, int __b, signed char *__c)
+{
+  __builtin_altivec_stvebx((vector char)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_ste(vector unsigned char __a, int __b, unsigned char *__c)
+{
+  __builtin_altivec_stvebx((vector char)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_ste(vector bool char __a, int __b, signed char *__c)
+{
+  __builtin_altivec_stvebx((vector char)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_ste(vector bool char __a, int __b, unsigned char *__c)
+{
+  __builtin_altivec_stvebx((vector char)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_ste(vector short __a, int __b, short *__c)
+{
+  __builtin_altivec_stvehx(__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_ste(vector unsigned short __a, int __b, unsigned short *__c)
+{
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_ste(vector bool short __a, int __b, short *__c)
+{
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_ste(vector bool short __a, int __b, unsigned short *__c)
+{
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_ste(vector pixel __a, int __b, short *__c)
+{
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_ste(vector pixel __a, int __b, unsigned short *__c)
+{
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_ste(vector int __a, int __b, int *__c)
+{
+  __builtin_altivec_stvewx(__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_ste(vector unsigned int __a, int __b, unsigned int *__c)
+{
+  __builtin_altivec_stvewx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_ste(vector bool int __a, int __b, int *__c)
+{
+  __builtin_altivec_stvewx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_ste(vector bool int __a, int __b, unsigned int *__c)
+{
+  __builtin_altivec_stvewx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_ste(vector float __a, int __b, float *__c)
+{
+  __builtin_altivec_stvewx((vector int)__a, __b, __c);
+}
+
+/* vec_stvebx */
+
+static void __ATTRS_o_ai
+vec_stvebx(vector signed char __a, int __b, signed char *__c)
+{
+  __builtin_altivec_stvebx((vector char)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvebx(vector unsigned char __a, int __b, unsigned char *__c)
+{
+  __builtin_altivec_stvebx((vector char)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvebx(vector bool char __a, int __b, signed char *__c)
+{
+  __builtin_altivec_stvebx((vector char)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvebx(vector bool char __a, int __b, unsigned char *__c)
+{
+  __builtin_altivec_stvebx((vector char)__a, __b, __c);
+}
+
+/* vec_stvehx */
+
+static void __ATTRS_o_ai
+vec_stvehx(vector short __a, int __b, short *__c)
+{
+  __builtin_altivec_stvehx(__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvehx(vector unsigned short __a, int __b, unsigned short *__c)
+{
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvehx(vector bool short __a, int __b, short *__c)
+{
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvehx(vector bool short __a, int __b, unsigned short *__c)
+{
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvehx(vector pixel __a, int __b, short *__c)
+{
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvehx(vector pixel __a, int __b, unsigned short *__c)
+{
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+/* vec_stvewx */
+
+static void __ATTRS_o_ai
+vec_stvewx(vector int __a, int __b, int *__c)
+{
+  __builtin_altivec_stvewx(__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvewx(vector unsigned int __a, int __b, unsigned int *__c)
+{
+  __builtin_altivec_stvewx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvewx(vector bool int __a, int __b, int *__c)
+{
+  __builtin_altivec_stvewx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvewx(vector bool int __a, int __b, unsigned int *__c)
+{
+  __builtin_altivec_stvewx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvewx(vector float __a, int __b, float *__c)
+{
+  __builtin_altivec_stvewx((vector int)__a, __b, __c);
+}
+
+/* vec_stl */
+
+static void __ATTRS_o_ai
+vec_stl(vector signed char __a, int __b, vector signed char *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector signed char __a, int __b, signed char *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector unsigned char __a, int __b, vector unsigned char *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector unsigned char __a, int __b, unsigned char *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector bool char __a, int __b, signed char *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector bool char __a, int __b, unsigned char *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector bool char __a, int __b, vector bool char *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector short __a, int __b, vector short *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector short __a, int __b, short *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector unsigned short __a, int __b, vector unsigned short *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector unsigned short __a, int __b, unsigned short *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector bool short __a, int __b, short *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector bool short __a, int __b, unsigned short *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector bool short __a, int __b, vector bool short *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector pixel __a, int __b, short *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector pixel __a, int __b, unsigned short *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector pixel __a, int __b, vector pixel *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector int __a, int __b, vector int *__c)
+{
+  __builtin_altivec_stvxl(__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector int __a, int __b, int *__c)
+{
+  __builtin_altivec_stvxl(__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector unsigned int __a, int __b, vector unsigned int *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector unsigned int __a, int __b, unsigned int *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector bool int __a, int __b, int *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector bool int __a, int __b, unsigned int *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector bool int __a, int __b, vector bool int *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector float __a, int __b, vector float *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stl(vector float __a, int __b, float *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+/* vec_stvxl */
+
+static void __ATTRS_o_ai
+vec_stvxl(vector signed char __a, int __b, vector signed char *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector signed char __a, int __b, signed char *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector unsigned char __a, int __b, vector unsigned char *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector unsigned char __a, int __b, unsigned char *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector bool char __a, int __b, signed char *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector bool char __a, int __b, unsigned char *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector bool char __a, int __b, vector bool char *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector short __a, int __b, vector short *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector short __a, int __b, short *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector unsigned short __a, int __b, vector unsigned short *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector unsigned short __a, int __b, unsigned short *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector bool short __a, int __b, short *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector bool short __a, int __b, unsigned short *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector bool short __a, int __b, vector bool short *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector pixel __a, int __b, short *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector pixel __a, int __b, unsigned short *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector pixel __a, int __b, vector pixel *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector int __a, int __b, vector int *__c)
+{
+  __builtin_altivec_stvxl(__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector int __a, int __b, int *__c)
+{
+  __builtin_altivec_stvxl(__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector unsigned int __a, int __b, vector unsigned int *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector unsigned int __a, int __b, unsigned int *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector bool int __a, int __b, int *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector bool int __a, int __b, unsigned int *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector bool int __a, int __b, vector bool int *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector float __a, int __b, vector float *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvxl(vector float __a, int __b, float *__c)
+{
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+/* vec_sub */
+
+static vector signed char __ATTRS_o_ai
+vec_sub(vector signed char __a, vector signed char __b)
+{
+  return __a - __b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_sub(vector bool char __a, vector signed char __b)
+{
+  return (vector signed char)__a - __b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_sub(vector signed char __a, vector bool char __b)
+{
+  return __a - (vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_sub(vector unsigned char __a, vector unsigned char __b)
+{
+  return __a - __b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_sub(vector bool char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)__a - __b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_sub(vector unsigned char __a, vector bool char __b)
+{
+  return __a - (vector unsigned char)__b;
+}
+
+static vector short __ATTRS_o_ai
+vec_sub(vector short __a, vector short __b)
+{
+  return __a - __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_sub(vector bool short __a, vector short __b)
+{
+  return (vector short)__a - __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_sub(vector short __a, vector bool short __b)
+{
+  return __a - (vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_sub(vector unsigned short __a, vector unsigned short __b)
+{
+  return __a - __b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_sub(vector bool short __a, vector unsigned short __b)
+{
+  return (vector unsigned short)__a - __b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_sub(vector unsigned short __a, vector bool short __b)
+{
+  return __a - (vector unsigned short)__b;
+}
+
+static vector int __ATTRS_o_ai
+vec_sub(vector int __a, vector int __b)
+{
+  return __a - __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_sub(vector bool int __a, vector int __b)
+{
+  return (vector int)__a - __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_sub(vector int __a, vector bool int __b)
+{
+  return __a - (vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_sub(vector unsigned int __a, vector unsigned int __b)
+{
+  return __a - __b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_sub(vector bool int __a, vector unsigned int __b)
+{
+  return (vector unsigned int)__a - __b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_sub(vector unsigned int __a, vector bool int __b)
+{
+  return __a - (vector unsigned int)__b;
+}
+
+static vector float __ATTRS_o_ai
+vec_sub(vector float __a, vector float __b)
+{
+  return __a - __b;
+}
+
+/* vec_vsububm */
+
+#define __builtin_altivec_vsububm vec_vsububm
+
+static vector signed char __ATTRS_o_ai
+vec_vsububm(vector signed char __a, vector signed char __b)
+{
+  return __a - __b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vsububm(vector bool char __a, vector signed char __b)
+{
+  return (vector signed char)__a - __b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vsububm(vector signed char __a, vector bool char __b)
+{
+  return __a - (vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vsububm(vector unsigned char __a, vector unsigned char __b)
+{
+  return __a - __b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vsububm(vector bool char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)__a - __b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vsububm(vector unsigned char __a, vector bool char __b)
+{
+  return __a - (vector unsigned char)__b;
+}
+
+/* vec_vsubuhm */
+
+#define __builtin_altivec_vsubuhm vec_vsubuhm
+
+static vector short __ATTRS_o_ai
+vec_vsubuhm(vector short __a, vector short __b)
+{
+  return __a - __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_vsubuhm(vector bool short __a, vector short __b)
+{
+  return (vector short)__a - __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_vsubuhm(vector short __a, vector bool short __b)
+{
+  return __a - (vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vsubuhm(vector unsigned short __a, vector unsigned short __b)
+{
+  return __a - __b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vsubuhm(vector bool short __a, vector unsigned short __b)
+{
+  return (vector unsigned short)__a - __b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vsubuhm(vector unsigned short __a, vector bool short __b)
+{
+  return __a - (vector unsigned short)__b;
+}
+
+/* vec_vsubuwm */
+
+#define __builtin_altivec_vsubuwm vec_vsubuwm
+
+static vector int __ATTRS_o_ai
+vec_vsubuwm(vector int __a, vector int __b)
+{
+  return __a - __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_vsubuwm(vector bool int __a, vector int __b)
+{
+  return (vector int)__a - __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_vsubuwm(vector int __a, vector bool int __b)
+{
+  return __a - (vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vsubuwm(vector unsigned int __a, vector unsigned int __b)
+{
+  return __a - __b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vsubuwm(vector bool int __a, vector unsigned int __b)
+{
+  return (vector unsigned int)__a - __b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vsubuwm(vector unsigned int __a, vector bool int __b)
+{
+  return __a - (vector unsigned int)__b;
+}
+
+/* vec_vsubfp */
+
+#define __builtin_altivec_vsubfp vec_vsubfp
+
+static vector float __attribute__((__always_inline__))
+vec_vsubfp(vector float __a, vector float __b)
+{
+  return __a - __b;
+}
+
+/* vec_subc */
+
+static vector unsigned int __attribute__((__always_inline__))
+vec_subc(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vsubcuw(__a, __b);
+}
+
+/* vec_vsubcuw */
+
+static vector unsigned int __attribute__((__always_inline__))
+vec_vsubcuw(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vsubcuw(__a, __b);
+}
+
+/* vec_subs */
+
+static vector signed char __ATTRS_o_ai
+vec_subs(vector signed char __a, vector signed char __b)
+{
+  return __builtin_altivec_vsubsbs(__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_subs(vector bool char __a, vector signed char __b)
+{
+  return __builtin_altivec_vsubsbs((vector signed char)__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_subs(vector signed char __a, vector bool char __b)
+{
+  return __builtin_altivec_vsubsbs(__a, (vector signed char)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_subs(vector unsigned char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vsububs(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_subs(vector bool char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vsububs((vector unsigned char)__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_subs(vector unsigned char __a, vector bool char __b)
+{
+  return __builtin_altivec_vsububs(__a, (vector unsigned char)__b);
+}
+
+static vector short __ATTRS_o_ai
+vec_subs(vector short __a, vector short __b)
+{
+  return __builtin_altivec_vsubshs(__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_subs(vector bool short __a, vector short __b)
+{
+  return __builtin_altivec_vsubshs((vector short)__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_subs(vector short __a, vector bool short __b)
+{
+  return __builtin_altivec_vsubshs(__a, (vector short)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_subs(vector unsigned short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vsubuhs(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_subs(vector bool short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vsubuhs((vector unsigned short)__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_subs(vector unsigned short __a, vector bool short __b)
+{
+  return __builtin_altivec_vsubuhs(__a, (vector unsigned short)__b);
+}
+
+static vector int __ATTRS_o_ai
+vec_subs(vector int __a, vector int __b)
+{
+  return __builtin_altivec_vsubsws(__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_subs(vector bool int __a, vector int __b)
+{
+  return __builtin_altivec_vsubsws((vector int)__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_subs(vector int __a, vector bool int __b)
+{
+  return __builtin_altivec_vsubsws(__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_subs(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vsubuws(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_subs(vector bool int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vsubuws((vector unsigned int)__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_subs(vector unsigned int __a, vector bool int __b)
+{
+  return __builtin_altivec_vsubuws(__a, (vector unsigned int)__b);
+}
+
+/* vec_vsubsbs */
+
+static vector signed char __ATTRS_o_ai
+vec_vsubsbs(vector signed char __a, vector signed char __b)
+{
+  return __builtin_altivec_vsubsbs(__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vsubsbs(vector bool char __a, vector signed char __b)
+{
+  return __builtin_altivec_vsubsbs((vector signed char)__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vsubsbs(vector signed char __a, vector bool char __b)
+{
+  return __builtin_altivec_vsubsbs(__a, (vector signed char)__b);
+}
+
+/* vec_vsububs */
+
+static vector unsigned char __ATTRS_o_ai
+vec_vsububs(vector unsigned char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vsububs(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vsububs(vector bool char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vsububs((vector unsigned char)__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vsububs(vector unsigned char __a, vector bool char __b)
+{
+  return __builtin_altivec_vsububs(__a, (vector unsigned char)__b);
+}
+
+/* vec_vsubshs */
+
+static vector short __ATTRS_o_ai
+vec_vsubshs(vector short __a, vector short __b)
+{
+  return __builtin_altivec_vsubshs(__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_vsubshs(vector bool short __a, vector short __b)
+{
+  return __builtin_altivec_vsubshs((vector short)__a, __b);
+}
+
+static vector short __ATTRS_o_ai
+vec_vsubshs(vector short __a, vector bool short __b)
+{
+  return __builtin_altivec_vsubshs(__a, (vector short)__b);
+}
+
+/* vec_vsubuhs */
+
+static vector unsigned short __ATTRS_o_ai
+vec_vsubuhs(vector unsigned short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vsubuhs(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vsubuhs(vector bool short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vsubuhs((vector unsigned short)__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vsubuhs(vector unsigned short __a, vector bool short __b)
+{
+  return __builtin_altivec_vsubuhs(__a, (vector unsigned short)__b);
+}
+
+/* vec_vsubsws */
+
+static vector int __ATTRS_o_ai
+vec_vsubsws(vector int __a, vector int __b)
+{
+  return __builtin_altivec_vsubsws(__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_vsubsws(vector bool int __a, vector int __b)
+{
+  return __builtin_altivec_vsubsws((vector int)__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_vsubsws(vector int __a, vector bool int __b)
+{
+  return __builtin_altivec_vsubsws(__a, (vector int)__b);
+}
+
+/* vec_vsubuws */
+
+static vector unsigned int __ATTRS_o_ai
+vec_vsubuws(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vsubuws(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vsubuws(vector bool int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vsubuws((vector unsigned int)__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vsubuws(vector unsigned int __a, vector bool int __b)
+{
+  return __builtin_altivec_vsubuws(__a, (vector unsigned int)__b);
+}
+
+/* vec_sum4s */
+
+static vector int __ATTRS_o_ai
+vec_sum4s(vector signed char __a, vector int __b)
+{
+  return __builtin_altivec_vsum4sbs(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_sum4s(vector unsigned char __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vsum4ubs(__a, __b);
+}
+
+static vector int __ATTRS_o_ai
+vec_sum4s(vector signed short __a, vector int __b)
+{
+  return __builtin_altivec_vsum4shs(__a, __b);
+}
+
+/* vec_vsum4sbs */
+
+static vector int __attribute__((__always_inline__))
+vec_vsum4sbs(vector signed char __a, vector int __b)
+{
+  return __builtin_altivec_vsum4sbs(__a, __b);
+}
+
+/* vec_vsum4ubs */
+
+static vector unsigned int __attribute__((__always_inline__))
+vec_vsum4ubs(vector unsigned char __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vsum4ubs(__a, __b);
+}
+
+/* vec_vsum4shs */
+
+static vector int __attribute__((__always_inline__))
+vec_vsum4shs(vector signed short __a, vector int __b)
+{
+  return __builtin_altivec_vsum4shs(__a, __b);
+}
+
+/* vec_sum2s */
+
+/* The vsum2sws instruction has a big-endian bias, so that the second
+   input vector and the result always reference big-endian elements
+   1 and 3 (little-endian elements 0 and 2).  For ease of porting, the
+   programmer wants elements 1 and 3 in both cases, so for little
+   endian we must perform some permutes.  */
+
+static vector signed int __attribute__((__always_inline__))
+vec_sum2s(vector int __a, vector int __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  vector int __c = (vector signed int)
+    vec_perm(__b, __b, (vector unsigned char)
+             (4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11));
+  __c = __builtin_altivec_vsum2sws(__a, __c);
+  return (vector signed int)
+    vec_perm(__c, __c, (vector unsigned char)
+             (4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11));
+#else
+  return __builtin_altivec_vsum2sws(__a, __b);
+#endif
+}
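+
+/* Illustrative usage (hypothetical values, not part of the original header):
+ *   vector int __va = (vector int)(1, 2, 3, 4);
+ *   vector int __vb = (vector int)(10, 20, 30, 40);
+ *   vector int __vs = vec_sum2s(__va, __vb);  // {0, 1+2+20, 0, 3+4+40} = {0, 23, 0, 47}
+ * Each odd result element holds the sum of one pair of __va elements plus
+ * the corresponding __vb element, in the same positions on either endianness. */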
+
+/* vec_vsum2sws */
+
+static vector signed int __attribute__((__always_inline__))
+vec_vsum2sws(vector int __a, vector int __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  vector int __c = (vector signed int)
+    vec_perm(__b, __b, (vector unsigned char)
+             (4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11));
+  __c = __builtin_altivec_vsum2sws(__a, __c);
+  return (vector signed int)
+    vec_perm(__c, __c, (vector unsigned char)
+             (4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11));
+#else
+  return __builtin_altivec_vsum2sws(__a, __b);
+#endif
+}
+
+/* vec_sums */
+
+/* The vsumsws instruction has a big-endian bias, so that the second
+   input vector and the result always reference big-endian element 3
+   (little-endian element 0).  For ease of porting, the programmer
+   wants element 3 in both cases, so for little endian we must perform
+   some permutes.  */
+
+static vector signed int __attribute__((__always_inline__))
+vec_sums(vector signed int __a, vector signed int __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  __b = (vector signed int)vec_splat(__b, 3);
+  __b = __builtin_altivec_vsumsws(__a, __b);
+  return (vector signed int)(0, 0, 0, __b[0]);
+#else
+  return __builtin_altivec_vsumsws(__a, __b);
+#endif
+}
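+
+/* Illustrative usage (hypothetical values, not part of the original header):
+ *   vector signed int __va = (vector signed int)(1, 2, 3, 4);
+ *   vector signed int __vb = (vector signed int)(0, 0, 0, 100);
+ *   vector signed int __vs = vec_sums(__va, __vb);  // {0, 0, 0, 110}
+ * The saturated sum of all elements of __va plus element 3 of __vb is
+ * delivered in element 3 of the result on both endiannesses. */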
+
+/* vec_vsumsws */
+
+static vector signed int __attribute__((__always_inline__))
+vec_vsumsws(vector signed int __a, vector signed int __b)
+{
+#ifdef __LITTLE_ENDIAN__
+  __b = (vector signed int)vec_splat(__b, 3);
+  __b = __builtin_altivec_vsumsws(__a, __b);
+  return (vector signed int)(0, 0, 0, __b[0]);
+#else
+  return __builtin_altivec_vsumsws(__a, __b);
+#endif
+}
+
+/* vec_trunc */
+
+static vector float __attribute__((__always_inline__))
+vec_trunc(vector float __a)
+{
+  return __builtin_altivec_vrfiz(__a);
+}
+
+/* vec_vrfiz */
+
+static vector float __attribute__((__always_inline__))
+vec_vrfiz(vector float __a)
+{
+  return __builtin_altivec_vrfiz(__a);
+}
+
+/* vec_unpackh */
+
+/* The vector unpack instructions all have a big-endian bias, so for
+   little endian we must reverse the meanings of "high" and "low."  */
+
+static vector short __ATTRS_o_ai
+vec_unpackh(vector signed char __a)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupklsb((vector char)__a);
+#else
+  return __builtin_altivec_vupkhsb((vector char)__a);
+#endif
+}
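+
+/* Illustrative usage (hypothetical values, not part of the original header):
+ *   vector signed char __vc =
+ *     (vector signed char)(-1, 2, -3, 4, 5, 6, 7, 8,
+ *                          9, 10, 11, 12, 13, 14, 15, 16);
+ *   vector short __vh = vec_unpackh(__vc);  // {-1, 2, -3, 4, 5, 6, 7, 8}
+ * Elements 0 through 7 are sign-extended on both endiannesses; the
+ * vec_unpackl overloads defined below extend elements 8 through 15. */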
+
+static vector bool short __ATTRS_o_ai
+vec_unpackh(vector bool char __a)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool short)__builtin_altivec_vupklsb((vector char)__a);
+#else
+  return (vector bool short)__builtin_altivec_vupkhsb((vector char)__a);
+#endif
+}
+
+static vector int __ATTRS_o_ai
+vec_unpackh(vector short __a)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupklsh(__a);
+#else
+  return __builtin_altivec_vupkhsh(__a);
+#endif
+}
+
+static vector bool int __ATTRS_o_ai
+vec_unpackh(vector bool short __a)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool int)__builtin_altivec_vupklsh((vector short)__a);
+#else
+  return (vector bool int)__builtin_altivec_vupkhsh((vector short)__a);
+#endif
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_unpackh(vector pixel __a)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector unsigned int)__builtin_altivec_vupklpx((vector short)__a);
+#else
+  return (vector unsigned int)__builtin_altivec_vupkhpx((vector short)__a);
+#endif
+}
+
+/* vec_vupkhsb */
+
+static vector short __ATTRS_o_ai
+vec_vupkhsb(vector signed char __a)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupklsb((vector char)__a);
+#else
+  return __builtin_altivec_vupkhsb((vector char)__a);
+#endif
+}
+
+static vector bool short __ATTRS_o_ai
+vec_vupkhsb(vector bool char __a)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool short)__builtin_altivec_vupklsb((vector char)__a);
+#else
+  return (vector bool short)__builtin_altivec_vupkhsb((vector char)__a);
+#endif
+}
+
+/* vec_vupkhsh */
+
+static vector int __ATTRS_o_ai
+vec_vupkhsh(vector short __a)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupklsh(__a);
+#else
+  return __builtin_altivec_vupkhsh(__a);
+#endif
+}
+
+static vector bool int __ATTRS_o_ai
+vec_vupkhsh(vector bool short __a)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool int)__builtin_altivec_vupklsh((vector short)__a);
+#else
+  return (vector bool int)__builtin_altivec_vupkhsh((vector short)__a);
+#endif
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vupkhsh(vector pixel __a)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector unsigned int)__builtin_altivec_vupklpx((vector short)__a);
+#else
+  return (vector unsigned int)__builtin_altivec_vupkhpx((vector short)__a);
+#endif
+}
+
+/* vec_unpackl */
+
+static vector short __ATTRS_o_ai
+vec_unpackl(vector signed char __a)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupkhsb((vector char)__a);
+#else
+  return __builtin_altivec_vupklsb((vector char)__a);
+#endif
+}
+
+static vector bool short __ATTRS_o_ai
+vec_unpackl(vector bool char __a)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool short)__builtin_altivec_vupkhsb((vector char)__a);
+#else
+  return (vector bool short)__builtin_altivec_vupklsb((vector char)__a);
+#endif
+}
+
+static vector int __ATTRS_o_ai
+vec_unpackl(vector short __a)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupkhsh(__a);
+#else
+  return __builtin_altivec_vupklsh(__a);
+#endif
+}
+
+static vector bool int __ATTRS_o_ai
+vec_unpackl(vector bool short __a)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool int)__builtin_altivec_vupkhsh((vector short)__a);
+#else
+  return (vector bool int)__builtin_altivec_vupklsh((vector short)__a);
+#endif
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_unpackl(vector pixel __a)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector unsigned int)__builtin_altivec_vupkhpx((vector short)__a);
+#else
+  return (vector unsigned int)__builtin_altivec_vupklpx((vector short)__a);
+#endif
+}
+
+/* vec_vupklsb */
+
+static vector short __ATTRS_o_ai
+vec_vupklsb(vector signed char __a)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupkhsb((vector char)__a);
+#else
+  return __builtin_altivec_vupklsb((vector char)__a);
+#endif
+}
+
+static vector bool short __ATTRS_o_ai
+vec_vupklsb(vector bool char __a)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool short)__builtin_altivec_vupkhsb((vector char)__a);
+#else
+  return (vector bool short)__builtin_altivec_vupklsb((vector char)__a);
+#endif
+}
+
+/* vec_vupklsh */
+
+static vector int __ATTRS_o_ai
+vec_vupklsh(vector short __a)
+{
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupkhsh(__a);
+#else
+  return __builtin_altivec_vupklsh(__a);
+#endif
+}
+
+static vector bool int __ATTRS_o_ai
+vec_vupklsh(vector bool short __a)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool int)__builtin_altivec_vupkhsh((vector short)__a);
+#else
+  return (vector bool int)__builtin_altivec_vupklsh((vector short)__a);
+#endif
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vupklsh(vector pixel __a)
+{
+#ifdef __LITTLE_ENDIAN__
+  return (vector unsigned int)__builtin_altivec_vupkhpx((vector short)__a);
+#else
+  return (vector unsigned int)__builtin_altivec_vupklpx((vector short)__a);
+#endif
+}
+
+/* vec_vsx_ld */
+
+#ifdef __VSX__
+
+static vector signed int __ATTRS_o_ai
+vec_vsx_ld(int __a, const vector signed int *__b)
+{
+  return (vector signed int)__builtin_vsx_lxvw4x(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vsx_ld(int __a, const vector unsigned int *__b)
+{
+  return (vector unsigned int)__builtin_vsx_lxvw4x(__a, __b);
+}
+
+static vector float __ATTRS_o_ai
+vec_vsx_ld(int __a, const vector float *__b)
+{
+  return (vector float)__builtin_vsx_lxvw4x(__a, __b);
+}
+
+static vector signed long long __ATTRS_o_ai
+vec_vsx_ld(int __a, const vector signed long long *__b)
+{
+  return (vector signed long long)__builtin_vsx_lxvd2x(__a, __b);
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_vsx_ld(int __a, const vector unsigned long long *__b)
+{
+  return (vector unsigned long long)__builtin_vsx_lxvd2x(__a, __b);
+}
+
+static vector double __ATTRS_o_ai
+vec_vsx_ld(int __a, const vector double *__b)
+{
+  return (vector double)__builtin_vsx_lxvd2x(__a, __b);
+}
+
+#endif
+
+/* vec_vsx_st */
+
+#ifdef __VSX__
+
+static void __ATTRS_o_ai
+vec_vsx_st(vector signed int __a, int __b, vector signed int *__c)
+{
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_vsx_st(vector unsigned int __a, int __b, vector unsigned int *__c)
+{
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_vsx_st(vector float __a, int __b, vector float *__c)
+{
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_vsx_st(vector signed long long __a, int __b, vector signed long long *__c)
+{
+  __builtin_vsx_stxvd2x((vector double)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_vsx_st(vector unsigned long long __a, int __b,
+           vector unsigned long long *__c)
+{
+  __builtin_vsx_stxvd2x((vector double)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_vsx_st(vector double __a, int __b, vector double *__c)
+{
+  __builtin_vsx_stxvd2x((vector double)__a, __b, __c);
+}
+
+#endif
+
+/* vec_xor */
+
+#define __builtin_altivec_vxor vec_xor
+
+static vector signed char __ATTRS_o_ai
+vec_xor(vector signed char __a, vector signed char __b)
+{
+  return __a ^ __b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_xor(vector bool char __a, vector signed char __b)
+{
+  return (vector signed char)__a ^ __b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_xor(vector signed char __a, vector bool char __b)
+{
+  return __a ^ (vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_xor(vector unsigned char __a, vector unsigned char __b)
+{
+  return __a ^ __b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_xor(vector bool char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)__a ^ __b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_xor(vector unsigned char __a, vector bool char __b)
+{
+  return __a ^ (vector unsigned char)__b;
+}
+
+static vector bool char __ATTRS_o_ai
+vec_xor(vector bool char __a, vector bool char __b)
+{
+  return __a ^ __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_xor(vector short __a, vector short __b)
+{
+  return __a ^ __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_xor(vector bool short __a, vector short __b)
+{
+  return (vector short)__a ^ __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_xor(vector short __a, vector bool short __b)
+{
+  return __a ^ (vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_xor(vector unsigned short __a, vector unsigned short __b)
+{
+  return __a ^ __b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_xor(vector bool short __a, vector unsigned short __b)
+{
+  return (vector unsigned short)__a ^ __b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_xor(vector unsigned short __a, vector bool short __b)
+{
+  return __a ^ (vector unsigned short)__b;
+}
+
+static vector bool short __ATTRS_o_ai
+vec_xor(vector bool short __a, vector bool short __b)
+{
+  return __a ^ __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_xor(vector int __a, vector int __b)
+{
+  return __a ^ __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_xor(vector bool int __a, vector int __b)
+{
+  return (vector int)__a ^ __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_xor(vector int __a, vector bool int __b)
+{
+  return __a ^ (vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_xor(vector unsigned int __a, vector unsigned int __b)
+{
+  return __a ^ __b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_xor(vector bool int __a, vector unsigned int __b)
+{
+  return (vector unsigned int)__a ^ __b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_xor(vector unsigned int __a, vector bool int __b)
+{
+  return __a ^ (vector unsigned int)__b;
+}
+
+static vector bool int __ATTRS_o_ai
+vec_xor(vector bool int __a, vector bool int __b)
+{
+  return __a ^ __b;
+}
+
+static vector float __ATTRS_o_ai
+vec_xor(vector float __a, vector float __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a ^ (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai
+vec_xor(vector bool int __a, vector float __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a ^ (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai
+vec_xor(vector float __a, vector bool int __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a ^ (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+/* vec_vxor */
+
+static vector signed char __ATTRS_o_ai
+vec_vxor(vector signed char __a, vector signed char __b)
+{
+  return __a ^ __b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vxor(vector bool char __a, vector signed char __b)
+{
+  return (vector signed char)__a ^ __b;
+}
+
+static vector signed char __ATTRS_o_ai
+vec_vxor(vector signed char __a, vector bool char __b)
+{
+  return __a ^ (vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vxor(vector unsigned char __a, vector unsigned char __b)
+{
+  return __a ^ __b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vxor(vector bool char __a, vector unsigned char __b)
+{
+  return (vector unsigned char)__a ^ __b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vxor(vector unsigned char __a, vector bool char __b)
+{
+  return __a ^ (vector unsigned char)__b;
+}
+
+static vector bool char __ATTRS_o_ai
+vec_vxor(vector bool char __a, vector bool char __b)
+{
+  return __a ^ __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_vxor(vector short __a, vector short __b)
+{
+  return __a ^ __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_vxor(vector bool short __a, vector short __b)
+{
+  return (vector short)__a ^ __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_vxor(vector short __a, vector bool short __b)
+{
+  return __a ^ (vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vxor(vector unsigned short __a, vector unsigned short __b)
+{
+  return __a ^ __b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vxor(vector bool short __a, vector unsigned short __b)
+{
+  return (vector unsigned short)__a ^ __b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vxor(vector unsigned short __a, vector bool short __b)
+{
+  return __a ^ (vector unsigned short)__b;
+}
+
+static vector bool short __ATTRS_o_ai
+vec_vxor(vector bool short __a, vector bool short __b)
+{
+  return __a ^ __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_vxor(vector int __a, vector int __b)
+{
+  return __a ^ __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_vxor(vector bool int __a, vector int __b)
+{
+  return (vector int)__a ^ __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_vxor(vector int __a, vector bool int __b)
+{
+  return __a ^ (vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vxor(vector unsigned int __a, vector unsigned int __b)
+{
+  return __a ^ __b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vxor(vector bool int __a, vector unsigned int __b)
+{
+  return (vector unsigned int)__a ^ __b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vxor(vector unsigned int __a, vector bool int __b)
+{
+  return __a ^ (vector unsigned int)__b;
+}
+
+static vector bool int __ATTRS_o_ai
+vec_vxor(vector bool int __a, vector bool int __b)
+{
+  return __a ^ __b;
+}
+
+static vector float __ATTRS_o_ai
+vec_vxor(vector float __a, vector float __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a ^ (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai
+vec_vxor(vector bool int __a, vector float __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a ^ (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai
+vec_vxor(vector float __a, vector bool int __b)
+{
+  vector unsigned int __res = (vector unsigned int)__a ^ (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+/* ------------------------ extensions for CBEA ----------------------------- */
+
+/* vec_extract */
+
+static signed char __ATTRS_o_ai
+vec_extract(vector signed char __a, int __b)
+{
+  return __a[__b];
+}
+
+static unsigned char __ATTRS_o_ai
+vec_extract(vector unsigned char __a, int __b)
+{
+  return __a[__b];
+}
+
+static short __ATTRS_o_ai
+vec_extract(vector short __a, int __b)
+{
+  return __a[__b];
+}
+
+static unsigned short __ATTRS_o_ai
+vec_extract(vector unsigned short __a, int __b)
+{
+  return __a[__b];
+}
+
+static int __ATTRS_o_ai
+vec_extract(vector int __a, int __b)
+{
+  return __a[__b];
+}
+
+static unsigned int __ATTRS_o_ai
+vec_extract(vector unsigned int __a, int __b)
+{
+  return __a[__b];
+}
+
+static float __ATTRS_o_ai
+vec_extract(vector float __a, int __b)
+{
+  return __a[__b];
+}
+
+/* vec_insert */
+
+static vector signed char __ATTRS_o_ai
+vec_insert(signed char __a, vector signed char __b, int __c)
+{
+  __b[__c] = __a;
+  return __b;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_insert(unsigned char __a, vector unsigned char __b, int __c)
+{
+  __b[__c] = __a;
+  return __b;
+}
+
+static vector short __ATTRS_o_ai
+vec_insert(short __a, vector short __b, int __c)
+{
+  __b[__c] = __a;
+  return __b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_insert(unsigned short __a, vector unsigned short __b, int __c)
+{
+  __b[__c] = __a;
+  return __b;
+}
+
+static vector int __ATTRS_o_ai
+vec_insert(int __a, vector int __b, int __c)
+{
+  __b[__c] = __a;
+  return __b;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_insert(unsigned int __a, vector unsigned int __b, int __c)
+{
+  __b[__c] = __a;
+  return __b;
+}
+
+static vector float __ATTRS_o_ai
+vec_insert(float __a, vector float __b, int __c)
+{
+  __b[__c] = __a;
+  return __b;
+}
+
+/* vec_lvlx */
+
+static vector signed char __ATTRS_o_ai
+vec_lvlx(int __a, const signed char *__b)
+{
+  return vec_perm(vec_ld(__a, __b),
+                  (vector signed char)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static vector signed char __ATTRS_o_ai
+vec_lvlx(int __a, const vector signed char *__b)
+{
+  return vec_perm(vec_ld(__a, __b),
+                  (vector signed char)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvlx(int __a, const unsigned char *__b)
+{
+  return vec_perm(vec_ld(__a, __b),
+                  (vector unsigned char)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvlx(int __a, const vector unsigned char *__b)
+{
+  return vec_perm(vec_ld(__a, __b),
+                  (vector unsigned char)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector bool char __ATTRS_o_ai
+vec_lvlx(int __a, const vector bool char *__b)
+{
+  return vec_perm(vec_ld(__a, __b),
+                  (vector bool char)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector short __ATTRS_o_ai
+vec_lvlx(int __a, const short *__b)
+{
+  return vec_perm(vec_ld(__a, __b),
+                  (vector short)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static vector short __ATTRS_o_ai
+vec_lvlx(int __a, const vector short *__b)
+{
+  return vec_perm(vec_ld(__a, __b),
+                  (vector short)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_lvlx(int __a, const unsigned short *__b)
+{
+  return vec_perm(vec_ld(__a, __b),
+                  (vector unsigned short)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_lvlx(int __a, const vector unsigned short *__b)
+{
+  return vec_perm(vec_ld(__a, __b),
+                  (vector unsigned short)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector bool short __ATTRS_o_ai
+vec_lvlx(int __a, const vector bool short *__b)
+{
+  return vec_perm(vec_ld(__a, __b),
+                  (vector bool short)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector pixel __ATTRS_o_ai
+vec_lvlx(int __a, const vector pixel *__b)
+{
+  return vec_perm(vec_ld(__a, __b),
+                  (vector pixel)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector int __ATTRS_o_ai
+vec_lvlx(int __a, const int *__b)
+{
+  return vec_perm(vec_ld(__a, __b),
+                  (vector int)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static vector int __ATTRS_o_ai
+vec_lvlx(int __a, const vector int *__b)
+{
+  return vec_perm(vec_ld(__a, __b),
+                  (vector int)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_lvlx(int __a, const unsigned int *__b)
+{
+  return vec_perm(vec_ld(__a, __b),
+                  (vector unsigned int)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_lvlx(int __a, const vector unsigned int *__b)
+{
+  return vec_perm(vec_ld(__a, __b),
+                  (vector unsigned int)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector bool int __ATTRS_o_ai
+vec_lvlx(int __a, const vector bool int *__b)
+{
+  return vec_perm(vec_ld(__a, __b),
+                  (vector bool int)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector float __ATTRS_o_ai
+vec_lvlx(int __a, const float *__b)
+{
+  return vec_perm(vec_ld(__a, __b),
+                  (vector float)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static vector float __ATTRS_o_ai
+vec_lvlx(int __a, const vector float *__b)
+{
+  return vec_perm(vec_ld(__a, __b),
+                  (vector float)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+/* vec_lvlxl */
+
+static vector signed char __ATTRS_o_ai
+vec_lvlxl(int __a, const signed char *__b)
+{
+  return vec_perm(vec_ldl(__a, __b),
+                  (vector signed char)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static vector signed char __ATTRS_o_ai
+vec_lvlxl(int __a, const vector signed char *__b)
+{
+  return vec_perm(vec_ldl(__a, __b),
+                  (vector signed char)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvlxl(int __a, const unsigned char *__b)
+{
+  return vec_perm(vec_ldl(__a, __b),
+                  (vector unsigned char)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvlxl(int __a, const vector unsigned char *__b)
+{
+  return vec_perm(vec_ldl(__a, __b),
+                  (vector unsigned char)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector bool char __ATTRS_o_ai
+vec_lvlxl(int __a, const vector bool char *__b)
+{
+  return vec_perm(vec_ldl(__a, __b),
+                  (vector bool char)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector short __ATTRS_o_ai
+vec_lvlxl(int __a, const short *__b)
+{
+  return vec_perm(vec_ldl(__a, __b),
+                  (vector short)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static vector short __ATTRS_o_ai
+vec_lvlxl(int __a, const vector short *__b)
+{
+  return vec_perm(vec_ldl(__a, __b),
+                  (vector short)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_lvlxl(int __a, const unsigned short *__b)
+{
+  return vec_perm(vec_ldl(__a, __b),
+                  (vector unsigned short)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_lvlxl(int __a, const vector unsigned short *__b)
+{
+  return vec_perm(vec_ldl(__a, __b),
+                  (vector unsigned short)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector bool short __ATTRS_o_ai
+vec_lvlxl(int __a, const vector bool short *__b)
+{
+  return vec_perm(vec_ldl(__a, __b),
+                  (vector bool short)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector pixel __ATTRS_o_ai
+vec_lvlxl(int __a, const vector pixel *__b)
+{
+  return vec_perm(vec_ldl(__a, __b),
+                  (vector pixel)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector int __ATTRS_o_ai
+vec_lvlxl(int __a, const int *__b)
+{
+  return vec_perm(vec_ldl(__a, __b),
+                  (vector int)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static vector int __ATTRS_o_ai
+vec_lvlxl(int __a, const vector int *__b)
+{
+  return vec_perm(vec_ldl(__a, __b),
+                  (vector int)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_lvlxl(int __a, const unsigned int *__b)
+{
+  return vec_perm(vec_ldl(__a, __b),
+                  (vector unsigned int)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_lvlxl(int __a, const vector unsigned int *__b)
+{
+  return vec_perm(vec_ldl(__a, __b),
+                  (vector unsigned int)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector bool int __ATTRS_o_ai
+vec_lvlxl(int __a, const vector bool int *__b)
+{
+  return vec_perm(vec_ldl(__a, __b),
+                  (vector bool int)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector float __ATTRS_o_ai
+vec_lvlxl(int __a, const float *__b)
+{
+  return vec_perm(vec_ldl(__a, __b),
+                  (vector float)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static vector float __ATTRS_o_ai
+vec_lvlxl(int __a, const vector float *__b)
+{
+  return vec_perm(vec_ldl(__a, __b),
+                  (vector float)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+/* vec_lvrx */
+
+static vector signed char __ATTRS_o_ai
+vec_lvrx(int __a, const signed char *__b)
+{
+  return vec_perm((vector signed char)(0),
+                  vec_ld(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static vector signed char __ATTRS_o_ai
+vec_lvrx(int __a, const vector signed char *__b)
+{
+  return vec_perm((vector signed char)(0),
+                  vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvrx(int __a, const unsigned char *__b)
+{
+  return vec_perm((vector unsigned char)(0),
+                  vec_ld(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvrx(int __a, const vector unsigned char *__b)
+{
+  return vec_perm((vector unsigned char)(0),
+                  vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector bool char __ATTRS_o_ai
+vec_lvrx(int __a, const vector bool char *__b)
+{
+  return vec_perm((vector bool char)(0),
+                  vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector short __ATTRS_o_ai
+vec_lvrx(int __a, const short *__b)
+{
+  return vec_perm((vector short)(0),
+                  vec_ld(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static vector short __ATTRS_o_ai
+vec_lvrx(int __a, const vector short *__b)
+{
+  return vec_perm((vector short)(0),
+                  vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_lvrx(int __a, const unsigned short *__b)
+{
+  return vec_perm((vector unsigned short)(0),
+                  vec_ld(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_lvrx(int __a, const vector unsigned short *__b)
+{
+  return vec_perm((vector unsigned short)(0),
+                  vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector bool short __ATTRS_o_ai
+vec_lvrx(int __a, const vector bool short *__b)
+{
+  return vec_perm((vector bool short)(0),
+                  vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector pixel __ATTRS_o_ai
+vec_lvrx(int __a, const vector pixel *__b)
+{
+  return vec_perm((vector pixel)(0),
+                  vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector int __ATTRS_o_ai
+vec_lvrx(int __a, const int *__b)
+{
+  return vec_perm((vector int)(0),
+                  vec_ld(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static vector int __ATTRS_o_ai
+vec_lvrx(int __a, const vector int *__b)
+{
+  return vec_perm((vector int)(0),
+                  vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_lvrx(int __a, const unsigned int *__b)
+{
+  return vec_perm((vector unsigned int)(0),
+                  vec_ld(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_lvrx(int __a, const vector unsigned int *__b)
+{
+  return vec_perm((vector unsigned int)(0),
+                  vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector bool int __ATTRS_o_ai
+vec_lvrx(int __a, const vector bool int *__b)
+{
+  return vec_perm((vector bool int)(0),
+                  vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector float __ATTRS_o_ai
+vec_lvrx(int __a, const float *__b)
+{
+  return vec_perm((vector float)(0),
+                  vec_ld(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static vector float __ATTRS_o_ai
+vec_lvrx(int __a, const vector float *__b)
+{
+  return vec_perm((vector float)(0),
+                  vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+/* vec_lvrxl */
+
+static vector signed char __ATTRS_o_ai
+vec_lvrxl(int __a, const signed char *__b)
+{
+  return vec_perm((vector signed char)(0),
+                  vec_ldl(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static vector signed char __ATTRS_o_ai
+vec_lvrxl(int __a, const vector signed char *__b)
+{
+  return vec_perm((vector signed char)(0),
+                  vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvrxl(int __a, const unsigned char *__b)
+{
+  return vec_perm((vector unsigned char)(0),
+                  vec_ldl(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvrxl(int __a, const vector unsigned char *__b)
+{
+  return vec_perm((vector unsigned char)(0),
+                  vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector bool char __ATTRS_o_ai
+vec_lvrxl(int __a, const vector bool char *__b)
+{
+  return vec_perm((vector bool char)(0),
+                  vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector short __ATTRS_o_ai
+vec_lvrxl(int __a, const short *__b)
+{
+  return vec_perm((vector short)(0),
+                  vec_ldl(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static vector short __ATTRS_o_ai
+vec_lvrxl(int __a, const vector short *__b)
+{
+  return vec_perm((vector short)(0),
+                  vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_lvrxl(int __a, const unsigned short *__b)
+{
+  return vec_perm((vector unsigned short)(0),
+                  vec_ldl(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_lvrxl(int __a, const vector unsigned short *__b)
+{
+  return vec_perm((vector unsigned short)(0),
+                  vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector bool short __ATTRS_o_ai
+vec_lvrxl(int __a, const vector bool short *__b)
+{
+  return vec_perm((vector bool short)(0),
+                  vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector pixel __ATTRS_o_ai
+vec_lvrxl(int __a, const vector pixel *__b)
+{
+  return vec_perm((vector pixel)(0),
+                  vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector int __ATTRS_o_ai
+vec_lvrxl(int __a, const int *__b)
+{
+  return vec_perm((vector int)(0),
+                  vec_ldl(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static vector int __ATTRS_o_ai
+vec_lvrxl(int __a, const vector int *__b)
+{
+  return vec_perm((vector int)(0),
+                  vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_lvrxl(int __a, const unsigned int *__b)
+{
+  return vec_perm((vector unsigned int)(0),
+                  vec_ldl(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_lvrxl(int __a, const vector unsigned int *__b)
+{
+  return vec_perm((vector unsigned int)(0),
+                  vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector bool int __ATTRS_o_ai
+vec_lvrxl(int __a, const vector bool int *__b)
+{
+  return vec_perm((vector bool int)(0),
+                  vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector float __ATTRS_o_ai
+vec_lvrxl(int __a, const float *__b)
+{
+  return vec_perm((vector float)(0),
+                  vec_ldl(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static vector float __ATTRS_o_ai
+vec_lvrxl(int __a, const vector float *__b)
+{
+  return vec_perm((vector float)(0),
+                  vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+/* vec_stvlx */
+
+static void __ATTRS_o_ai
+vec_stvlx(vector signed char __a, int __b, signed char *__c)
+{
+  return vec_st(vec_perm(vec_lvrx(__b, __c),
+                         __a,
+                         vec_lvsr(__b, __c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlx(vector signed char __a, int __b, vector signed char *__c)
+{
+  return vec_st(vec_perm(vec_lvrx(__b, __c),
+                         __a,
+                         vec_lvsr(__b, (unsigned char *)__c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlx(vector unsigned char __a, int __b, unsigned char *__c)
+{
+  return vec_st(vec_perm(vec_lvrx(__b, __c),
+                         __a,
+                         vec_lvsr(__b, __c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlx(vector unsigned char __a, int __b, vector unsigned char *__c)
+{
+  return vec_st(vec_perm(vec_lvrx(__b, __c),
+                         __a,
+                         vec_lvsr(__b, (unsigned char *)__c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlx(vector bool char __a, int __b, vector bool char *__c)
+{
+  return vec_st(vec_perm(vec_lvrx(__b, __c),
+                         __a,
+                         vec_lvsr(__b, (unsigned char *)__c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlx(vector short __a, int __b, short *__c)
+{
+  return vec_st(vec_perm(vec_lvrx(__b, __c),
+                         __a,
+                         vec_lvsr(__b, __c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlx(vector short __a, int __b, vector short *__c)
+{
+  return vec_st(vec_perm(vec_lvrx(__b, __c),
+                         __a,
+                         vec_lvsr(__b, (unsigned char *)__c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlx(vector unsigned short __a, int __b, unsigned short *__c)
+{
+  return vec_st(vec_perm(vec_lvrx(__b, __c),
+                         __a,
+                         vec_lvsr(__b, __c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlx(vector unsigned short __a, int __b, vector unsigned short *__c)
+{
+  return vec_st(vec_perm(vec_lvrx(__b, __c),
+                         __a,
+                         vec_lvsr(__b, (unsigned char *)__c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlx(vector bool short __a, int __b, vector bool short *__c)
+{
+  return vec_st(vec_perm(vec_lvrx(__b, __c),
+                         __a,
+                         vec_lvsr(__b, (unsigned char *)__c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlx(vector pixel __a, int __b, vector pixel *__c)
+{
+  return vec_st(vec_perm(vec_lvrx(__b, __c),
+                         __a,
+                         vec_lvsr(__b, (unsigned char *)__c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlx(vector int __a, int __b, int *__c)
+{
+  return vec_st(vec_perm(vec_lvrx(__b, __c),
+                         __a,
+                         vec_lvsr(__b, __c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlx(vector int __a, int __b, vector int *__c)
+{
+  return vec_st(vec_perm(vec_lvrx(__b, __c),
+                         __a,
+                         vec_lvsr(__b, (unsigned char *)__c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlx(vector unsigned int __a, int __b, unsigned int *__c)
+{
+  return vec_st(vec_perm(vec_lvrx(__b, __c),
+                         __a,
+                         vec_lvsr(__b, __c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlx(vector unsigned int __a, int __b, vector unsigned int *__c)
+{
+  return vec_st(vec_perm(vec_lvrx(__b, __c),
+                         __a,
+                         vec_lvsr(__b, (unsigned char *)__c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlx(vector bool int __a, int __b, vector bool int *__c)
+{
+  return vec_st(vec_perm(vec_lvrx(__b, __c),
+                         __a,
+                         vec_lvsr(__b, (unsigned char *)__c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlx(vector float __a, int __b, vector float *__c)
+{
+  return vec_st(vec_perm(vec_lvrx(__b, __c),
+                         __a,
+                         vec_lvsr(__b, (unsigned char *)__c)),
+                __b, __c);
+}
+
+/* vec_stvlxl */
+
+static void __ATTRS_o_ai
+vec_stvlxl(vector signed char __a, int __b, signed char *__c)
+{
+  return vec_stl(vec_perm(vec_lvrx(__b, __c),
+                          __a,
+                          vec_lvsr(__b, __c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlxl(vector signed char __a, int __b, vector signed char *__c)
+{
+  return vec_stl(vec_perm(vec_lvrx(__b, __c),
+                          __a,
+                          vec_lvsr(__b, (unsigned char *)__c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlxl(vector unsigned char __a, int __b, unsigned char *__c)
+{
+  return vec_stl(vec_perm(vec_lvrx(__b, __c),
+                          __a,
+                          vec_lvsr(__b, __c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlxl(vector unsigned char __a, int __b, vector unsigned char *__c)
+{
+  return vec_stl(vec_perm(vec_lvrx(__b, __c),
+                          __a,
+                          vec_lvsr(__b, (unsigned char *)__c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlxl(vector bool char __a, int __b, vector bool char *__c)
+{
+  return vec_stl(vec_perm(vec_lvrx(__b, __c),
+                          __a,
+                          vec_lvsr(__b, (unsigned char *)__c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlxl(vector short __a, int __b, short *__c)
+{
+  return vec_stl(vec_perm(vec_lvrx(__b, __c),
+                          __a,
+                          vec_lvsr(__b, __c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlxl(vector short __a, int __b, vector short *__c)
+{
+  return vec_stl(vec_perm(vec_lvrx(__b, __c),
+                          __a,
+                          vec_lvsr(__b, (unsigned char *)__c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlxl(vector unsigned short __a, int __b, unsigned short *__c)
+{
+  return vec_stl(vec_perm(vec_lvrx(__b, __c),
+                          __a,
+                          vec_lvsr(__b, __c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlxl(vector unsigned short __a, int __b, vector unsigned short *__c)
+{
+  return vec_stl(vec_perm(vec_lvrx(__b, __c),
+                          __a,
+                          vec_lvsr(__b, (unsigned char *)__c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlxl(vector bool short __a, int __b, vector bool short *__c)
+{
+  return vec_stl(vec_perm(vec_lvrx(__b, __c),
+                          __a,
+                          vec_lvsr(__b, (unsigned char *)__c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlxl(vector pixel __a, int __b, vector pixel *__c)
+{
+  return vec_stl(vec_perm(vec_lvrx(__b, __c),
+                          __a,
+                          vec_lvsr(__b, (unsigned char *)__c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlxl(vector int __a, int __b, int *__c)
+{
+  return vec_stl(vec_perm(vec_lvrx(__b, __c),
+                          __a,
+                          vec_lvsr(__b, __c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlxl(vector int __a, int __b, vector int *__c)
+{
+  return vec_stl(vec_perm(vec_lvrx(__b, __c),
+                          __a,
+                          vec_lvsr(__b, (unsigned char *)__c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlxl(vector unsigned int __a, int __b, unsigned int *__c)
+{
+  return vec_stl(vec_perm(vec_lvrx(__b, __c),
+                          __a,
+                          vec_lvsr(__b, __c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlxl(vector unsigned int __a, int __b, vector unsigned int *__c)
+{
+  return vec_stl(vec_perm(vec_lvrx(__b, __c),
+                          __a,
+                          vec_lvsr(__b, (unsigned char *)__c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlxl(vector bool int __a, int __b, vector bool int *__c)
+{
+  return vec_stl(vec_perm(vec_lvrx(__b, __c),
+                          __a,
+                          vec_lvsr(__b, (unsigned char *)__c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvlxl(vector float __a, int __b, vector float *__c)
+{
+  return vec_stl(vec_perm(vec_lvrx(__b, __c),
+                          __a,
+                          vec_lvsr(__b, (unsigned char *)__c)),
+                 __b, __c);
+}
+
+/* vec_stvrx */
+
+static void __ATTRS_o_ai
+vec_stvrx(vector signed char __a, int __b, signed char *__c)
+{
+  return vec_st(vec_perm(__a,
+                         vec_lvlx(__b, __c),
+                         vec_lvsr(__b, __c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrx(vector signed char __a, int __b, vector signed char *__c)
+{
+  return vec_st(vec_perm(__a,
+                         vec_lvlx(__b, __c),
+                         vec_lvsr(__b, (unsigned char *)__c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrx(vector unsigned char __a, int __b, unsigned char *__c)
+{
+  return vec_st(vec_perm(__a,
+                         vec_lvlx(__b, __c),
+                         vec_lvsr(__b, __c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrx(vector unsigned char __a, int __b, vector unsigned char *__c)
+{
+  return vec_st(vec_perm(__a,
+                         vec_lvlx(__b, __c),
+                         vec_lvsr(__b, (unsigned char *)__c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrx(vector bool char __a, int __b, vector bool char *__c)
+{
+  return vec_st(vec_perm(__a,
+                         vec_lvlx(__b, __c),
+                         vec_lvsr(__b, (unsigned char *)__c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrx(vector short __a, int __b, short *__c)
+{
+  return vec_st(vec_perm(__a,
+                         vec_lvlx(__b, __c),
+                         vec_lvsr(__b, __c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrx(vector short __a, int __b, vector short *__c)
+{
+  return vec_st(vec_perm(__a,
+                         vec_lvlx(__b, __c),
+                         vec_lvsr(__b, (unsigned char *)__c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrx(vector unsigned short __a, int __b, unsigned short *__c)
+{
+  return vec_st(vec_perm(__a,
+                         vec_lvlx(__b, __c),
+                         vec_lvsr(__b, __c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrx(vector unsigned short __a, int __b, vector unsigned short *__c)
+{
+  return vec_st(vec_perm(__a,
+                         vec_lvlx(__b, __c),
+                         vec_lvsr(__b, (unsigned char *)__c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrx(vector bool short __a, int __b, vector bool short *__c)
+{
+  return vec_st(vec_perm(__a,
+                         vec_lvlx(__b, __c),
+                         vec_lvsr(__b, (unsigned char *)__c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrx(vector pixel __a, int __b, vector pixel *__c)
+{
+  return vec_st(vec_perm(__a,
+                         vec_lvlx(__b, __c),
+                         vec_lvsr(__b, (unsigned char *)__c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrx(vector int __a, int __b, int *__c)
+{
+  return vec_st(vec_perm(__a,
+                         vec_lvlx(__b, __c),
+                         vec_lvsr(__b, __c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrx(vector int __a, int __b, vector int *__c)
+{
+  return vec_st(vec_perm(__a,
+                         vec_lvlx(__b, __c),
+                         vec_lvsr(__b, (unsigned char *)__c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrx(vector unsigned int __a, int __b, unsigned int *__c)
+{
+  return vec_st(vec_perm(__a,
+                         vec_lvlx(__b, __c),
+                         vec_lvsr(__b, __c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrx(vector unsigned int __a, int __b, vector unsigned int *__c)
+{
+  return vec_st(vec_perm(__a,
+                         vec_lvlx(__b, __c),
+                         vec_lvsr(__b, (unsigned char *)__c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrx(vector bool int __a, int __b, vector bool int *__c)
+{
+  return vec_st(vec_perm(__a,
+                         vec_lvlx(__b, __c),
+                         vec_lvsr(__b, (unsigned char *)__c)),
+                __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrx(vector float __a, int __b, vector float *__c)
+{
+  return vec_st(vec_perm(__a,
+                         vec_lvlx(__b, __c),
+                         vec_lvsr(__b, (unsigned char *)__c)),
+                __b, __c);
+}
+
+/* vec_stvrxl */
+
+static void __ATTRS_o_ai
+vec_stvrxl(vector signed char __a, int __b, signed char *__c)
+{
+  return vec_stl(vec_perm(__a,
+                          vec_lvlx(__b, __c),
+                          vec_lvsr(__b, __c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrxl(vector signed char __a, int __b, vector signed char *__c)
+{
+  return vec_stl(vec_perm(__a,
+                          vec_lvlx(__b, __c),
+                          vec_lvsr(__b, (unsigned char *)__c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrxl(vector unsigned char __a, int __b, unsigned char *__c)
+{
+  return vec_stl(vec_perm(__a,
+                          vec_lvlx(__b, __c),
+                          vec_lvsr(__b, __c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrxl(vector unsigned char __a, int __b, vector unsigned char *__c)
+{
+  return vec_stl(vec_perm(__a,
+                          vec_lvlx(__b, __c),
+                          vec_lvsr(__b, (unsigned char *)__c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrxl(vector bool char __a, int __b, vector bool char *__c)
+{
+  return vec_stl(vec_perm(__a,
+                          vec_lvlx(__b, __c),
+                          vec_lvsr(__b, (unsigned char *)__c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrxl(vector short __a, int __b, short *__c)
+{
+  return vec_stl(vec_perm(__a,
+                          vec_lvlx(__b, __c),
+                          vec_lvsr(__b, __c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrxl(vector short __a, int __b, vector short *__c)
+{
+  return vec_stl(vec_perm(__a,
+                          vec_lvlx(__b, __c),
+                          vec_lvsr(__b, (unsigned char *)__c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrxl(vector unsigned short __a, int __b, unsigned short *__c)
+{
+  return vec_stl(vec_perm(__a,
+                          vec_lvlx(__b, __c),
+                          vec_lvsr(__b, __c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrxl(vector unsigned short __a, int __b, vector unsigned short *__c)
+{
+  return vec_stl(vec_perm(__a,
+                          vec_lvlx(__b, __c),
+                          vec_lvsr(__b, (unsigned char *)__c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrxl(vector bool short __a, int __b, vector bool short *__c)
+{
+  return vec_stl(vec_perm(__a,
+                          vec_lvlx(__b, __c),
+                          vec_lvsr(__b, (unsigned char *)__c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrxl(vector pixel __a, int __b, vector pixel *__c)
+{
+  return vec_stl(vec_perm(__a,
+                          vec_lvlx(__b, __c),
+                          vec_lvsr(__b, (unsigned char *)__c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrxl(vector int __a, int __b, int *__c)
+{
+  return vec_stl(vec_perm(__a,
+                          vec_lvlx(__b, __c),
+                          vec_lvsr(__b, __c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrxl(vector int __a, int __b, vector int *__c)
+{
+  return vec_stl(vec_perm(__a,
+                          vec_lvlx(__b, __c),
+                          vec_lvsr(__b, (unsigned char *)__c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrxl(vector unsigned int __a, int __b, unsigned int *__c)
+{
+  return vec_stl(vec_perm(__a,
+                          vec_lvlx(__b, __c),
+                          vec_lvsr(__b, __c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrxl(vector unsigned int __a, int __b, vector unsigned int *__c)
+{
+  return vec_stl(vec_perm(__a,
+                          vec_lvlx(__b, __c),
+                          vec_lvsr(__b, (unsigned char *)__c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrxl(vector bool int __a, int __b, vector bool int *__c)
+{
+  return vec_stl(vec_perm(__a,
+                          vec_lvlx(__b, __c),
+                          vec_lvsr(__b, (unsigned char *)__c)),
+                 __b, __c);
+}
+
+static void __ATTRS_o_ai
+vec_stvrxl(vector float __a, int __b, vector float *__c)
+{
+  return vec_stl(vec_perm(__a,
+                          vec_lvlx(__b, __c),
+                          vec_lvsr(__b, (unsigned char *)__c)),
+                 __b, __c);
+}
+
+/* vec_promote */
+
+static vector signed char __ATTRS_o_ai
+vec_promote(signed char __a, int __b)
+{
+  vector signed char __res = (vector signed char)(0);
+  __res[__b] = __a;
+  return __res;
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_promote(unsigned char __a, int __b)
+{
+  vector unsigned char __res = (vector unsigned char)(0);
+  __res[__b] = __a;
+  return __res;
+}
+
+static vector short __ATTRS_o_ai
+vec_promote(short __a, int __b)
+{
+  vector short __res = (vector short)(0);
+  __res[__b] = __a;
+  return __res;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_promote(unsigned short __a, int __b)
+{
+  vector unsigned short __res = (vector unsigned short)(0);
+  __res[__b] = __a;
+  return __res;
+}
+
+static vector int __ATTRS_o_ai
+vec_promote(int __a, int __b)
+{
+  vector int __res = (vector int)(0);
+  __res[__b] = __a;
+  return __res;
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_promote(unsigned int __a, int __b)
+{
+  vector unsigned int __res = (vector unsigned int)(0);
+  __res[__b] = __a;
+  return __res;
+}
+
+static vector float __ATTRS_o_ai
+vec_promote(float __a, int __b)
+{
+  vector float __res = (vector float)(0);
+  __res[__b] = __a;
+  return __res;
+}
+
+/* vec_splats */
+
+static vector signed char __ATTRS_o_ai
+vec_splats(signed char __a)
+{
+  return (vector signed char)(__a);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_splats(unsigned char __a)
+{
+  return (vector unsigned char)(__a);
+}
+
+static vector short __ATTRS_o_ai
+vec_splats(short __a)
+{
+  return (vector short)(__a);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_splats(unsigned short __a)
+{
+  return (vector unsigned short)(__a);
+}
+
+static vector int __ATTRS_o_ai
+vec_splats(int __a)
+{
+  return (vector int)(__a);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_splats(unsigned int __a)
+{
+  return (vector unsigned int)(__a);
+}
+
+static vector float __ATTRS_o_ai
+vec_splats(float __a)
+{
+  return (vector float)(__a);
+}
+
+/* ----------------------------- predicates --------------------------------- */
+
+/* vec_all_eq */
+
+static int __ATTRS_o_ai
+vec_all_eq(vector signed char __a, vector signed char __b)
+{
+  return __builtin_altivec_vcmpequb_p(__CR6_LT, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector signed char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpequb_p(__CR6_LT, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector unsigned char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vcmpequb_p(__CR6_LT, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector unsigned char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpequb_p(__CR6_LT, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector bool char __a, vector signed char __b)
+{
+  return __builtin_altivec_vcmpequb_p(__CR6_LT, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector bool char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vcmpequb_p(__CR6_LT, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector bool char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpequb_p(__CR6_LT, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector short __a, vector short __b)
+{
+  return __builtin_altivec_vcmpequh_p(__CR6_LT, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpequh_p(__CR6_LT, __a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector unsigned short __a, vector unsigned short __b)
+{
+  return
+    __builtin_altivec_vcmpequh_p(__CR6_LT, (vector short)__a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector unsigned short __a, vector bool short __b)
+{
+  return
+    __builtin_altivec_vcmpequh_p(__CR6_LT, (vector short)__a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector bool short __a, vector short __b)
+{
+  return
+    __builtin_altivec_vcmpequh_p(__CR6_LT, (vector short)__a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector bool short __a, vector unsigned short __b)
+{
+  return
+    __builtin_altivec_vcmpequh_p(__CR6_LT, (vector short)__a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector bool short __a, vector bool short __b)
+{
+  return
+    __builtin_altivec_vcmpequh_p(__CR6_LT, (vector short)__a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector pixel __a, vector pixel __b)
+{
+  return
+    __builtin_altivec_vcmpequh_p(__CR6_LT, (vector short)__a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector int __a, vector int __b)
+{
+  return __builtin_altivec_vcmpequw_p(__CR6_LT, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpequw_p(__CR6_LT, __a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vcmpequw_p(__CR6_LT, (vector int)__a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector unsigned int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpequw_p(__CR6_LT, (vector int)__a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector bool int __a, vector int __b)
+{
+  return __builtin_altivec_vcmpequw_p(__CR6_LT, (vector int)__a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector bool int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vcmpequw_p(__CR6_LT, (vector int)__a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector bool int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpequw_p(__CR6_LT, (vector int)__a, (vector int)__b);
+}
+
+#ifdef __POWER8_VECTOR__
+static int __ATTRS_o_ai
+vec_all_eq(vector signed long long __a, vector signed long long __b)
+{
+  return __builtin_altivec_vcmpequd_p(__CR6_LT, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector long long __a, vector bool long long __b)
+{
+  return __builtin_altivec_vcmpequd_p(__CR6_LT, __a, (vector long long)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector unsigned long long __a, vector unsigned long long __b)
+{
+  return __builtin_altivec_vcmpequd_p(__CR6_LT, (vector long long)__a, 
+                                      (vector long long)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector unsigned long long __a, vector bool long long __b)
+{
+  return __builtin_altivec_vcmpequd_p(__CR6_LT, (vector long long)__a, 
+                                      (vector long long)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector bool long long __a, vector long long __b)
+{
+  return __builtin_altivec_vcmpequd_p(__CR6_LT, (vector long long)__a, 
+                                      (vector long long)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector bool long long __a, vector unsigned long long __b)
+{
+  return __builtin_altivec_vcmpequd_p(__CR6_LT, (vector long long)__a, 
+                                      (vector long long)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_eq(vector bool long long __a, vector bool long long __b)
+{
+  return __builtin_altivec_vcmpequd_p(__CR6_LT, (vector long long)__a, 
+                                      (vector long long)__b);
+}
+#endif
+
+static int __ATTRS_o_ai
+vec_all_eq(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpeqfp_p(__CR6_LT, __a, __b);
+}
+
+/* vec_all_ge */
+
+static int __ATTRS_o_ai
+vec_all_ge(vector signed char __a, vector signed char __b)
+{
+  return __builtin_altivec_vcmpgtsb_p(__CR6_EQ, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector signed char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpgtsb_p(__CR6_EQ, (vector signed char)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector unsigned char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector unsigned char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ, (vector unsigned char)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector bool char __a, vector signed char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ,
+                                      (vector unsigned char)__b,
+                                      (vector unsigned char)__a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector bool char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ, __b, (vector unsigned char)__a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector bool char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ,
+                                      (vector unsigned char)__b,
+                                      (vector unsigned char)__a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector short __a, vector short __b)
+{
+  return __builtin_altivec_vcmpgtsh_p(__CR6_EQ, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpgtsh_p(__CR6_EQ, (vector short)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector unsigned short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector unsigned short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ, (vector unsigned short)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector bool short __a, vector short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ,
+                                      (vector unsigned short)__b,
+                                      (vector unsigned short)__a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector bool short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ, __b, (vector unsigned short)__a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector bool short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ,
+                                      (vector unsigned short)__b,
+                                      (vector unsigned short)__a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector int __a, vector int __b)
+{
+  return __builtin_altivec_vcmpgtsw_p(__CR6_EQ, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtsw_p(__CR6_EQ, (vector int)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector unsigned int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ, (vector unsigned int)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector bool int __a, vector int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ,
+                                      (vector unsigned int)__b,
+                                      (vector unsigned int)__a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector bool int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ, __b, (vector unsigned int)__a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector bool int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ,
+                                      (vector unsigned int)__b,
+                                      (vector unsigned int)__a);
+}
+
+#ifdef __POWER8_VECTOR__
+static int __ATTRS_o_ai
+vec_all_ge(vector signed long long __a, vector signed long long __b)
+{
+  return __builtin_altivec_vcmpgtsd_p(__CR6_EQ, __b, __a);
+}
+static int __ATTRS_o_ai
+vec_all_ge(vector signed long long __a, vector bool long long __b)
+{
+  return __builtin_altivec_vcmpgtsd_p(__CR6_EQ, (vector signed long long)__b,
+                                      __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector unsigned long long __a, vector unsigned long long __b)
+{
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector unsigned long long __a, vector bool long long __b)
+{
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ, (vector unsigned long long)__b,
+                                      __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector bool long long __a, vector signed long long __b)
+{
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ,
+                                      (vector unsigned long long)__b,
+                                      (vector unsigned long long)__a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector bool long long __a, vector unsigned long long __b)
+{
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ, __b,
+                                      (vector unsigned long long)__a);
+}
+
+static int __ATTRS_o_ai
+vec_all_ge(vector bool long long __a, vector bool long long __b)
+{
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ,
+                                      (vector unsigned long long)__b,
+                                      (vector unsigned long long)__a);
+}
+#endif
+
+static int __ATTRS_o_ai
+vec_all_ge(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpgefp_p(__CR6_LT, __a, __b);
+}
+
+/* vec_all_gt */
+
+static int __ATTRS_o_ai
+vec_all_gt(vector signed char __a, vector signed char __b)
+{
+  return __builtin_altivec_vcmpgtsb_p(__CR6_LT, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector signed char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpgtsb_p(__CR6_LT, __a, (vector signed char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector unsigned char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector unsigned char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT, __a, (vector unsigned char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector bool char __a, vector signed char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT,
+                                      (vector unsigned char)__a,
+                                      (vector unsigned char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector bool char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT, (vector unsigned char)__a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector bool char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT,
+                                      (vector unsigned char)__a,
+                                      (vector unsigned char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector short __a, vector short __b)
+{
+  return __builtin_altivec_vcmpgtsh_p(__CR6_LT, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpgtsh_p(__CR6_LT, __a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector unsigned short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector unsigned short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT, __a, (vector unsigned short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector bool short __a, vector short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT,
+                                      (vector unsigned short)__a,
+                                      (vector unsigned short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector bool short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT, (vector unsigned short)__a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector bool short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT,
+                                      (vector unsigned short)__a,
+                                      (vector unsigned short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector int __a, vector int __b)
+{
+  return __builtin_altivec_vcmpgtsw_p(__CR6_LT, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtsw_p(__CR6_LT, __a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector unsigned int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT, __a, (vector unsigned int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector bool int __a, vector int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT,
+                                      (vector unsigned int)__a,
+                                      (vector unsigned int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector bool int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT, (vector unsigned int)__a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector bool int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT,
+                                      (vector unsigned int)__a,
+                                      (vector unsigned int)__b);
+}
+
+#ifdef __POWER8_VECTOR__
+static int __ATTRS_o_ai
+vec_all_gt(vector signed long long __a, vector signed long long __b)
+{
+  return __builtin_altivec_vcmpgtsd_p(__CR6_LT, __a, __b);
+}
+static int __ATTRS_o_ai
+vec_all_gt(vector signed long long __a, vector bool long long __b)
+{
+  return __builtin_altivec_vcmpgtsd_p(__CR6_LT, __a,
+                                      (vector signed long long)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector unsigned long long __a, vector unsigned long long __b)
+{
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector unsigned long long __a, vector bool long long __b)
+{
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT, __a, 
+                                      (vector unsigned long long)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector bool long long __a, vector signed long long __b)
+{
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT,
+                                      (vector unsigned long long)__a,
+                                      (vector unsigned long long)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector bool long long __a, vector unsigned long long __b)
+{
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT, (vector unsigned long long)__a,
+                                      __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_gt(vector bool long long __a, vector bool long long __b)
+{
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT,
+                                      (vector unsigned long long)__a,
+                                      (vector unsigned long long)__b);
+}
+#endif
+
+static int __ATTRS_o_ai
+vec_all_gt(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpgtfp_p(__CR6_LT, __a, __b);
+}
+
+/* vec_all_in */
+
+static int __attribute__((__always_inline__))
+vec_all_in(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpbfp_p(__CR6_EQ, __a, __b);
+}
+
+/* vec_all_le */
+
+static int __ATTRS_o_ai
+vec_all_le(vector signed char __a, vector signed char __b)
+{
+  return __builtin_altivec_vcmpgtsb_p(__CR6_EQ, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector signed char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpgtsb_p(__CR6_EQ, __a, (vector signed char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector unsigned char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector unsigned char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ, __a, (vector unsigned char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector bool char __a, vector signed char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ,
+                                      (vector unsigned char)__a,
+                                      (vector unsigned char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector bool char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ, (vector unsigned char)__a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector bool char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ,
+                                      (vector unsigned char)__a,
+                                      (vector unsigned char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector short __a, vector short __b)
+{
+  return __builtin_altivec_vcmpgtsh_p(__CR6_EQ, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpgtsh_p(__CR6_EQ, __a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector unsigned short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector unsigned short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ, __a, (vector unsigned short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector bool short __a, vector short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ,
+                                      (vector unsigned short)__a,
+                                      (vector unsigned short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector bool short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ, (vector unsigned short)__a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector bool short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ,
+                                      (vector unsigned short)__a,
+                                      (vector unsigned short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector int __a, vector int __b)
+{
+  return __builtin_altivec_vcmpgtsw_p(__CR6_EQ, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtsw_p(__CR6_EQ, __a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector unsigned int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ, __a, (vector unsigned int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector bool int __a, vector int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ,
+                                      (vector unsigned int)__a,
+                                      (vector unsigned int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector bool int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ, (vector unsigned int)__a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector bool int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ,
+                                      (vector unsigned int)__a,
+                                      (vector unsigned int)__b);
+}
+
+#ifdef __POWER8_VECTOR__
+static int __ATTRS_o_ai
+vec_all_le(vector signed long long __a, vector signed long long __b)
+{
+  return __builtin_altivec_vcmpgtsd_p(__CR6_EQ, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector unsigned long long __a, vector unsigned long long __b)
+{
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector signed long long __a, vector bool long long __b)
+{
+  return __builtin_altivec_vcmpgtsd_p(__CR6_EQ, __a,
+                                      (vector signed long long)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector unsigned long long __a, vector bool long long __b)
+{
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ, __a, 
+                                      (vector unsigned long long)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector bool long long __a, vector signed long long __b)
+{
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ,
+                                      (vector unsigned long long)__a,
+                                      (vector unsigned long long)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector bool long long __a, vector unsigned long long __b)
+{
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ, (vector unsigned long long)__a,
+                                      __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_le(vector bool long long __a, vector bool long long __b)
+{
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ,
+                                      (vector unsigned long long)__a,
+                                      (vector unsigned long long)__b);
+}
+#endif
+
+static int __ATTRS_o_ai
+vec_all_le(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpgefp_p(__CR6_LT, __b, __a);
+}
+
+/* vec_all_lt */
+
+static int __ATTRS_o_ai
+vec_all_lt(vector signed char __a, vector signed char __b)
+{
+  return __builtin_altivec_vcmpgtsb_p(__CR6_LT, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector signed char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpgtsb_p(__CR6_LT, (vector signed char)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector unsigned char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector unsigned char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT, (vector unsigned char)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector bool char __a, vector signed char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT,
+                                      (vector unsigned char)__b,
+                                      (vector unsigned char)__a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector bool char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT, __b, (vector unsigned char)__a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector bool char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT,
+                                      (vector unsigned char)__b,
+                                      (vector unsigned char)__a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector short __a, vector short __b)
+{
+  return __builtin_altivec_vcmpgtsh_p(__CR6_LT, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpgtsh_p(__CR6_LT, (vector short)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector unsigned short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector unsigned short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT, (vector unsigned short)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector bool short __a, vector short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT,
+                                      (vector unsigned short)__b,
+                                      (vector unsigned short)__a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector bool short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT, __b, (vector unsigned short)__a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector bool short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT,
+                                      (vector unsigned short)__b,
+                                      (vector unsigned short)__a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector int __a, vector int __b)
+{
+  return __builtin_altivec_vcmpgtsw_p(__CR6_LT, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtsw_p(__CR6_LT, (vector int)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector unsigned int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT, (vector unsigned int)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector bool int __a, vector int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT,
+                                      (vector unsigned int)__b,
+                                      (vector unsigned int)__a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector bool int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT, __b, (vector unsigned int)__a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector bool int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT,
+                                      (vector unsigned int)__b,
+                                      (vector unsigned int)__a);
+}
+
+#ifdef __POWER8_VECTOR__
+static int __ATTRS_o_ai
+vec_all_lt(vector signed long long __a, vector signed long long __b)
+{
+  return __builtin_altivec_vcmpgtsd_p(__CR6_LT, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector unsigned long long __a, vector unsigned long long __b)
+{
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector signed long long __a, vector bool long long __b)
+{
+  return __builtin_altivec_vcmpgtsd_p(__CR6_LT, (vector signed long long)__b,
+                                      __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector unsigned long long __a, vector bool long long __b)
+{
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT, (vector unsigned long long)__b,
+                                      __a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector bool long long __a, vector signed long long __b)
+{
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT,
+                                      (vector unsigned long long)__b,
+                                      (vector unsigned long long)__a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector bool long long __a, vector unsigned long long __b)
+{
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT, __b,
+                                      (vector unsigned long long)__a);
+}
+
+static int __ATTRS_o_ai
+vec_all_lt(vector bool long long __a, vector bool long long __b)
+{
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT,
+                                      (vector unsigned long long)__b,
+                                      (vector unsigned long long)__a);
+}
+#endif
+
+static int __ATTRS_o_ai
+vec_all_lt(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpgtfp_p(__CR6_LT, __b, __a);
+}
+
+/* vec_all_nan */
+
+static int __attribute__((__always_inline__))
+vec_all_nan(vector float __a)
+{
+  return __builtin_altivec_vcmpeqfp_p(__CR6_EQ, __a, __a);
+}
+
+/* vec_all_ne */
+
+static int __ATTRS_o_ai
+vec_all_ne(vector signed char __a, vector signed char __b)
+{
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector signed char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector unsigned char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector unsigned char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector bool char __a, vector signed char __b)
+{
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector bool char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector bool char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector short __a, vector short __b)
+{
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ, __a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector unsigned short __a, vector unsigned short __b)
+{
+  return
+    __builtin_altivec_vcmpequh_p(__CR6_EQ, (vector short)__a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector unsigned short __a, vector bool short __b)
+{
+  return
+    __builtin_altivec_vcmpequh_p(__CR6_EQ, (vector short)__a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector bool short __a, vector short __b)
+{
+  return
+    __builtin_altivec_vcmpequh_p(__CR6_EQ, (vector short)__a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector bool short __a, vector unsigned short __b)
+{
+  return
+    __builtin_altivec_vcmpequh_p(__CR6_EQ, (vector short)__a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector bool short __a, vector bool short __b)
+{
+  return
+    __builtin_altivec_vcmpequh_p(__CR6_EQ, (vector short)__a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector pixel __a, vector pixel __b)
+{
+  return
+    __builtin_altivec_vcmpequh_p(__CR6_EQ, (vector short)__a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector int __a, vector int __b)
+{
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ, __a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ, (vector int)__a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector unsigned int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ, (vector int)__a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector bool int __a, vector int __b)
+{
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ, (vector int)__a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector bool int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ, (vector int)__a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector bool int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ, (vector int)__a, (vector int)__b);
+}
+
+#ifdef __POWER8_VECTOR__
+static int __ATTRS_o_ai
+vec_all_ne(vector signed long long __a, vector signed long long __b)
+{
+  return __builtin_altivec_vcmpequd_p(__CR6_EQ, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector unsigned long long __a, vector unsigned long long __b)
+{
+  return __builtin_altivec_vcmpequd_p(__CR6_EQ, (vector long long)__a, 
+                                      (vector long long)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector signed long long __a, vector bool long long __b)
+{
+  return __builtin_altivec_vcmpequd_p(__CR6_EQ, __a,
+                                      (vector signed long long)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector unsigned long long __a, vector bool long long __b)
+{
+  return __builtin_altivec_vcmpequd_p(__CR6_EQ, (vector signed long long)__a, 
+                                      (vector signed long long)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector bool long long __a, vector signed long long __b)
+{
+  return __builtin_altivec_vcmpequd_p(__CR6_EQ, (vector signed long long)__a,
+                                      (vector signed long long)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector bool long long __a, vector unsigned long long __b)
+{
+  return __builtin_altivec_vcmpequd_p(__CR6_EQ, (vector signed long long)__a, 
+                                      (vector signed long long)__b);
+}
+
+static int __ATTRS_o_ai
+vec_all_ne(vector bool long long __a, vector bool long long __b)
+{
+  return __builtin_altivec_vcmpequd_p(__CR6_EQ, (vector signed long long)__a, 
+                                      (vector signed long long)__b);
+}
+#endif
+
+static int __ATTRS_o_ai
+vec_all_ne(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpeqfp_p(__CR6_EQ, __a, __b);
+}
+
+/* vec_all_nge */
+
+static int __attribute__((__always_inline__))
+vec_all_nge(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpgefp_p(__CR6_EQ, __a, __b);
+}
+
+/* vec_all_ngt */
+
+static int __attribute__((__always_inline__))
+vec_all_ngt(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpgtfp_p(__CR6_EQ, __a, __b);
+}
+
+/* vec_all_nle */
+
+static int __attribute__((__always_inline__))
+vec_all_nle(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpgefp_p(__CR6_EQ, __b, __a);
+}
+
+/* vec_all_nlt */
+
+static int __attribute__((__always_inline__))
+vec_all_nlt(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpgtfp_p(__CR6_EQ, __b, __a);
+}
+
+/* vec_all_numeric */
+
+static int __attribute__((__always_inline__))
+vec_all_numeric(vector float __a)
+{
+  return __builtin_altivec_vcmpeqfp_p(__CR6_LT, __a, __a);
+}
+
+/* vec_any_eq */
+
+static int __ATTRS_o_ai
+vec_any_eq(vector signed char __a, vector signed char __b)
+{
+  return
+    __builtin_altivec_vcmpequb_p(__CR6_EQ_REV, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector signed char __a, vector bool char __b)
+{
+  return
+    __builtin_altivec_vcmpequb_p(__CR6_EQ_REV, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector unsigned char __a, vector unsigned char __b)
+{
+  return
+    __builtin_altivec_vcmpequb_p(__CR6_EQ_REV, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector unsigned char __a, vector bool char __b)
+{
+  return
+    __builtin_altivec_vcmpequb_p(__CR6_EQ_REV, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector bool char __a, vector signed char __b)
+{
+  return
+    __builtin_altivec_vcmpequb_p(__CR6_EQ_REV, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector bool char __a, vector unsigned char __b)
+{
+  return
+    __builtin_altivec_vcmpequb_p(__CR6_EQ_REV, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector bool char __a, vector bool char __b)
+{
+  return
+    __builtin_altivec_vcmpequb_p(__CR6_EQ_REV, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector short __a, vector short __b)
+{
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ_REV, __a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector unsigned short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ_REV, 
+                                      (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector unsigned short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ_REV, 
+                                      (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector bool short __a, vector short __b)
+{
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ_REV,
+                                      (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector bool short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ_REV,
+                                      (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector bool short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ_REV,
+                                      (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector pixel __a, vector pixel __b)
+{
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ_REV, 
+                                      (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector int __a, vector int __b)
+{
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ_REV, __a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector unsigned int __a, vector unsigned int __b)
+{
+  return
+    __builtin_altivec_vcmpequw_p(__CR6_EQ_REV, (vector int)__a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector unsigned int __a, vector bool int __b)
+{
+  return
+    __builtin_altivec_vcmpequw_p(__CR6_EQ_REV, (vector int)__a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector bool int __a, vector int __b)
+{
+  return
+    __builtin_altivec_vcmpequw_p(__CR6_EQ_REV, (vector int)__a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector bool int __a, vector unsigned int __b)
+{
+  return
+    __builtin_altivec_vcmpequw_p(__CR6_EQ_REV, (vector int)__a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector bool int __a, vector bool int __b)
+{
+  return
+    __builtin_altivec_vcmpequw_p(__CR6_EQ_REV, (vector int)__a, (vector int)__b);
+}
+
+#ifdef __POWER8_VECTOR__
+static int __ATTRS_o_ai
+vec_any_eq(vector signed long long __a, vector signed long long __b)
+{
+  return __builtin_altivec_vcmpequd_p(__CR6_EQ_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector unsigned long long __a, vector unsigned long long __b)
+{
+  return
+    __builtin_altivec_vcmpequd_p(__CR6_EQ_REV, (vector long long)__a, 
+                                 (vector long long)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector signed long long __a, vector bool long long __b)
+{
+  return __builtin_altivec_vcmpequd_p(__CR6_EQ_REV, __a,
+                                      (vector signed long long)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector unsigned long long __a, vector bool long long __b)
+{
+  return
+    __builtin_altivec_vcmpequd_p(__CR6_EQ_REV, (vector signed long long)__a, 
+                                 (vector signed long long)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector bool long long __a, vector signed long long __b)
+{
+  return
+    __builtin_altivec_vcmpequd_p(__CR6_EQ_REV, (vector signed long long)__a, 
+                                 (vector signed long long)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector bool long long __a, vector unsigned long long __b)
+{
+  return
+    __builtin_altivec_vcmpequd_p(__CR6_EQ_REV, (vector signed long long)__a, 
+                                 (vector signed long long)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_eq(vector bool long long __a, vector bool long long __b)
+{
+  return
+    __builtin_altivec_vcmpequd_p(__CR6_EQ_REV, (vector signed long long)__a, 
+                                 (vector signed long long)__b);
+}
+#endif
+
+static int __ATTRS_o_ai
+vec_any_eq(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpeqfp_p(__CR6_EQ_REV, __a, __b);
+}
+
+/* vec_any_ge */
+
+static int __ATTRS_o_ai
+vec_any_ge(vector signed char __a, vector signed char __b)
+{
+  return __builtin_altivec_vcmpgtsb_p(__CR6_LT_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector signed char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpgtsb_p(__CR6_LT_REV, (vector signed char)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector unsigned char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector unsigned char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV, (vector unsigned char)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector bool char __a, vector signed char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV,
+                                      (vector unsigned char)__b,
+                                      (vector unsigned char)__a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector bool char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV, __b, (vector unsigned char)__a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector bool char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV,
+                                      (vector unsigned char)__b,
+                                      (vector unsigned char)__a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector short __a, vector short __b)
+{
+  return __builtin_altivec_vcmpgtsh_p(__CR6_LT_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpgtsh_p(__CR6_LT_REV, (vector short)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector unsigned short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector unsigned short __a, vector bool short __b)
+{
+  return
+    __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV, (vector unsigned short)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector bool short __a, vector short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV,
+                                      (vector unsigned short)__b,
+                                      (vector unsigned short)__a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector bool short __a, vector unsigned short __b)
+{
+  return 
+    __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV, __b, (vector unsigned short)__a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector bool short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV,
+                                      (vector unsigned short)__b,
+                                      (vector unsigned short)__a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector int __a, vector int __b)
+{
+  return __builtin_altivec_vcmpgtsw_p(__CR6_LT_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtsw_p(__CR6_LT_REV, (vector int)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector unsigned int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV, (vector unsigned int)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector bool int __a, vector int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV,
+                                      (vector unsigned int)__b,
+                                      (vector unsigned int)__a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector bool int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV, __b, (vector unsigned int)__a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector bool int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV,
+                                      (vector unsigned int)__b,
+                                      (vector unsigned int)__a);
+}
+
+#ifdef __POWER8_VECTOR__
+static int __ATTRS_o_ai
+vec_any_ge(vector signed long long __a, vector signed long long __b)
+{
+  return __builtin_altivec_vcmpgtsd_p(__CR6_LT_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector unsigned long long __a, vector unsigned long long __b)
+{
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector signed long long __a, vector bool long long __b)
+{
+  return __builtin_altivec_vcmpgtsd_p(__CR6_LT_REV,
+                                      (vector signed long long)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector unsigned long long __a, vector bool long long __b)
+{
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV, 
+                                      (vector unsigned long long)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector bool long long __a, vector signed long long __b)
+{
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV,
+                                      (vector unsigned long long)__b,
+                                      (vector unsigned long long)__a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector bool long long __a, vector unsigned long long __b)
+{
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV, __b,
+                                      (vector unsigned long long)__a);
+}
+
+static int __ATTRS_o_ai
+vec_any_ge(vector bool long long __a, vector bool long long __b)
+{
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV,
+                                      (vector unsigned long long)__b,
+                                      (vector unsigned long long)__a);
+}
+#endif
+
+static int __ATTRS_o_ai
+vec_any_ge(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpgefp_p(__CR6_EQ_REV, __a, __b);
+}
+
+/* vec_any_gt */
+
+static int __ATTRS_o_ai
+vec_any_gt(vector signed char __a, vector signed char __b)
+{
+  return __builtin_altivec_vcmpgtsb_p(__CR6_EQ_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector signed char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpgtsb_p(__CR6_EQ_REV, __a, (vector signed char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector unsigned char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector unsigned char __a, vector bool char __b)
+{
+  return 
+    __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV, __a, (vector unsigned char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector bool char __a, vector signed char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV,
+                                      (vector unsigned char)__a,
+                                      (vector unsigned char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector bool char __a, vector unsigned char __b)
+{
+  return 
+    __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV, (vector unsigned char)__a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector bool char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV,
+                                      (vector unsigned char)__a,
+                                      (vector unsigned char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector short __a, vector short __b)
+{
+  return __builtin_altivec_vcmpgtsh_p(__CR6_EQ_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpgtsh_p(__CR6_EQ_REV, __a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector unsigned short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector unsigned short __a, vector bool short __b)
+{
+  return 
+    __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV, __a, (vector unsigned short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector bool short __a, vector short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV,
+                                      (vector unsigned short)__a,
+                                      (vector unsigned short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector bool short __a, vector unsigned short __b)
+{
+  return
+    __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV, (vector unsigned short)__a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector bool short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV,
+                                      (vector unsigned short)__a,
+                                      (vector unsigned short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector int __a, vector int __b)
+{
+  return __builtin_altivec_vcmpgtsw_p(__CR6_EQ_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtsw_p(__CR6_EQ_REV, __a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector unsigned int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV, __a, (vector unsigned int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector bool int __a, vector int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV,
+                                      (vector unsigned int)__a,
+                                      (vector unsigned int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector bool int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV, (vector unsigned int)__a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector bool int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV,
+                                      (vector unsigned int)__a,
+                                      (vector unsigned int)__b);
+}
+
+#ifdef __POWER8_VECTOR__
+static int __ATTRS_o_ai
+vec_any_gt(vector signed long long __a, vector signed long long __b)
+{
+  return __builtin_altivec_vcmpgtsd_p(__CR6_EQ_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector unsigned long long __a, vector unsigned long long __b)
+{
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector signed long long __a, vector bool long long __b)
+{
+  return __builtin_altivec_vcmpgtsd_p(__CR6_EQ_REV, __a,
+                                      (vector signed long long)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector unsigned long long __a, vector bool long long __b)
+{
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV, __a, 
+                                      (vector unsigned long long)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector bool long long __a, vector signed long long __b)
+{
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV,
+                                      (vector unsigned long long)__a,
+                                      (vector unsigned long long)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector bool long long __a, vector unsigned long long __b)
+{
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV, 
+                                      (vector unsigned long long)__a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_gt(vector bool long long __a, vector bool long long __b)
+{
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV,
+                                      (vector unsigned long long)__a,
+                                      (vector unsigned long long)__b);
+}
+#endif
+
+static int __ATTRS_o_ai
+vec_any_gt(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpgtfp_p(__CR6_EQ_REV, __a, __b);
+}
+
+/* vec_any_le */
+
+static int __ATTRS_o_ai
+vec_any_le(vector signed char __a, vector signed char __b)
+{
+  return __builtin_altivec_vcmpgtsb_p(__CR6_LT_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector signed char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpgtsb_p(__CR6_LT_REV, __a, (vector signed char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector unsigned char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector unsigned char __a, vector bool char __b)
+{
+  return 
+    __builtin_altivec_vcmpgtub_p(__CR6_LT_REV, __a, (vector unsigned char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector bool char __a, vector signed char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV,
+                                      (vector unsigned char)__a,
+                                      (vector unsigned char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector bool char __a, vector unsigned char __b)
+{
+  return 
+    __builtin_altivec_vcmpgtub_p(__CR6_LT_REV, (vector unsigned char)__a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector bool char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV,
+                                      (vector unsigned char)__a,
+                                      (vector unsigned char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector short __a, vector short __b)
+{
+  return __builtin_altivec_vcmpgtsh_p(__CR6_LT_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpgtsh_p(__CR6_LT_REV, __a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector unsigned short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector unsigned short __a, vector bool short __b)
+{
+  return 
+    __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV, __a, (vector unsigned short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector bool short __a, vector short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV,
+                                      (vector unsigned short)__a,
+                                      (vector unsigned short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector bool short __a, vector unsigned short __b)
+{
+  return 
+    __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV, (vector unsigned short)__a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector bool short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV,
+                                      (vector unsigned short)__a,
+                                      (vector unsigned short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector int __a, vector int __b)
+{
+  return __builtin_altivec_vcmpgtsw_p(__CR6_LT_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtsw_p(__CR6_LT_REV, __a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector unsigned int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV, __a, (vector unsigned int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector bool int __a, vector int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV,
+                                      (vector unsigned int)__a,
+                                      (vector unsigned int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector bool int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV, (vector unsigned int)__a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector bool int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV,
+                                      (vector unsigned int)__a,
+                                      (vector unsigned int)__b);
+}
+
+#ifdef __POWER8_VECTOR__
+static int __ATTRS_o_ai
+vec_any_le(vector signed long long __a, vector signed long long __b)
+{
+  return __builtin_altivec_vcmpgtsd_p(__CR6_LT_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector unsigned long long __a, vector unsigned long long __b)
+{
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector signed long long __a, vector bool long long __b)
+{
+  return __builtin_altivec_vcmpgtsd_p(__CR6_LT_REV, __a,
+                                      (vector signed long long)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector unsigned long long __a, vector bool long long __b)
+{
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV, __a, 
+                                      (vector unsigned long long)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector bool long long __a, vector signed long long __b)
+{
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV,
+                                      (vector unsigned long long)__a,
+                                      (vector unsigned long long)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector bool long long __a, vector unsigned long long __b)
+{
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV, 
+                                      (vector unsigned long long)__a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_le(vector bool long long __a, vector bool long long __b)
+{
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV,
+                                      (vector unsigned long long)__a,
+                                      (vector unsigned long long)__b);
+}
+#endif
+
+static int __ATTRS_o_ai
+vec_any_le(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpgefp_p(__CR6_EQ_REV, __b, __a);
+}
+
+/* vec_any_lt */
+
+static int __ATTRS_o_ai
+vec_any_lt(vector signed char __a, vector signed char __b)
+{
+  return __builtin_altivec_vcmpgtsb_p(__CR6_EQ_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector signed char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpgtsb_p(__CR6_EQ_REV, (vector signed char)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector unsigned char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector unsigned char __a, vector bool char __b)
+{
+  return 
+    __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV, (vector unsigned char)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector bool char __a, vector signed char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV,
+                                      (vector unsigned char)__b,
+                                      (vector unsigned char)__a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector bool char __a, vector unsigned char __b)
+{
+  return 
+    __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV, __b, (vector unsigned char)__a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector bool char __a, vector bool char __b)
+{
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV,
+                                      (vector unsigned char)__b,
+                                      (vector unsigned char)__a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector short __a, vector short __b)
+{
+  return __builtin_altivec_vcmpgtsh_p(__CR6_EQ_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpgtsh_p(__CR6_EQ_REV, (vector short)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector unsigned short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector unsigned short __a, vector bool short __b)
+{
+  return 
+    __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV, (vector unsigned short)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector bool short __a, vector short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV,
+                                      (vector unsigned short)__b,
+                                      (vector unsigned short)__a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector bool short __a, vector unsigned short __b)
+{
+  return 
+    __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV, __b, (vector unsigned short)__a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector bool short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV,
+                                      (vector unsigned short)__b,
+                                      (vector unsigned short)__a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector int __a, vector int __b)
+{
+  return __builtin_altivec_vcmpgtsw_p(__CR6_EQ_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtsw_p(__CR6_EQ_REV, (vector int)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector unsigned int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector unsigned int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV, (vector unsigned int)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector bool int __a, vector int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV,
+                                      (vector unsigned int)__b,
+                                      (vector unsigned int)__a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector bool int __a, vector unsigned int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV, __b, (vector unsigned int)__a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector bool int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV,
+                                      (vector unsigned int)__b,
+                                      (vector unsigned int)__a);
+}
+
+#ifdef __POWER8_VECTOR__
+static int __ATTRS_o_ai
+vec_any_lt(vector signed long long __a, vector signed long long __b)
+{
+  return __builtin_altivec_vcmpgtsd_p(__CR6_EQ_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector unsigned long long __a, vector unsigned long long __b)
+{
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector signed long long __a, vector bool long long __b)
+{
+  return __builtin_altivec_vcmpgtsd_p(__CR6_EQ_REV,
+                                      (vector signed long long)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector unsigned long long __a, vector bool long long __b)
+{
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV, 
+                                      (vector unsigned long long)__b, __a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector bool long long __a, vector signed long long __b)
+{
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV,
+                                      (vector unsigned long long)__b,
+                                      (vector unsigned long long)__a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector bool long long __a, vector unsigned long long __b)
+{
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV, __b, 
+                                      (vector unsigned long long)__a);
+}
+
+static int __ATTRS_o_ai
+vec_any_lt(vector bool long long __a, vector bool long long __b)
+{
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV,
+                                      (vector unsigned long long)__b,
+                                      (vector unsigned long long)__a);
+}
+#endif
+
+static int __ATTRS_o_ai
+vec_any_lt(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpgtfp_p(__CR6_EQ_REV, __b, __a);
+}
+
+/* vec_any_nan */
+
+static int __attribute__((__always_inline__))
+vec_any_nan(vector float __a)
+{
+  return __builtin_altivec_vcmpeqfp_p(__CR6_LT_REV, __a, __a);
+}
+
+/* vec_any_ne */
+
+static int __ATTRS_o_ai
+vec_any_ne(vector signed char __a, vector signed char __b)
+{
+  return
+    __builtin_altivec_vcmpequb_p(__CR6_LT_REV, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector signed char __a, vector bool char __b)
+{
+  return
+    __builtin_altivec_vcmpequb_p(__CR6_LT_REV, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector unsigned char __a, vector unsigned char __b)
+{
+  return
+    __builtin_altivec_vcmpequb_p(__CR6_LT_REV, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector unsigned char __a, vector bool char __b)
+{
+  return
+    __builtin_altivec_vcmpequb_p(__CR6_LT_REV, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector bool char __a, vector signed char __b)
+{
+  return
+    __builtin_altivec_vcmpequb_p(__CR6_LT_REV, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector bool char __a, vector unsigned char __b)
+{
+  return
+    __builtin_altivec_vcmpequb_p(__CR6_LT_REV, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector bool char __a, vector bool char __b)
+{
+  return
+    __builtin_altivec_vcmpequb_p(__CR6_LT_REV, (vector char)__a, (vector char)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector short __a, vector short __b)
+{
+  return __builtin_altivec_vcmpequh_p(__CR6_LT_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpequh_p(__CR6_LT_REV, __a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector unsigned short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vcmpequh_p(__CR6_LT_REV, 
+                                      (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector unsigned short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpequh_p(__CR6_LT_REV,
+                                      (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector bool short __a, vector short __b)
+{
+  return __builtin_altivec_vcmpequh_p(__CR6_LT_REV,
+                                      (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector bool short __a, vector unsigned short __b)
+{
+  return __builtin_altivec_vcmpequh_p(__CR6_LT_REV,
+                                      (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector bool short __a, vector bool short __b)
+{
+  return __builtin_altivec_vcmpequh_p(__CR6_LT_REV,
+                                      (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector pixel __a, vector pixel __b)
+{
+  return __builtin_altivec_vcmpequh_p(__CR6_LT_REV,
+                                      (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector int __a, vector int __b)
+{
+  return __builtin_altivec_vcmpequw_p(__CR6_LT_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector int __a, vector bool int __b)
+{
+  return __builtin_altivec_vcmpequw_p(__CR6_LT_REV, __a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector unsigned int __a, vector unsigned int __b)
+{
+  return
+    __builtin_altivec_vcmpequw_p(__CR6_LT_REV, (vector int)__a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector unsigned int __a, vector bool int __b)
+{
+  return
+    __builtin_altivec_vcmpequw_p(__CR6_LT_REV, (vector int)__a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector bool int __a, vector int __b)
+{
+  return
+    __builtin_altivec_vcmpequw_p(__CR6_LT_REV, (vector int)__a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector bool int __a, vector unsigned int __b)
+{
+  return
+    __builtin_altivec_vcmpequw_p(__CR6_LT_REV, (vector int)__a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector bool int __a, vector bool int __b)
+{
+  return
+    __builtin_altivec_vcmpequw_p(__CR6_LT_REV, (vector int)__a, (vector int)__b);
+}
+
+#ifdef __POWER8_VECTOR__
+static int __ATTRS_o_ai
+vec_any_ne(vector signed long long __a, vector signed long long __b)
+{
+  return __builtin_altivec_vcmpequd_p(__CR6_LT_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector unsigned long long __a, vector unsigned long long __b)
+{
+  return
+    __builtin_altivec_vcmpequd_p(__CR6_LT_REV, (vector long long)__a, 
+                                 (vector long long)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector signed long long __a, vector bool long long __b)
+{
+  return __builtin_altivec_vcmpequd_p(__CR6_LT_REV, __a,
+                                      (vector signed long long)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector unsigned long long __a, vector bool long long __b)
+{
+  return
+    __builtin_altivec_vcmpequd_p(__CR6_LT_REV, (vector signed long long)__a, 
+                                 (vector signed long long)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector bool long long __a, vector signed long long __b)
+{
+  return
+    __builtin_altivec_vcmpequd_p(__CR6_LT_REV, (vector signed long long)__a, 
+                                 (vector signed long long)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector bool long long __a, vector unsigned long long __b)
+{
+  return
+    __builtin_altivec_vcmpequd_p(__CR6_LT_REV, (vector signed long long)__a, 
+                                 (vector signed long long)__b);
+}
+
+static int __ATTRS_o_ai
+vec_any_ne(vector bool long long __a, vector bool long long __b)
+{
+  return
+    __builtin_altivec_vcmpequd_p(__CR6_LT_REV, (vector signed long long)__a, 
+                                 (vector signed long long)__b);
+}
+#endif
+
+static int __ATTRS_o_ai
+vec_any_ne(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpeqfp_p(__CR6_LT_REV, __a, __b);
+}
+
+/* vec_any_nge */
+
+static int __attribute__((__always_inline__))
+vec_any_nge(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpgefp_p(__CR6_LT_REV, __a, __b);
+}
+
+/* vec_any_ngt */
+
+static int __attribute__((__always_inline__))
+vec_any_ngt(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpgtfp_p(__CR6_LT_REV, __a, __b);
+}
+
+/* vec_any_nle */
+
+static int __attribute__((__always_inline__))
+vec_any_nle(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpgefp_p(__CR6_LT_REV, __b, __a);
+}
+
+/* vec_any_nlt */
+
+static int __attribute__((__always_inline__))
+vec_any_nlt(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpgtfp_p(__CR6_LT_REV, __b, __a);
+}
+
+/* vec_any_numeric */
+
+static int __attribute__((__always_inline__))
+vec_any_numeric(vector float __a)
+{
+  return __builtin_altivec_vcmpeqfp_p(__CR6_EQ_REV, __a, __a);
+}
+
+/* vec_any_out */
+
+static int __attribute__((__always_inline__))
+vec_any_out(vector float __a, vector float __b)
+{
+  return __builtin_altivec_vcmpbfp_p(__CR6_EQ_REV, __a, __b);
+}
+
+/* Power 8 Crypto functions
+Note: We diverge from the current GCC implementation with regard
+to cryptography and related functions as follows:
+- Only the SHA and AES instructions and builtins are disabled by -mno-crypto
+- The remaining ones are only available on Power8 and up so
+  require -mpower8-vector
+The justification for this is that export requirements require that
+Category:Vector.Crypto is optional (i.e. compliant hardware may not provide
+support). As a result, we need to be able to turn off support for those.
+The remaining ones (currently controlled by -mcrypto for GCC) still
+need to be provided on compliant hardware even if Vector.Crypto is not
+provided.
+FIXME: the naming convention for the builtins will be adjusted due
+to the inconsistency (__builtin_crypto_ prefix on builtins that cannot be
+removed with -mno-crypto). This is under development.
+*/
+#ifdef __CRYPTO__
+static vector unsigned long long __attribute__((__always_inline__))
+__builtin_crypto_vsbox (vector unsigned long long __a)
+{
+  return __builtin_altivec_crypto_vsbox(__a);
+}
+
+static vector unsigned long long __attribute__((__always_inline__))
+__builtin_crypto_vcipher (vector unsigned long long __a,
+                          vector unsigned long long __b)
+{
+  return __builtin_altivec_crypto_vcipher(__a, __b);
+}
+
+static vector unsigned long long __attribute__((__always_inline__))
+__builtin_crypto_vcipherlast (vector unsigned long long __a,
+                              vector unsigned long long __b)
+{
+  return __builtin_altivec_crypto_vcipherlast(__a, __b);
+}
+
+static vector unsigned long long __attribute__((__always_inline__))
+__builtin_crypto_vncipher (vector unsigned long long __a,
+                           vector unsigned long long __b)
+{
+  return __builtin_altivec_crypto_vncipher(__a, __b);
+}
+
+static vector unsigned long long __attribute__((__always_inline__))
+__builtin_crypto_vncipherlast (vector unsigned long long __a,
+                               vector unsigned long long __b)
+{
+  return __builtin_altivec_crypto_vncipherlast(__a, __b);
+}
+
+#define __builtin_crypto_vshasigmad __builtin_altivec_crypto_vshasigmad
+#define __builtin_crypto_vshasigmaw __builtin_altivec_crypto_vshasigmaw
+#endif
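+
+/* A minimal usage sketch (hypothetical caller, not part of this header):
+ * because only the SHA/AES builtins above are gated on __CRYPTO__, code
+ * wanting AES acceleration guards on that macro and falls back otherwise:
+ *
+ *   #ifdef __CRYPTO__
+ *     state = __builtin_crypto_vcipher(state, round_key);
+ *   #else
+ *     state = portable_aes_round(state, round_key);  (hypothetical fallback)
+ *   #endif
+ *
+ * The vpermxor/vpmsum wrappers below require only -mpower8-vector, as noted
+ * above.
+ */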
+
+#ifdef __POWER8_VECTOR__
+static vector unsigned char __ATTRS_o_ai
+__builtin_crypto_vpermxor (vector unsigned char __a,
+                           vector unsigned char __b,
+                           vector unsigned char __c)
+{
+  return __builtin_altivec_crypto_vpermxor(__a, __b, __c);
+}
+
+static vector unsigned short __ATTRS_o_ai
+__builtin_crypto_vpermxor (vector unsigned short __a,
+                           vector unsigned short __b,
+                           vector unsigned short __c)
+{
+  return (vector unsigned short)
+          __builtin_altivec_crypto_vpermxor((vector unsigned char) __a,
+                                             (vector unsigned char) __b,
+                                             (vector unsigned char) __c);
+}
+
+static vector unsigned int __ATTRS_o_ai
+__builtin_crypto_vpermxor (vector unsigned int __a,
+                           vector unsigned int __b,
+                           vector unsigned int __c)
+{
+  return (vector unsigned int)
+          __builtin_altivec_crypto_vpermxor((vector unsigned char) __a,
+                                              (vector unsigned char) __b,
+                                              (vector unsigned char) __c);
+}
+
+static vector unsigned long long __ATTRS_o_ai
+__builtin_crypto_vpermxor (vector unsigned long long __a,
+                           vector unsigned long long __b,
+                           vector unsigned long long __c)
+{
+  return (vector unsigned long long)
+          __builtin_altivec_crypto_vpermxor((vector unsigned char) __a,
+                                              (vector unsigned char) __b,
+                                              (vector unsigned char) __c);
+}
+
+static vector unsigned char __ATTRS_o_ai
+__builtin_crypto_vpmsumb (vector unsigned char __a,
+                          vector unsigned char __b)
+{
+  return __builtin_altivec_crypto_vpmsumb(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+__builtin_crypto_vpmsumb (vector unsigned short __a,
+                          vector unsigned short __b)
+{
+  return __builtin_altivec_crypto_vpmsumh(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+__builtin_crypto_vpmsumb (vector unsigned int __a,
+                          vector unsigned int __b)
+{
+  return __builtin_altivec_crypto_vpmsumw(__a, __b);
+}
+
+static vector unsigned long long __ATTRS_o_ai
+__builtin_crypto_vpmsumb (vector unsigned long long __a,
+                          vector unsigned long long __b)
+{
+  return __builtin_altivec_crypto_vpmsumd(__a, __b);
+}
+#endif
+
+#undef __ATTRS_o_ai
+
+#endif /* __ALTIVEC_H */
diff --git a/23.0.3/clang-include/ammintrin.h b/23.0.3/clang-include/ammintrin.h
new file mode 100644
index 0000000..d87b9cd
--- /dev/null
+++ b/23.0.3/clang-include/ammintrin.h
@@ -0,0 +1,68 @@
+/*===---- ammintrin.h - SSE4a intrinsics -----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __AMMINTRIN_H
+#define __AMMINTRIN_H
+
+#ifndef __SSE4A__
+#error "SSE4A instruction set not enabled"
+#else
+
+#include <pmmintrin.h>
+
+#define _mm_extracti_si64(x, len, idx) \
+  ((__m128i)__builtin_ia32_extrqi((__v2di)(__m128i)(x), \
+                                  (char)(len), (char)(idx)))
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_extract_si64(__m128i __x, __m128i __y)
+{
+  return (__m128i)__builtin_ia32_extrq((__v2di)__x, (__v16qi)__y);
+}
+
+#define _mm_inserti_si64(x, y, len, idx) \
+  ((__m128i)__builtin_ia32_insertqi((__v2di)(__m128i)(x), \
+                                    (__v2di)(__m128i)(y), \
+                                    (char)(len), (char)(idx)))
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_insert_si64(__m128i __x, __m128i __y)
+{
+  return (__m128i)__builtin_ia32_insertq((__v2di)__x, (__v2di)__y);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_stream_sd(double *__p, __m128d __a)
+{
+  __builtin_ia32_movntsd(__p, (__v2df)__a);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_stream_ss(float *__p, __m128 __a)
+{
+  __builtin_ia32_movntss(__p, (__v4sf)__a);
+}
+
+#endif /* __SSE4A__ */
+
+#endif /* __AMMINTRIN_H */
diff --git a/23.0.3/clang-include/arm_acle.h b/23.0.3/clang-include/arm_acle.h
new file mode 100644
index 0000000..6c56f3b
--- /dev/null
+++ b/23.0.3/clang-include/arm_acle.h
@@ -0,0 +1,296 @@
+/*===---- arm_acle.h - ARM Non-Neon intrinsics -----------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __ARM_ACLE_H
+#define __ARM_ACLE_H
+
+#ifndef __ARM_ACLE
+#error "ACLE intrinsics support not enabled."
+#endif
+
+#include <stdint.h>
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/* 8 SYNCHRONIZATION, BARRIER AND HINT INTRINSICS */
+/* 8.3 Memory barriers */
+#if !defined(_MSC_VER)
+#define __dmb(i) __builtin_arm_dmb(i)
+#define __dsb(i) __builtin_arm_dsb(i)
+#define __isb(i) __builtin_arm_isb(i)
+#endif
+
+/* 8.4 Hints */
+
+#if !defined(_MSC_VER)
+static __inline__ void __attribute__((__always_inline__, __nodebug__)) __wfi(void) {
+  __builtin_arm_wfi();
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__)) __wfe(void) {
+  __builtin_arm_wfe();
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__)) __sev(void) {
+  __builtin_arm_sev();
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__)) __sevl(void) {
+  __builtin_arm_sevl();
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__)) __yield(void) {
+  __builtin_arm_yield();
+}
+#endif
+
+#if __ARM_32BIT_STATE
+#define __dbg(t) __builtin_arm_dbg(t)
+#endif
+
+/* 8.5 Swap */
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+  __swp(uint32_t x, volatile uint32_t *p) {
+  uint32_t v;
+  do v = __builtin_arm_ldrex(p); while (__builtin_arm_strex(x, p));
+  return v;
+}
+
+/* 8.6 Memory prefetch intrinsics */
+/* 8.6.1 Data prefetch */
+#define __pld(addr) __pldx(0, 0, 0, addr)
+
+#if __ARM_32BIT_STATE
+#define __pldx(access_kind, cache_level, retention_policy, addr) \
+  __builtin_arm_prefetch(addr, access_kind, 1)
+#else
+#define __pldx(access_kind, cache_level, retention_policy, addr) \
+  __builtin_arm_prefetch(addr, access_kind, cache_level, retention_policy, 1)
+#endif
+
+/* 8.6.2 Instruction prefetch */
+#define __pli(addr) __plix(0, 0, addr)
+
+#if __ARM_32BIT_STATE
+#define __plix(cache_level, retention_policy, addr) \
+  __builtin_arm_prefetch(addr, 0, 0)
+#else
+#define __plix(cache_level, retention_policy, addr) \
+  __builtin_arm_prefetch(addr, 0, cache_level, retention_policy, 0)
+#endif
+
+/* 8.7 NOP */
+static __inline__ void __attribute__((__always_inline__, __nodebug__)) __nop(void) {
+  __builtin_arm_nop();
+}
+
+/* 9 DATA-PROCESSING INTRINSICS */
+/* 9.2 Miscellaneous data-processing intrinsics */
+/* ROR */
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+  __ror(uint32_t x, uint32_t y) {
+  y %= 32;
+  if (y == 0)  return x;
+  return (x >> y) | (x << (32 - y));
+}
+
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
+  __rorll(uint64_t x, uint32_t y) {
+  y %= 64;
+  if (y == 0)  return x;
+  return (x >> y) | (x << (64 - y));
+}
+
+static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
+  __rorl(unsigned long x, uint32_t y) {
+#if __SIZEOF_LONG__ == 4
+  return __ror(x, y);
+#else
+  return __rorll(x, y);
+#endif
+}
+
+/* CLZ */
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+  __clz(uint32_t t) {
+  return __builtin_clz(t);
+}
+
+static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
+  __clzl(unsigned long t) {
+  return __builtin_clzl(t);
+}
+
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
+  __clzll(uint64_t t) {
+  return __builtin_clzll(t);
+}
+
+/* REV */
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+  __rev(uint32_t t) {
+  return __builtin_bswap32(t);
+}
+
+static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
+  __revl(unsigned long t) {
+#if __SIZEOF_LONG__ == 4
+  return __builtin_bswap32(t);
+#else
+  return __builtin_bswap64(t);
+#endif
+}
+
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
+  __revll(uint64_t t) {
+  return __builtin_bswap64(t);
+}
+
+/* REV16 */
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+  __rev16(uint32_t t) {
+  return __ror(__rev(t), 16);
+}
+
+static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
+  __rev16l(unsigned long t) {
+    /* Rotate by half the width of long in bits (16 or 32). */
+    return __rorl(__revl(t), sizeof(long) * 4);
+}
+
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
+  __rev16ll(uint64_t t) {
+  return __rorll(__revll(t), 32);
+}
+
+/* REVSH */
+static __inline__ int16_t __attribute__((__always_inline__, __nodebug__))
+  __revsh(int16_t t) {
+  return __builtin_bswap16(t);
+}
+
+/* RBIT */
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+  __rbit(uint32_t t) {
+  return __builtin_arm_rbit(t);
+}
+
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
+  __rbitll(uint64_t t) {
+#if __ARM_32BIT_STATE
+  return (((uint64_t) __builtin_arm_rbit(t)) << 32) |
+    __builtin_arm_rbit(t >> 32);
+#else
+  return __builtin_arm_rbit64(t);
+#endif
+}
+
+static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
+  __rbitl(unsigned long t) {
+#if __SIZEOF_LONG__ == 4
+  return __rbit(t);
+#else
+  return __rbitll(t);
+#endif
+}
+
+/*
+ * 9.4 Saturating intrinsics
+ *
+ * FIXME: Change guard to their corresponding __ARM_FEATURE flag when Q flag
+ * intrinsics are implemented and the flag is enabled.
+ */
+/* 9.4.1 Width-specified saturation intrinsics */
+#if __ARM_32BIT_STATE
+#define __ssat(x, y) __builtin_arm_ssat(x, y)
+#define __usat(x, y) __builtin_arm_usat(x, y)
+#endif
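+
+/* A minimal usage sketch (hypothetical audio-mixing snippet; AArch32 only,
+ * since the macros above are guarded by __ARM_32BIT_STATE): __ssat(x, 16)
+ * clamps x to the signed 16-bit range [-32768, 32767]:
+ *
+ *   int32_t mixed = left + right;
+ *   int16_t out   = (int16_t)__ssat(mixed, 16);
+ */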
+
+/* 9.4.2 Saturating addition and subtraction intrinsics */
+#if __ARM_32BIT_STATE
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+  __qadd(int32_t t, int32_t v) {
+  return __builtin_arm_qadd(t, v);
+}
+
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+  __qsub(int32_t t, int32_t v) {
+  return __builtin_arm_qsub(t, v);
+}
+
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__qdbl(int32_t t) {
+  return __builtin_arm_qadd(t, t);
+}
+#endif
+
+/* 9.7 CRC32 intrinsics */
+#if __ARM_FEATURE_CRC32
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+  __crc32b(uint32_t a, uint8_t b) {
+  return __builtin_arm_crc32b(a, b);
+}
+
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+  __crc32h(uint32_t a, uint16_t b) {
+  return __builtin_arm_crc32h(a, b);
+}
+
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+  __crc32w(uint32_t a, uint32_t b) {
+  return __builtin_arm_crc32w(a, b);
+}
+
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+  __crc32d(uint32_t a, uint64_t b) {
+  return __builtin_arm_crc32d(a, b);
+}
+
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+  __crc32cb(uint32_t a, uint8_t b) {
+  return __builtin_arm_crc32cb(a, b);
+}
+
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+  __crc32ch(uint32_t a, uint16_t b) {
+  return __builtin_arm_crc32ch(a, b);
+}
+
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+  __crc32cw(uint32_t a, uint32_t b) {
+  return __builtin_arm_crc32cw(a, b);
+}
+
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+  __crc32cd(uint32_t a, uint64_t b) {
+  return __builtin_arm_crc32cd(a, b);
+}
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /* __ARM_ACLE_H */
diff --git a/23.0.3/clang-include/avx2intrin.h b/23.0.3/clang-include/avx2intrin.h
new file mode 100644
index 0000000..949195b
--- /dev/null
+++ b/23.0.3/clang-include/avx2intrin.h
@@ -0,0 +1,1241 @@
+/*===---- avx2intrin.h - AVX2 intrinsics -----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <avx2intrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX2INTRIN_H
+#define __AVX2INTRIN_H
+
+/* SSE4 Multiple Packed Sums of Absolute Difference.  */
+#define _mm256_mpsadbw_epu8(X, Y, M) __builtin_ia32_mpsadbw256((X), (Y), (M))
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_abs_epi8(__m256i __a)
+{
+    return (__m256i)__builtin_ia32_pabsb256((__v32qi)__a);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_abs_epi16(__m256i __a)
+{
+    return (__m256i)__builtin_ia32_pabsw256((__v16hi)__a);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_abs_epi32(__m256i __a)
+{
+    return (__m256i)__builtin_ia32_pabsd256((__v8si)__a);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_packs_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_packsswb256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_packs_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_packssdw256((__v8si)__a, (__v8si)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_packus_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_packuswb256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_packus_epi32(__m256i __V1, __m256i __V2)
+{
+  return (__m256i) __builtin_ia32_packusdw256((__v8si)__V1, (__v8si)__V2);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_add_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v32qi)__a + (__v32qi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_add_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v16hi)__a + (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_add_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v8si)__a + (__v8si)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_add_epi64(__m256i __a, __m256i __b)
+{
+  return __a + __b;
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_adds_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_paddsb256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_adds_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_paddsw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_adds_epu8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_paddusb256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_adds_epu16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_paddusw256((__v16hi)__a, (__v16hi)__b);
+}
+
+#define _mm256_alignr_epi8(a, b, n) __extension__ ({ \
+  __m256i __a = (a); \
+  __m256i __b = (b); \
+  (__m256i)__builtin_ia32_palignr256((__v32qi)__a, (__v32qi)__b, (n)); })
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_and_si256(__m256i __a, __m256i __b)
+{
+  return __a & __b;
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_andnot_si256(__m256i __a, __m256i __b)
+{
+  return ~__a & __b;
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_avg_epu8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pavgb256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_avg_epu16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pavgw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M)
+{
+  return (__m256i)__builtin_ia32_pblendvb256((__v32qi)__V1, (__v32qi)__V2,
+                                              (__v32qi)__M);
+}
+
+#define _mm256_blend_epi16(V1, V2, M) __extension__ ({ \
+  __m256i __V1 = (V1); \
+  __m256i __V2 = (V2); \
+  (__m256i)__builtin_shufflevector((__v16hi)__V1, (__v16hi)__V2, \
+                                   (((M) & 0x01) ? 16 : 0), \
+                                   (((M) & 0x02) ? 17 : 1), \
+                                   (((M) & 0x04) ? 18 : 2), \
+                                   (((M) & 0x08) ? 19 : 3), \
+                                   (((M) & 0x10) ? 20 : 4), \
+                                   (((M) & 0x20) ? 21 : 5), \
+                                   (((M) & 0x40) ? 22 : 6), \
+                                   (((M) & 0x80) ? 23 : 7), \
+                                   (((M) & 0x01) ? 24 : 8), \
+                                   (((M) & 0x02) ? 25 : 9), \
+                                   (((M) & 0x04) ? 26 : 10), \
+                                   (((M) & 0x08) ? 27 : 11), \
+                                   (((M) & 0x10) ? 28 : 12), \
+                                   (((M) & 0x20) ? 29 : 13), \
+                                   (((M) & 0x40) ? 30 : 14), \
+                                   (((M) & 0x80) ? 31 : 15)); })
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpeq_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v32qi)__a == (__v32qi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpeq_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v16hi)__a == (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpeq_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v8si)__a == (__v8si)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpeq_epi64(__m256i __a, __m256i __b)
+{
+  return (__m256i)(__a == __b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpgt_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v32qi)__a > (__v32qi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpgt_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v16hi)__a > (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpgt_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v8si)__a > (__v8si)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpgt_epi64(__m256i __a, __m256i __b)
+{
+  return (__m256i)(__a > __b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_hadd_epi16(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_phaddw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_hadd_epi32(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_hadds_epi16(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_hsub_epi16(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_phsubw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_hsub_epi32(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_hsubs_epi16(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_maddubs_epi16(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_pmaddubsw256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_madd_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmaddwd256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_max_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmaxsb256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_max_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmaxsw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_max_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmaxsd256((__v8si)__a, (__v8si)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_max_epu8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmaxub256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_max_epu16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmaxuw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_max_epu32(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmaxud256((__v8si)__a, (__v8si)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_min_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pminsb256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_min_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pminsw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_min_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pminsd256((__v8si)__a, (__v8si)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_min_epu8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pminub256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_min_epu16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pminuw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_min_epu32(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pminud256((__v8si)__a, (__v8si)__b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm256_movemask_epi8(__m256i __a)
+{
+  return __builtin_ia32_pmovmskb256((__v32qi)__a);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cvtepi8_epi16(__m128i __V)
+{
+  return (__m256i)__builtin_ia32_pmovsxbw256((__v16qi)__V);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cvtepi8_epi32(__m128i __V)
+{
+  return (__m256i)__builtin_ia32_pmovsxbd256((__v16qi)__V);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cvtepi8_epi64(__m128i __V)
+{
+  return (__m256i)__builtin_ia32_pmovsxbq256((__v16qi)__V);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cvtepi16_epi32(__m128i __V)
+{
+  return (__m256i)__builtin_ia32_pmovsxwd256((__v8hi)__V);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cvtepi16_epi64(__m128i __V)
+{
+  return (__m256i)__builtin_ia32_pmovsxwq256((__v8hi)__V);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cvtepi32_epi64(__m128i __V)
+{
+  return (__m256i)__builtin_ia32_pmovsxdq256((__v4si)__V);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cvtepu8_epi16(__m128i __V)
+{
+  return (__m256i)__builtin_ia32_pmovzxbw256((__v16qi)__V);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cvtepu8_epi32(__m128i __V)
+{
+  return (__m256i)__builtin_ia32_pmovzxbd256((__v16qi)__V);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cvtepu8_epi64(__m128i __V)
+{
+  return (__m256i)__builtin_ia32_pmovzxbq256((__v16qi)__V);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cvtepu16_epi32(__m128i __V)
+{
+  return (__m256i)__builtin_ia32_pmovzxwd256((__v8hi)__V);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cvtepu16_epi64(__m128i __V)
+{
+  return (__m256i)__builtin_ia32_pmovzxwq256((__v8hi)__V);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cvtepu32_epi64(__m128i __V)
+{
+  return (__m256i)__builtin_ia32_pmovzxdq256((__v4si)__V);
+}
+
+static __inline__  __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_mul_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmuldq256((__v8si)__a, (__v8si)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_mulhrs_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmulhrsw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_mulhi_epu16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmulhuw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_mulhi_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmulhw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_mullo_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v16hi)__a * (__v16hi)__b);
+}
+
+static __inline__  __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_mullo_epi32 (__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v8si)__a * (__v8si)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_mul_epu32(__m256i __a, __m256i __b)
+{
+  return __builtin_ia32_pmuludq256((__v8si)__a, (__v8si)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_or_si256(__m256i __a, __m256i __b)
+{
+  return __a | __b;
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_sad_epu8(__m256i __a, __m256i __b)
+{
+  return __builtin_ia32_psadbw256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_shuffle_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pshufb256((__v32qi)__a, (__v32qi)__b);
+}
+
+#define _mm256_shuffle_epi32(a, imm) __extension__ ({ \
+  __m256i __a = (a); \
+  (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)_mm256_set1_epi32(0), \
+                                   (imm) & 0x3, ((imm) & 0xc) >> 2, \
+                                   ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \
+                                   4 + (((imm) & 0x03) >> 0), \
+                                   4 + (((imm) & 0x0c) >> 2), \
+                                   4 + (((imm) & 0x30) >> 4), \
+                                   4 + (((imm) & 0xc0) >> 6)); })
+
+#define _mm256_shufflehi_epi16(a, imm) __extension__ ({ \
+  __m256i __a = (a); \
+  (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)_mm256_set1_epi16(0), \
+                                   0, 1, 2, 3, \
+                                   4 + (((imm) & 0x03) >> 0), \
+                                   4 + (((imm) & 0x0c) >> 2), \
+                                   4 + (((imm) & 0x30) >> 4), \
+                                   4 + (((imm) & 0xc0) >> 6), \
+                                   8, 9, 10, 11, \
+                                   12 + (((imm) & 0x03) >> 0), \
+                                   12 + (((imm) & 0x0c) >> 2), \
+                                   12 + (((imm) & 0x30) >> 4), \
+                                   12 + (((imm) & 0xc0) >> 6)); })
+
+#define _mm256_shufflelo_epi16(a, imm) __extension__ ({ \
+  __m256i __a = (a); \
+  (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)_mm256_set1_epi16(0), \
+                                   (imm) & 0x3,((imm) & 0xc) >> 2, \
+                                   ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \
+                                   4, 5, 6, 7, \
+                                   8 + (((imm) & 0x03) >> 0), \
+                                   8 + (((imm) & 0x0c) >> 2), \
+                                   8 + (((imm) & 0x30) >> 4), \
+                                   8 + (((imm) & 0xc0) >> 6), \
+                                   12, 13, 14, 15); })
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_sign_epi8(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_psignb256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_sign_epi16(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_psignw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_sign_epi32(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_psignd256((__v8si)__a, (__v8si)__b);
+}
+
+#define _mm256_slli_si256(a, count) __extension__ ({ \
+  __m256i __a = (a); \
+  (__m256i)__builtin_ia32_pslldqi256(__a, (count)*8); })
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_slli_epi16(__m256i __a, int __count)
+{
+  return (__m256i)__builtin_ia32_psllwi256((__v16hi)__a, __count);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_sll_epi16(__m256i __a, __m128i __count)
+{
+  return (__m256i)__builtin_ia32_psllw256((__v16hi)__a, (__v8hi)__count);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_slli_epi32(__m256i __a, int __count)
+{
+  return (__m256i)__builtin_ia32_pslldi256((__v8si)__a, __count);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_sll_epi32(__m256i __a, __m128i __count)
+{
+  return (__m256i)__builtin_ia32_pslld256((__v8si)__a, (__v4si)__count);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_slli_epi64(__m256i __a, int __count)
+{
+  return __builtin_ia32_psllqi256(__a, __count);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_sll_epi64(__m256i __a, __m128i __count)
+{
+  return __builtin_ia32_psllq256(__a, __count);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_srai_epi16(__m256i __a, int __count)
+{
+  return (__m256i)__builtin_ia32_psrawi256((__v16hi)__a, __count);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_sra_epi16(__m256i __a, __m128i __count)
+{
+  return (__m256i)__builtin_ia32_psraw256((__v16hi)__a, (__v8hi)__count);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_srai_epi32(__m256i __a, int __count)
+{
+  return (__m256i)__builtin_ia32_psradi256((__v8si)__a, __count);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_sra_epi32(__m256i __a, __m128i __count)
+{
+  return (__m256i)__builtin_ia32_psrad256((__v8si)__a, (__v4si)__count);
+}
+
+#define _mm256_srli_si256(a, count) __extension__ ({ \
+  __m256i __a = (a); \
+  (__m256i)__builtin_ia32_psrldqi256(__a, (count)*8); })
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_srli_epi16(__m256i __a, int __count)
+{
+  return (__m256i)__builtin_ia32_psrlwi256((__v16hi)__a, __count);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_srl_epi16(__m256i __a, __m128i __count)
+{
+  return (__m256i)__builtin_ia32_psrlw256((__v16hi)__a, (__v8hi)__count);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_srli_epi32(__m256i __a, int __count)
+{
+  return (__m256i)__builtin_ia32_psrldi256((__v8si)__a, __count);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_srl_epi32(__m256i __a, __m128i __count)
+{
+  return (__m256i)__builtin_ia32_psrld256((__v8si)__a, (__v4si)__count);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_srli_epi64(__m256i __a, int __count)
+{
+  return __builtin_ia32_psrlqi256(__a, __count);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_srl_epi64(__m256i __a, __m128i __count)
+{
+  return __builtin_ia32_psrlq256(__a, __count);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_sub_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v32qi)__a - (__v32qi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_sub_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v16hi)__a - (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_sub_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v8si)__a - (__v8si)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_sub_epi64(__m256i __a, __m256i __b)
+{
+  return __a - __b;
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_subs_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_psubsb256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_subs_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_psubsw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_subs_epu8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_psubusb256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_subs_epu16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_psubusw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_unpackhi_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 8, 32+8, 9, 32+9, 10, 32+10, 11, 32+11, 12, 32+12, 13, 32+13, 14, 32+14, 15, 32+15, 24, 32+24, 25, 32+25, 26, 32+26, 27, 32+27, 28, 32+28, 29, 32+29, 30, 32+30, 31, 32+31);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_unpackhi_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_unpackhi_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 2, 8+2, 3, 8+3, 6, 8+6, 7, 8+7);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_unpackhi_epi64(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_shufflevector(__a, __b, 1, 4+1, 3, 4+3);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_unpacklo_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 0, 32+0, 1, 32+1, 2, 32+2, 3, 32+3, 4, 32+4, 5, 32+5, 6, 32+6, 7, 32+7, 16, 32+16, 17, 32+17, 18, 32+18, 19, 32+19, 20, 32+20, 21, 32+21, 22, 32+22, 23, 32+23);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_unpacklo_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_unpacklo_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 0, 8+0, 1, 8+1, 4, 8+4, 5, 8+5);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_unpacklo_epi64(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_shufflevector(__a, __b, 0, 4+0, 2, 4+2);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_xor_si256(__m256i __a, __m256i __b)
+{
+  return __a ^ __b;
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_stream_load_si256(__m256i *__V)
+{
+  return (__m256i)__builtin_ia32_movntdqa256((__v4di *)__V);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_broadcastss_ps(__m128 __X)
+{
+  return (__m128)__builtin_ia32_vbroadcastss_ps((__v4sf)__X);
+}
+
+static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_broadcastss_ps(__m128 __X)
+{
+  return (__m256)__builtin_ia32_vbroadcastss_ps256((__v4sf)__X);
+}
+
+static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_broadcastsd_pd(__m128d __X)
+{
+  return (__m256d)__builtin_ia32_vbroadcastsd_pd256((__v2df)__X);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_broadcastsi128_si256(__m128i __X)
+{
+  return (__m256i)__builtin_shufflevector(__X, __X, 0, 1, 0, 1);
+}
+
+#define _mm_blend_epi32(V1, V2, M) __extension__ ({ \
+  __m128i __V1 = (V1); \
+  __m128i __V2 = (V2); \
+  (__m128i)__builtin_shufflevector((__v4si)__V1, (__v4si)__V2, \
+                                   (((M) & 0x01) ? 4 : 0), \
+                                   (((M) & 0x02) ? 5 : 1), \
+                                   (((M) & 0x04) ? 6 : 2), \
+                                   (((M) & 0x08) ? 7 : 3)); })
+
+#define _mm256_blend_epi32(V1, V2, M) __extension__ ({ \
+  __m256i __V1 = (V1); \
+  __m256i __V2 = (V2); \
+  (__m256i)__builtin_shufflevector((__v8si)__V1, (__v8si)__V2, \
+                                   (((M) & 0x01) ?  8 : 0), \
+                                   (((M) & 0x02) ?  9 : 1), \
+                                   (((M) & 0x04) ? 10 : 2), \
+                                   (((M) & 0x08) ? 11 : 3), \
+                                   (((M) & 0x10) ? 12 : 4), \
+                                   (((M) & 0x20) ? 13 : 5), \
+                                   (((M) & 0x40) ? 14 : 6), \
+                                   (((M) & 0x80) ? 15 : 7)); })
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_broadcastb_epi8(__m128i __X)
+{
+  return (__m256i)__builtin_ia32_pbroadcastb256((__v16qi)__X);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_broadcastw_epi16(__m128i __X)
+{
+  return (__m256i)__builtin_ia32_pbroadcastw256((__v8hi)__X);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_broadcastd_epi32(__m128i __X)
+{
+  return (__m256i)__builtin_ia32_pbroadcastd256((__v4si)__X);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_broadcastq_epi64(__m128i __X)
+{
+  return (__m256i)__builtin_ia32_pbroadcastq256(__X);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_broadcastb_epi8(__m128i __X)
+{
+  return (__m128i)__builtin_ia32_pbroadcastb128((__v16qi)__X);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_broadcastw_epi16(__m128i __X)
+{
+  return (__m128i)__builtin_ia32_pbroadcastw128((__v8hi)__X);
+}
+
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_broadcastd_epi32(__m128i __X)
+{
+  return (__m128i)__builtin_ia32_pbroadcastd128((__v4si)__X);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_broadcastq_epi64(__m128i __X)
+{
+  return (__m128i)__builtin_ia32_pbroadcastq128(__X);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_permutevar8x32_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_permvarsi256((__v8si)__a, (__v8si)__b);
+}
+
+#define _mm256_permute4x64_pd(V, M) __extension__ ({ \
+  __m256d __V = (V); \
+  (__m256d)__builtin_shufflevector((__v4df)__V, (__v4df) _mm256_setzero_pd(), \
+                                   (M) & 0x3, ((M) & 0xc) >> 2, \
+                                   ((M) & 0x30) >> 4, ((M) & 0xc0) >> 6); })
+
+static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_permutevar8x32_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)__builtin_ia32_permvarsf256((__v8sf)__a, (__v8sf)__b);
+}
+
+#define _mm256_permute4x64_epi64(V, M) __extension__ ({ \
+  __m256i __V = (V); \
+  (__m256i)__builtin_shufflevector((__v4di)__V, (__v4di) _mm256_setzero_si256(), \
+                                   (M) & 0x3, ((M) & 0xc) >> 2, \
+                                   ((M) & 0x30) >> 4, ((M) & 0xc0) >> 6); })
+
+#define _mm256_permute2x128_si256(V1, V2, M) __extension__ ({ \
+  __m256i __V1 = (V1); \
+  __m256i __V2 = (V2); \
+  (__m256i)__builtin_ia32_permti256(__V1, __V2, (M)); })
+
+#define _mm256_extracti128_si256(V, M) __extension__ ({ \
+  (__m128i)__builtin_shufflevector( \
+    (__v4di)(V), \
+    (__v4di)(_mm256_setzero_si256()), \
+    (((M) & 1) ? 2 : 0), \
+    (((M) & 1) ? 3 : 1) );})
+
+#define _mm256_inserti128_si256(V1, V2, M) __extension__ ({ \
+  (__m256i)__builtin_shufflevector( \
+    (__v4di)(V1), \
+    (__v4di)_mm256_castsi128_si256((__m128i)(V2)), \
+    (((M) & 1) ? 0 : 4), \
+    (((M) & 1) ? 1 : 5), \
+    (((M) & 1) ? 4 : 2), \
+    (((M) & 1) ? 5 : 3) );})
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_maskload_epi32(int const *__X, __m256i __M)
+{
+  return (__m256i)__builtin_ia32_maskloadd256((const __v8si *)__X, (__v8si)__M);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_maskload_epi64(long long const *__X, __m256i __M)
+{
+  return (__m256i)__builtin_ia32_maskloadq256((const __v4di *)__X, __M);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_maskload_epi32(int const *__X, __m128i __M)
+{
+  return (__m128i)__builtin_ia32_maskloadd((const __v4si *)__X, (__v4si)__M);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_maskload_epi64(long long const *__X, __m128i __M)
+{
+  return (__m128i)__builtin_ia32_maskloadq((const __v2di *)__X, (__v2di)__M);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm256_maskstore_epi32(int *__X, __m256i __M, __m256i __Y)
+{
+  __builtin_ia32_maskstored256((__v8si *)__X, (__v8si)__M, (__v8si)__Y);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm256_maskstore_epi64(long long *__X, __m256i __M, __m256i __Y)
+{
+  __builtin_ia32_maskstoreq256((__v4di *)__X, __M, __Y);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_maskstore_epi32(int *__X, __m128i __M, __m128i __Y)
+{
+  __builtin_ia32_maskstored((__v4si *)__X, (__v4si)__M, (__v4si)__Y);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y)
+{
+  __builtin_ia32_maskstoreq(( __v2di *)__X, __M, __Y);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_sllv_epi32(__m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_psllv8si((__v8si)__X, (__v8si)__Y);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sllv_epi32(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_psllv4si((__v4si)__X, (__v4si)__Y);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_sllv_epi64(__m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_psllv4di(__X, __Y);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sllv_epi64(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_psllv2di(__X, __Y);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_srav_epi32(__m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_psrav8si((__v8si)__X, (__v8si)__Y);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_srav_epi32(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_psrav4si((__v4si)__X, (__v4si)__Y);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_srlv_epi32(__m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_psrlv8si((__v8si)__X, (__v8si)__Y);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_srlv_epi32(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_psrlv4si((__v4si)__X, (__v4si)__Y);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_srlv_epi64(__m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_psrlv4di(__X, __Y);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_srlv_epi64(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_psrlv2di(__X, __Y);
+}
+
+#define _mm_mask_i32gather_pd(a, m, i, mask, s) __extension__ ({ \
+  __m128d __a = (a); \
+  double const *__m = (m); \
+  __m128i __i = (i); \
+  __m128d __mask = (mask); \
+  (__m128d)__builtin_ia32_gatherd_pd((__v2df)__a, (const __v2df *)__m, \
+             (__v4si)__i, (__v2df)__mask, (s)); })
+
+#define _mm256_mask_i32gather_pd(a, m, i, mask, s) __extension__ ({ \
+  __m256d __a = (a); \
+  double const *__m = (m); \
+  __m128i __i = (i); \
+  __m256d __mask = (mask); \
+  (__m256d)__builtin_ia32_gatherd_pd256((__v4df)__a, (const __v4df *)__m, \
+             (__v4si)__i, (__v4df)__mask, (s)); })
+
+#define _mm_mask_i64gather_pd(a, m, i, mask, s) __extension__ ({ \
+  __m128d __a = (a); \
+  double const *__m = (m); \
+  __m128i __i = (i); \
+  __m128d __mask = (mask); \
+  (__m128d)__builtin_ia32_gatherq_pd((__v2df)__a, (const __v2df *)__m, \
+             (__v2di)__i, (__v2df)__mask, (s)); })
+
+#define _mm256_mask_i64gather_pd(a, m, i, mask, s) __extension__ ({ \
+  __m256d __a = (a); \
+  double const *__m = (m); \
+  __m256i __i = (i); \
+  __m256d __mask = (mask); \
+  (__m256d)__builtin_ia32_gatherq_pd256((__v4df)__a, (const __v4df *)__m, \
+             (__v4di)__i, (__v4df)__mask, (s)); })
+
+#define _mm_mask_i32gather_ps(a, m, i, mask, s) __extension__ ({ \
+  __m128 __a = (a); \
+  float const *__m = (m); \
+  __m128i __i = (i); \
+  __m128 __mask = (mask); \
+  (__m128)__builtin_ia32_gatherd_ps((__v4sf)__a, (const __v4sf *)__m, \
+            (__v4si)__i, (__v4sf)__mask, (s)); })
+
+#define _mm256_mask_i32gather_ps(a, m, i, mask, s) __extension__ ({ \
+  __m256 __a = (a); \
+  float const *__m = (m); \
+  __m256i __i = (i); \
+  __m256 __mask = (mask); \
+  (__m256)__builtin_ia32_gatherd_ps256((__v8sf)__a, (const __v8sf *)__m, \
+            (__v8si)__i, (__v8sf)__mask, (s)); })
+
+#define _mm_mask_i64gather_ps(a, m, i, mask, s) __extension__ ({ \
+  __m128 __a = (a); \
+  float const *__m = (m); \
+  __m128i __i = (i); \
+  __m128 __mask = (mask); \
+  (__m128)__builtin_ia32_gatherq_ps((__v4sf)__a, (const __v4sf *)__m, \
+            (__v2di)__i, (__v4sf)__mask, (s)); })
+
+#define _mm256_mask_i64gather_ps(a, m, i, mask, s) __extension__ ({ \
+  __m128 __a = (a); \
+  float const *__m = (m); \
+  __m256i __i = (i); \
+  __m128 __mask = (mask); \
+  (__m128)__builtin_ia32_gatherq_ps256((__v4sf)__a, (const __v4sf *)__m, \
+            (__v4di)__i, (__v4sf)__mask, (s)); })
+
+#define _mm_mask_i32gather_epi32(a, m, i, mask, s) __extension__ ({ \
+  __m128i __a = (a); \
+  int const *__m = (m); \
+  __m128i __i = (i); \
+  __m128i __mask = (mask); \
+  (__m128i)__builtin_ia32_gatherd_d((__v4si)__a, (const __v4si *)__m, \
+            (__v4si)__i, (__v4si)__mask, (s)); })
+
+#define _mm256_mask_i32gather_epi32(a, m, i, mask, s) __extension__ ({ \
+  __m256i __a = (a); \
+  int const *__m = (m); \
+  __m256i __i = (i); \
+  __m256i __mask = (mask); \
+  (__m256i)__builtin_ia32_gatherd_d256((__v8si)__a, (const __v8si *)__m, \
+            (__v8si)__i, (__v8si)__mask, (s)); })
+
+#define _mm_mask_i64gather_epi32(a, m, i, mask, s) __extension__ ({ \
+  __m128i __a = (a); \
+  int const *__m = (m); \
+  __m128i __i = (i); \
+  __m128i __mask = (mask); \
+  (__m128i)__builtin_ia32_gatherq_d((__v4si)__a, (const __v4si *)__m, \
+            (__v2di)__i, (__v4si)__mask, (s)); })
+
+#define _mm256_mask_i64gather_epi32(a, m, i, mask, s) __extension__ ({ \
+  __m128i __a = (a); \
+  int const *__m = (m); \
+  __m256i __i = (i); \
+  __m128i __mask = (mask); \
+  (__m128i)__builtin_ia32_gatherq_d256((__v4si)__a, (const __v4si *)__m, \
+            (__v4di)__i, (__v4si)__mask, (s)); })
+
+#define _mm_mask_i32gather_epi64(a, m, i, mask, s) __extension__ ({ \
+  __m128i __a = (a); \
+  long long const *__m = (m); \
+  __m128i __i = (i); \
+  __m128i __mask = (mask); \
+  (__m128i)__builtin_ia32_gatherd_q((__v2di)__a, (const __v2di *)__m, \
+             (__v4si)__i, (__v2di)__mask, (s)); })
+
+#define _mm256_mask_i32gather_epi64(a, m, i, mask, s) __extension__ ({ \
+  __m256i __a = (a); \
+  long long const *__m = (m); \
+  __m128i __i = (i); \
+  __m256i __mask = (mask); \
+  (__m256i)__builtin_ia32_gatherd_q256((__v4di)__a, (const __v4di *)__m, \
+             (__v4si)__i, (__v4di)__mask, (s)); })
+
+#define _mm_mask_i64gather_epi64(a, m, i, mask, s) __extension__ ({ \
+  __m128i __a = (a); \
+  long long const *__m = (m); \
+  __m128i __i = (i); \
+  __m128i __mask = (mask); \
+  (__m128i)__builtin_ia32_gatherq_q((__v2di)__a, (const __v2di *)__m, \
+             (__v2di)__i, (__v2di)__mask, (s)); })
+
+#define _mm256_mask_i64gather_epi64(a, m, i, mask, s) __extension__ ({ \
+  __m256i __a = (a); \
+  long long const *__m = (m); \
+  __m256i __i = (i); \
+  __m256i __mask = (mask); \
+  (__m256i)__builtin_ia32_gatherq_q256((__v4di)__a, (const __v4di *)__m, \
+             (__v4di)__i, (__v4di)__mask, (s)); })
+
+#define _mm_i32gather_pd(m, i, s) __extension__ ({ \
+  double const *__m = (m); \
+  __m128i __i = (i); \
+  (__m128d)__builtin_ia32_gatherd_pd((__v2df)_mm_setzero_pd(), \
+             (const __v2df *)__m, (__v4si)__i, \
+             (__v2df)_mm_set1_pd((double)(long long int)-1), (s)); })
+
+#define _mm256_i32gather_pd(m, i, s) __extension__ ({ \
+  double const *__m = (m); \
+  __m128i __i = (i); \
+  (__m256d)__builtin_ia32_gatherd_pd256((__v4df)_mm256_setzero_pd(), \
+             (const __v4df *)__m, (__v4si)__i, \
+             (__v4df)_mm256_set1_pd((double)(long long int)-1), (s)); })
+
+#define _mm_i64gather_pd(m, i, s) __extension__ ({ \
+  double const *__m = (m); \
+  __m128i __i = (i); \
+  (__m128d)__builtin_ia32_gatherq_pd((__v2df)_mm_setzero_pd(), \
+             (const __v2df *)__m, (__v2di)__i, \
+             (__v2df)_mm_set1_pd((double)(long long int)-1), (s)); })
+
+#define _mm256_i64gather_pd(m, i, s) __extension__ ({ \
+  double const *__m = (m); \
+  __m256i __i = (i); \
+  (__m256d)__builtin_ia32_gatherq_pd256((__v4df)_mm256_setzero_pd(), \
+             (const __v4df *)__m, (__v4di)__i, \
+             (__v4df)_mm256_set1_pd((double)(long long int)-1), (s)); })
+
+#define _mm_i32gather_ps(m, i, s) __extension__ ({ \
+  float const *__m = (m); \
+  __m128i __i = (i); \
+  (__m128)__builtin_ia32_gatherd_ps((__v4sf)_mm_setzero_ps(), \
+             (const __v4sf *)__m, (__v4si)__i, \
+             (__v4sf)_mm_set1_ps((float)(int)-1), (s)); })
+
+#define _mm256_i32gather_ps(m, i, s) __extension__ ({ \
+  float const *__m = (m); \
+  __m256i __i = (i); \
+  (__m256)__builtin_ia32_gatherd_ps256((__v8sf)_mm256_setzero_ps(), \
+             (const __v8sf *)__m, (__v8si)__i, \
+             (__v8sf)_mm256_set1_ps((float)(int)-1), (s)); })
+
+#define _mm_i64gather_ps(m, i, s) __extension__ ({ \
+  float const *__m = (m); \
+  __m128i __i = (i); \
+  (__m128)__builtin_ia32_gatherq_ps((__v4sf)_mm_setzero_ps(), \
+             (const __v4sf *)__m, (__v2di)__i, \
+             (__v4sf)_mm_set1_ps((float)(int)-1), (s)); })
+
+#define _mm256_i64gather_ps(m, i, s) __extension__ ({ \
+  float const *__m = (m); \
+  __m256i __i = (i); \
+  (__m128)__builtin_ia32_gatherq_ps256((__v4sf)_mm_setzero_ps(), \
+             (const __v4sf *)__m, (__v4di)__i, \
+             (__v4sf)_mm_set1_ps((float)(int)-1), (s)); })
+
+#define _mm_i32gather_epi32(m, i, s) __extension__ ({ \
+  int const *__m = (m); \
+  __m128i __i = (i); \
+  (__m128i)__builtin_ia32_gatherd_d((__v4si)_mm_setzero_si128(), \
+            (const __v4si *)__m, (__v4si)__i, \
+            (__v4si)_mm_set1_epi32(-1), (s)); })
+
+#define _mm256_i32gather_epi32(m, i, s) __extension__ ({ \
+  int const *__m = (m); \
+  __m256i __i = (i); \
+  (__m256i)__builtin_ia32_gatherd_d256((__v8si)_mm256_setzero_si256(), \
+            (const __v8si *)__m, (__v8si)__i, \
+            (__v8si)_mm256_set1_epi32(-1), (s)); })
+
+#define _mm_i64gather_epi32(m, i, s) __extension__ ({ \
+  int const *__m = (m); \
+  __m128i __i = (i); \
+  (__m128i)__builtin_ia32_gatherq_d((__v4si)_mm_setzero_si128(), \
+            (const __v4si *)__m, (__v2di)__i, \
+            (__v4si)_mm_set1_epi32(-1), (s)); })
+
+#define _mm256_i64gather_epi32(m, i, s) __extension__ ({ \
+  int const *__m = (m); \
+  __m256i __i = (i); \
+  (__m128i)__builtin_ia32_gatherq_d256((__v4si)_mm_setzero_si128(), \
+            (const __v4si *)__m, (__v4di)__i, \
+            (__v4si)_mm_set1_epi32(-1), (s)); })
+
+#define _mm_i32gather_epi64(m, i, s) __extension__ ({ \
+  long long const *__m = (m); \
+  __m128i __i = (i); \
+  (__m128i)__builtin_ia32_gatherd_q((__v2di)_mm_setzero_si128(), \
+             (const __v2di *)__m, (__v4si)__i, \
+             (__v2di)_mm_set1_epi64x(-1), (s)); })
+
+#define _mm256_i32gather_epi64(m, i, s) __extension__ ({ \
+  long long const *__m = (m); \
+  __m128i __i = (i); \
+  (__m256i)__builtin_ia32_gatherd_q256((__v4di)_mm256_setzero_si256(), \
+             (const __v4di *)__m, (__v4si)__i, \
+             (__v4di)_mm256_set1_epi64x(-1), (s)); })
+
+#define _mm_i64gather_epi64(m, i, s) __extension__ ({ \
+  long long const *__m = (m); \
+  __m128i __i = (i); \
+  (__m128i)__builtin_ia32_gatherq_q((__v2di)_mm_setzero_si128(), \
+             (const __v2di *)__m, (__v2di)__i, \
+             (__v2di)_mm_set1_epi64x(-1), (s)); })
+
+#define _mm256_i64gather_epi64(m, i, s) __extension__ ({ \
+  long long const *__m = (m); \
+  __m256i __i = (i); \
+  (__m256i)__builtin_ia32_gatherq_q256((__v4di)_mm256_setzero_si256(), \
+             (const __v4di *)__m, (__v4di)__i, \
+             (__v4di)_mm256_set1_epi64x(-1), (s)); })
+
+#endif /* __AVX2INTRIN_H */
diff --git a/23.0.3/clang-include/avx512bwintrin.h b/23.0.3/clang-include/avx512bwintrin.h
new file mode 100644
index 0000000..acc3da2
--- /dev/null
+++ b/23.0.3/clang-include/avx512bwintrin.h
@@ -0,0 +1,367 @@
+/*===------------- avx512bwintrin.h - AVX512BW intrinsics ------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512bwintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512BWINTRIN_H
+#define __AVX512BWINTRIN_H
+
+typedef unsigned int __mmask32;
+typedef unsigned long long __mmask64;
+typedef char __v64qi __attribute__ ((__vector_size__ (64)));
+typedef short __v32hi __attribute__ ((__vector_size__ (64)));
+
+
+/* Integer compare */
+
+static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmpeq_epi8_mask(__m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_pcmpeqb512_mask((__v64qi)__a, (__v64qi)__b,
+                                                   (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmpeq_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_pcmpeqb512_mask((__v64qi)__a, (__v64qi)__b,
+                                                   __u);
+}
+
+static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmpeq_epu8_mask(__m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 0,
+                                                 (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmpeq_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 0,
+                                                 __u);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmpeq_epi16_mask(__m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_pcmpeqw512_mask((__v32hi)__a, (__v32hi)__b,
+                                                   (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmpeq_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_pcmpeqw512_mask((__v32hi)__a, (__v32hi)__b,
+                                                   __u);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmpeq_epu16_mask(__m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 0,
+                                                 (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmpeq_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 0,
+                                                 __u);
+}
+
+static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmpge_epi8_mask(__m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 5,
+                                                (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmpge_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 5,
+                                                __u);
+}
+
+static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmpge_epu8_mask(__m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 5,
+                                                 (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmpge_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 5,
+                                                 __u);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmpge_epi16_mask(__m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 5,
+                                                (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmpge_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 5,
+                                                __u);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmpge_epu16_mask(__m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 5,
+                                                 (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmpge_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 5,
+                                                 __u);
+}
+
+static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmpgt_epi8_mask(__m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_pcmpgtb512_mask((__v64qi)__a, (__v64qi)__b,
+                                                   (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmpgt_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_pcmpgtb512_mask((__v64qi)__a, (__v64qi)__b,
+                                                   __u);
+}
+
+static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmpgt_epu8_mask(__m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 6,
+                                                 (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmpgt_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 6,
+                                                 __u);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmpgt_epi16_mask(__m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_pcmpgtw512_mask((__v32hi)__a, (__v32hi)__b,
+                                                   (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmpgt_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_pcmpgtw512_mask((__v32hi)__a, (__v32hi)__b,
+                                                   __u);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmpgt_epu16_mask(__m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 6,
+                                                 (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmpgt_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 6,
+                                                 __u);
+}
+
+static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmple_epi8_mask(__m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 2,
+                                                (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmple_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 2,
+                                                __u);
+}
+
+static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmple_epu8_mask(__m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 2,
+                                                 (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmple_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 2,
+                                                 __u);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmple_epi16_mask(__m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 2,
+                                                (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmple_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 2,
+                                                __u);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmple_epu16_mask(__m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 2,
+                                                 (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmple_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 2,
+                                                 __u);
+}
+
+static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmplt_epi8_mask(__m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 1,
+                                                (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmplt_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 1,
+                                                __u);
+}
+
+static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmplt_epu8_mask(__m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 1,
+                                                 (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmplt_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 1,
+                                                 __u);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmplt_epi16_mask(__m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 1,
+                                                (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmplt_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 1,
+                                                __u);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmplt_epu16_mask(__m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 1,
+                                                 (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmplt_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 1,
+                                                 __u);
+}
+
+static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmpneq_epi8_mask(__m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 4,
+                                                (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmpneq_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 4,
+                                                __u);
+}
+
+static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmpneq_epu8_mask(__m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 4,
+                                                 (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmpneq_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 4,
+                                                 __u);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmpneq_epi16_mask(__m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 4,
+                                                (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmpneq_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 4,
+                                                __u);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmpneq_epu16_mask(__m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 4,
+                                                 (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmpneq_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 4,
+                                                 __u);
+}
+
+#define _mm512_cmp_epi8_mask(a, b, p) __extension__ ({ \
+  (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \
+                                         (__v64qi)(__m512i)(b), \
+                                         (p), (__mmask64)-1); })
+
+#define _mm512_mask_cmp_epi8_mask(m, a, b, p) __extension__ ({ \
+  (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \
+                                         (__v64qi)(__m512i)(b), \
+                                         (p), (__mmask64)(m)); })
+
+#define _mm512_cmp_epu8_mask(a, b, p) __extension__ ({ \
+  (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \
+                                          (__v64qi)(__m512i)(b), \
+                                          (p), (__mmask64)-1); })
+
+#define _mm512_mask_cmp_epu8_mask(m, a, b, p) __extension__ ({ \
+  (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \
+                                          (__v64qi)(__m512i)(b), \
+                                          (p), (__mmask64)(m)); })
+
+#define _mm512_cmp_epi16_mask(a, b, p) __extension__ ({ \
+  (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \
+                                         (__v32hi)(__m512i)(b), \
+                                         (p), (__mmask32)-1); })
+
+#define _mm512_mask_cmp_epi16_mask(m, a, b, p) __extension__ ({ \
+  (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \
+                                         (__v32hi)(__m512i)(b), \
+                                         (p), (__mmask32)(m)); })
+
+#define _mm512_cmp_epu16_mask(a, b, p) __extension__ ({ \
+  (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \
+                                          (__v32hi)(__m512i)(b), \
+                                          (p), (__mmask32)-1); })
+
+#define _mm512_mask_cmp_epu16_mask(m, a, b, p) __extension__ ({ \
+  (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \
+                                          (__v32hi)(__m512i)(b), \
+                                          (p), (__mmask32)(m)); })
+
+#endif
diff --git a/23.0.3/clang-include/avx512erintrin.h b/23.0.3/clang-include/avx512erintrin.h
new file mode 100644
index 0000000..57c61aa
--- /dev/null
+++ b/23.0.3/clang-include/avx512erintrin.h
@@ -0,0 +1,286 @@
+/*===---- avx512erintrin.h - AVX512ER intrinsics ------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512erintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512ERINTRIN_H
+#define __AVX512ERINTRIN_H
+
+
+// exp2a23
+#define _mm512_exp2a23_round_pd(A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
+                                      (__v8df)_mm512_setzero_pd(), \
+                                      (__mmask8)-1, (R)); })
+
+#define _mm512_mask_exp2a23_round_pd(S, M, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
+                                      (__v8df)(__m512d)(S), \
+                                      (__mmask8)(M), (R)); })
+
+#define _mm512_maskz_exp2a23_round_pd(M, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
+                                      (__v8df)_mm512_setzero_pd(), \
+                                      (__mmask8)(M), (R)); })
+
+#define _mm512_exp2a23_pd(A) \
+   _mm512_exp2a23_round_pd((A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_exp2a23_pd(S, M, A) \
+   _mm512_mask_exp2a23_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_exp2a23_pd(M, A) \
+   _mm512_maskz_exp2a23_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_exp2a23_round_ps(A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
+                                     (__v16sf)_mm512_setzero_ps(), \
+                                     (__mmask8)-1, (R)); })
+
+#define _mm512_mask_exp2a23_round_ps(S, M, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
+                                     (__v16sf)(__m512)(S), \
+                                     (__mmask8)(M), (R)); })
+
+#define _mm512_maskz_exp2a23_round_ps(M, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
+                                     (__v16sf)_mm512_setzero_ps(), \
+                                     (__mmask8)(M), (R)); })
+
+#define _mm512_exp2a23_ps(A) \
+   _mm512_exp2a23_round_ps((A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_exp2a23_ps(S, M, A) \
+   _mm512_mask_exp2a23_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_exp2a23_ps(M, A) \
+   _mm512_maskz_exp2a23_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
+
+// rsqrt28
+#define _mm512_rsqrt28_round_pd(A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
+                                         (__v8df)_mm512_setzero_pd(), \
+                                         (__mmask8)-1, (R)); })
+
+#define _mm512_mask_rsqrt28_round_pd(S, M, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
+                                         (__v8df)(__m512d)(S), \
+                                         (__mmask8)(M), (R)); })
+
+#define _mm512_maskz_rsqrt28_round_pd(M, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
+                                         (__v8df)_mm512_setzero_pd(), \
+                                         (__mmask8)(M), (R)); })
+
+#define _mm512_rsqrt28_pd(A) \
+  _mm512_rsqrt28_round_pd((A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_rsqrt28_pd(S, M, A) \
+  _mm512_mask_rsqrt28_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_rsqrt28_pd(M, A) \
+  _mm512_maskz_rsqrt28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_rsqrt28_round_ps(A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
+                                        (__v16sf)_mm512_setzero_ps(), \
+                                        (__mmask16)-1, (R)); })
+
+#define _mm512_mask_rsqrt28_round_ps(S, M, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
+                                        (__v16sf)(__m512)(S), \
+                                        (__mmask16)(M), (R)); })
+
+#define _mm512_maskz_rsqrt28_round_ps(M, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
+                                        (__v16sf)_mm512_setzero_ps(), \
+                                        (__mmask16)(M), (R)); })
+
+#define _mm512_rsqrt28_ps(A) \
+  _mm512_rsqrt28_round_ps((A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_rsqrt28_ps(S, M, A) \
+  _mm512_mask_rsqrt28_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_rsqrt28_ps(M, A) \
+  _mm512_maskz_rsqrt28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_rsqrt28_round_ss(A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_rsqrt28ss_mask((__v4sf)(__m128)(A), \
+                                        (__v4sf)(__m128)(B), \
+                                        (__v4sf)_mm_setzero_ps(), \
+                                        (__mmask8)-1, (R)); })
+
+#define _mm_mask_rsqrt28_round_ss(S, M, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_rsqrt28ss_mask((__v4sf)(__m128)(A), \
+                                        (__v4sf)(__m128)(B), \
+                                        (__v4sf)(__m128)(S), \
+                                        (__mmask8)(M), (R)); })
+
+#define _mm_maskz_rsqrt28_round_ss(M, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_rsqrt28ss_mask((__v4sf)(__m128)(A), \
+                                        (__v4sf)(__m128)(B), \
+                                        (__v4sf)_mm_setzero_ps(), \
+                                        (__mmask8)(M), (R)); })
+
+#define _mm_rsqrt28_ss(A, B) \
+  _mm_rsqrt28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_mask_rsqrt28_ss(S, M, A, B) \
+  _mm_mask_rsqrt28_round_ss((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_rsqrt28_ss(M, A, B) \
+  _mm_maskz_rsqrt28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_rsqrt28_round_sd(A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_rsqrt28sd_mask((__v2df)(__m128d)(A), \
+                                         (__v2df)(__m128d)(B), \
+                                         (__v2df)_mm_setzero_pd(), \
+                                         (__mmask8)-1, (R)); })
+
+#define _mm_mask_rsqrt28_round_sd(S, M, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_rsqrt28sd_mask((__v2df)(__m128d)(A), \
+                                         (__v2df)(__m128d)(B), \
+                                         (__v2df)(__m128d)(S), \
+                                         (__mmask8)(M), (R)); })
+
+#define _mm_maskz_rsqrt28_round_sd(M, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_rsqrt28sd_mask((__v2df)(__m128d)(A), \
+                                         (__v2df)(__m128d)(B), \
+                                         (__v2df)_mm_setzero_pd(), \
+                                         (__mmask8)(M), (R)); })
+
+#define _mm_rsqrt28_sd(A, B) \
+  _mm_rsqrt28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_mask_rsqrt28_sd(S, M, A, B) \
+  _mm_mask_rsqrt28_round_sd((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_rsqrt28_sd(M, A, B) \
+  _mm_maskz_rsqrt28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+// rcp28
+#define _mm512_rcp28_round_pd(A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
+                                       (__v8df)_mm512_setzero_pd(), \
+                                       (__mmask8)-1, (R)); })
+
+#define _mm512_mask_rcp28_round_pd(S, M, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
+                                       (__v8df)(__m512d)(S), \
+                                       (__mmask8)(M), (R)); })
+
+#define _mm512_maskz_rcp28_round_pd(M, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
+                                       (__v8df)_mm512_setzero_pd(), \
+                                       (__mmask8)(M), (R)); })
+
+#define _mm512_rcp28_pd(A) \
+  _mm512_rcp28_round_pd((A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_rcp28_pd(S, M, A) \
+  _mm512_mask_rcp28_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_rcp28_pd(M, A) \
+  _mm512_maskz_rcp28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_rcp28_round_ps(A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
+                                      (__v16sf)_mm512_setzero_ps(), \
+                                      (__mmask16)-1, (R)); })
+
+#define _mm512_mask_rcp28_round_ps(S, M, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
+                                      (__v16sf)(__m512)(S), \
+                                      (__mmask16)(M), (R)); })
+
+#define _mm512_maskz_rcp28_round_ps(M, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
+                                      (__v16sf)_mm512_setzero_ps(), \
+                                      (__mmask16)(M), (R)); })
+
+#define _mm512_rcp28_ps(A) \
+  _mm512_rcp28_round_ps((A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_rcp28_ps(S, M, A) \
+  _mm512_mask_rcp28_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_rcp28_ps(M, A) \
+  _mm512_maskz_rcp28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_rcp28_round_ss(A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_rcp28ss_mask((__v4sf)(__m128)(A), \
+                                      (__v4sf)(__m128)(B), \
+                                      (__v4sf)_mm_setzero_ps(), \
+                                      (__mmask8)-1, (R)); })
+
+#define _mm_mask_rcp28_round_ss(S, M, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_rcp28ss_mask((__v4sf)(__m128)(A), \
+                                      (__v4sf)(__m128)(B), \
+                                      (__v4sf)(__m128)(S), \
+                                      (__mmask8)(M), (R)); })
+
+#define _mm_maskz_rcp28_round_ss(M, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_rcp28ss_mask((__v4sf)(__m128)(A), \
+                                      (__v4sf)(__m128)(B), \
+                                      (__v4sf)_mm_setzero_ps(), \
+                                      (__mmask8)(M), (R)); })
+
+#define _mm_rcp28_ss(A, B) \
+  _mm_rcp28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_mask_rcp28_ss(S, M, A, B) \
+  _mm_mask_rcp28_round_ss((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_rcp28_ss(M, A, B) \
+  _mm_maskz_rcp28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_rcp28_round_sd(A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_rcp28sd_mask((__v2df)(__m128d)(A), \
+                                       (__v2df)(__m128d)(B), \
+                                       (__v2df)_mm_setzero_pd(), \
+                                       (__mmask8)-1, (R)); })
+
+#define _mm_mask_rcp28_round_sd(S, M, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_rcp28sd_mask((__v2df)(__m128d)(A), \
+                                       (__v2df)(__m128d)(B), \
+                                       (__v2df)(__m128d)(S), \
+                                       (__mmask8)(M), (R)); })
+
+#define _mm_maskz_rcp28_round_sd(M, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_rcp28sd_mask((__v2df)(__m128d)(A), \
+                                       (__v2df)(__m128d)(B), \
+                                       (__v2df)_mm_setzero_pd(), \
+                                       (__mmask8)(M), (R)); })
+
+#define _mm_rcp28_sd(A, B) \
+  _mm_rcp28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_mask_rcp28_sd(S, M, A, B) \
+  _mm_mask_rcp28_round_sd((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_rcp28_sd(M, A, B) \
+  _mm_maskz_rcp28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
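+
+/* Illustrative note, not part of the original header: the exp2a23, rsqrt28 and
+ * rcp28 intrinsics above are AVX-512ER approximations; rsqrt28/rcp28 have a
+ * maximum relative error below 2^-28 and exp2a23 below 2^-23.  For example,
+ *   __m512d x = _mm512_rsqrt28_pd(v);
+ * approximates 1.0/sqrt(v) per lane, assuming v is an __m512d value. */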
+
+#endif // __AVX512ERINTRIN_H
diff --git a/23.0.3/clang-include/avx512fintrin.h b/23.0.3/clang-include/avx512fintrin.h
new file mode 100644
index 0000000..72af281
--- /dev/null
+++ b/23.0.3/clang-include/avx512fintrin.h
@@ -0,0 +1,1531 @@
+/*===---- avx512fintrin.h - AVX512F intrinsics ------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512fintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512FINTRIN_H
+#define __AVX512FINTRIN_H
+
+typedef double __v8df __attribute__((__vector_size__(64)));
+typedef float __v16sf __attribute__((__vector_size__(64)));
+typedef long long __v8di __attribute__((__vector_size__(64)));
+typedef int __v16si __attribute__((__vector_size__(64)));
+
+typedef float __m512 __attribute__((__vector_size__(64)));
+typedef double __m512d __attribute__((__vector_size__(64)));
+typedef long long __m512i __attribute__((__vector_size__(64)));
+
+typedef unsigned char __mmask8;
+typedef unsigned short __mmask16;
+
+/* Rounding mode macros.  */
+#define _MM_FROUND_TO_NEAREST_INT   0x00
+#define _MM_FROUND_TO_NEG_INF       0x01
+#define _MM_FROUND_TO_POS_INF       0x02
+#define _MM_FROUND_TO_ZERO          0x03
+#define _MM_FROUND_CUR_DIRECTION    0x04
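+
+/* Illustrative note, not part of the original header: the *_round_* intrinsics
+ * in this file take one of the macros above as their rounding argument, e.g.
+ *   __m512 r = _mm512_cvt_roundepi32_ps(v, _MM_FROUND_TO_NEAREST_INT);
+ * while the corresponding non-round forms forward _MM_FROUND_CUR_DIRECTION. */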
+
+/* Create vectors with repeated elements */
+
+static  __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
+_mm512_setzero_si512(void)
+{
+  return (__m512i)(__v8di){ 0, 0, 0, 0, 0, 0, 0, 0 };
+}
+
+static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
+_mm512_maskz_set1_epi32(__mmask16 __M, int __A)
+{
+  return (__m512i) __builtin_ia32_pbroadcastd512_gpr_mask (__A,
+                 (__v16si)
+                 _mm512_setzero_si512 (),
+                 __M);
+}
+
+static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
+_mm512_maskz_set1_epi64(__mmask8 __M, long long __A)
+{
+#ifdef __x86_64__
+  return (__m512i) __builtin_ia32_pbroadcastq512_gpr_mask (__A,
+                 (__v8di)
+                 _mm512_setzero_si512 (),
+                 __M);
+#else
+  return (__m512i) __builtin_ia32_pbroadcastq512_mem_mask (__A,
+                 (__v8di)
+                 _mm512_setzero_si512 (),
+                 __M);
+#endif
+}
+
+static __inline __m512 __attribute__ ((__always_inline__, __nodebug__))
+_mm512_setzero_ps(void)
+{
+  return (__m512){ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+                   0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
+}
+static  __inline __m512d __attribute__ ((__always_inline__, __nodebug__))
+_mm512_setzero_pd(void)
+{
+  return (__m512d){ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
+}
+
+static __inline __m512 __attribute__((__always_inline__, __nodebug__))
+_mm512_set1_ps(float __w)
+{
+  return (__m512){ __w, __w, __w, __w, __w, __w, __w, __w,
+                   __w, __w, __w, __w, __w, __w, __w, __w  };
+}
+
+static __inline __m512d __attribute__((__always_inline__, __nodebug__))
+_mm512_set1_pd(double __w)
+{
+  return (__m512d){ __w, __w, __w, __w, __w, __w, __w, __w };
+}
+
+static __inline __m512i __attribute__((__always_inline__, __nodebug__))
+_mm512_set1_epi32(int __s)
+{
+  return (__m512i)(__v16si){ __s, __s, __s, __s, __s, __s, __s, __s,
+                             __s, __s, __s, __s, __s, __s, __s, __s };
+}
+
+static __inline __m512i __attribute__((__always_inline__, __nodebug__))
+_mm512_set1_epi64(long long __d)
+{
+  return (__m512i)(__v8di){ __d, __d, __d, __d, __d, __d, __d, __d };
+}
+
+static __inline__ __m512 __attribute__((__always_inline__, __nodebug__))
+_mm512_broadcastss_ps(__m128 __X)
+{
+  float __f = __X[0];
+  return (__v16sf){ __f, __f, __f, __f,
+                    __f, __f, __f, __f,
+                    __f, __f, __f, __f,
+                    __f, __f, __f, __f };
+}
+
+static __inline__ __m512d __attribute__((__always_inline__, __nodebug__))
+_mm512_broadcastsd_pd(__m128d __X)
+{
+  double __d = __X[0];
+  return (__v8df){ __d, __d, __d, __d,
+                   __d, __d, __d, __d };
+}
+
+/* Cast between vector types */
+
+static __inline __m512d __attribute__((__always_inline__, __nodebug__))
+_mm512_castpd256_pd512(__m256d __a)
+{
+  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, -1, -1, -1, -1);
+}
+
+static __inline __m512 __attribute__((__always_inline__, __nodebug__))
+_mm512_castps256_ps512(__m256 __a)
+{
+  return __builtin_shufflevector(__a, __a, 0,  1,  2,  3,  4,  5,  6,  7,
+                                          -1, -1, -1, -1, -1, -1, -1, -1);
+}
+
+static __inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm512_castpd512_pd128(__m512d __a)
+{
+  return __builtin_shufflevector(__a, __a, 0, 1);
+}
+
+static __inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm512_castps512_ps128(__m512 __a)
+{
+  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3);
+}
+
+/* Bitwise operators */
+static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+_mm512_and_epi32(__m512i __a, __m512i __b)
+{
+  return __a & __b;
+}
+
+static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_and_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b)
+{
+  return (__m512i) __builtin_ia32_pandd512_mask((__v16si) __a,
+              (__v16si) __b,
+              (__v16si) __src,
+              (__mmask16) __k);
+}
+static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+_mm512_maskz_and_epi32(__mmask16 __k, __m512i __a, __m512i __b)
+{
+  return (__m512i) __builtin_ia32_pandd512_mask((__v16si) __a,
+              (__v16si) __b,
+              (__v16si)
+              _mm512_setzero_si512 (),
+              (__mmask16) __k);
+}
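+
+/* Illustrative note, not part of the original header: in the _mm512_mask_*
+ * forms, result lanes whose mask bit is 1 are computed from __a and __b and
+ * the rest are copied from __src; in the _mm512_maskz_* forms the unselected
+ * lanes are zeroed.  For example, assuming v1 and v2 are __m512i values,
+ *   __m512i r = _mm512_maskz_and_epi32((__mmask16)0x00FF, v1, v2);
+ * ANDs the low eight 32-bit lanes and zeroes the high eight. */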
+
+static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+_mm512_and_epi64(__m512i __a, __m512i __b)
+{
+  return __a & __b;
+}
+
+static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_and_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b)
+{
+  return (__m512i) __builtin_ia32_pandq512_mask ((__v8di) __a,
+              (__v8di) __b,
+              (__v8di) __src,
+              (__mmask8) __k);
+}
+static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+_mm512_maskz_and_epi64(__mmask8 __k, __m512i __a, __m512i __b)
+{
+  return (__m512i) __builtin_ia32_pandq512_mask ((__v8di) __a,
+              (__v8di) __b,
+              (__v8di)
+              _mm512_setzero_si512 (),
+              (__mmask8) __k);
+}
+
+static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+_mm512_or_epi32(__m512i __a, __m512i __b)
+{
+  return __a | __b;
+}
+
+static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_or_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b)
+{
+  return (__m512i) __builtin_ia32_pord512_mask((__v16si) __a,
+              (__v16si) __b,
+              (__v16si) __src,
+              (__mmask16) __k);
+}
+static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+_mm512_maskz_or_epi32(__mmask16 __k, __m512i __a, __m512i __b)
+{
+  return (__m512i) __builtin_ia32_pord512_mask((__v16si) __a,
+              (__v16si) __b,
+              (__v16si)
+              _mm512_setzero_si512 (),
+              (__mmask16) __k);
+}
+
+static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+_mm512_or_epi64(__m512i __a, __m512i __b)
+{
+  return __a | __b;
+}
+
+static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_or_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b)
+{
+  return (__m512i) __builtin_ia32_porq512_mask ((__v8di) __a,
+              (__v8di) __b,
+              (__v8di) __src,
+              (__mmask8) __k);
+}
+static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+_mm512_maskz_or_epi64(__mmask8 __k, __m512i __a, __m512i __b)
+{
+  return (__m512i) __builtin_ia32_porq512_mask ((__v8di) __a,
+              (__v8di) __b,
+              (__v8di)
+              _mm512_setzero_si512 (),
+              (__mmask8) __k);
+}
+
+static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+_mm512_xor_epi32(__m512i __a, __m512i __b)
+{
+  return __a ^ __b;
+}
+
+static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_xor_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b)
+{
+  return (__m512i) __builtin_ia32_pxord512_mask((__v16si) __a,
+              (__v16si) __b,
+              (__v16si) __src,
+              (__mmask16) __k);
+}
+static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+_mm512_maskz_xor_epi32(__mmask16 __k, __m512i __a, __m512i __b)
+{
+  return (__m512i) __builtin_ia32_pxord512_mask((__v16si) __a,
+              (__v16si) __b,
+              (__v16si)
+              _mm512_setzero_si512 (),
+              (__mmask16) __k);
+}
+
+static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+_mm512_xor_epi64(__m512i __a, __m512i __b)
+{
+  return __a ^ __b;
+}
+
+static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_xor_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b)
+{
+  return (__m512i) __builtin_ia32_pxorq512_mask ((__v8di) __a,
+              (__v8di) __b,
+              (__v8di) __src,
+              (__mmask8) __k);
+}
+static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+_mm512_maskz_xor_epi64(__mmask8 __k, __m512i __a, __m512i __b)
+{
+  return (__m512i) __builtin_ia32_pxorq512_mask ((__v8di) __a,
+              (__v8di) __b,
+              (__v8di)
+              _mm512_setzero_si512 (),
+              (__mmask8) __k);
+}
+
+static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+_mm512_and_si512(__m512i __a, __m512i __b)
+{
+  return __a & __b;
+}
+
+static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+_mm512_or_si512(__m512i __a, __m512i __b)
+{
+  return __a | __b;
+}
+
+static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
+_mm512_xor_si512(__m512i __a, __m512i __b)
+{
+  return __a ^ __b;
+}
+/* Arithmetic */
+
+static __inline __m512d __attribute__((__always_inline__, __nodebug__))
+_mm512_add_pd(__m512d __a, __m512d __b)
+{
+  return __a + __b;
+}
+
+static __inline __m512 __attribute__((__always_inline__, __nodebug__))
+_mm512_add_ps(__m512 __a, __m512 __b)
+{
+  return __a + __b;
+}
+
+static __inline __m512d __attribute__((__always_inline__, __nodebug__))
+_mm512_mul_pd(__m512d __a, __m512d __b)
+{
+  return __a * __b;
+}
+
+static __inline __m512 __attribute__((__always_inline__, __nodebug__))
+_mm512_mul_ps(__m512 __a, __m512 __b)
+{
+  return __a * __b;
+}
+
+static __inline __m512d __attribute__((__always_inline__, __nodebug__))
+_mm512_sub_pd(__m512d __a, __m512d __b)
+{
+  return __a - __b;
+}
+
+static __inline __m512 __attribute__((__always_inline__, __nodebug__))
+_mm512_sub_ps(__m512 __a, __m512 __b)
+{
+  return __a - __b;
+}
+
+static  __inline__ __m512d __attribute__((__always_inline__, __nodebug__))
+_mm512_max_pd(__m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A,
+             (__v8df) __B,
+             (__v8df)
+             _mm512_setzero_pd (),
+             (__mmask8) -1,
+             _MM_FROUND_CUR_DIRECTION);
+}
+
+static  __inline__ __m512 __attribute__((__always_inline__, __nodebug__))
+_mm512_max_ps(__m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A,
+            (__v16sf) __B,
+            (__v16sf)
+            _mm512_setzero_ps (),
+            (__mmask16) -1,
+            _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline __m512i
+__attribute__ ((__always_inline__, __nodebug__))
+_mm512_max_epi32(__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsd512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v16si)
+              _mm512_setzero_si512 (),
+              (__mmask16) -1);
+}
+
+static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
+_mm512_max_epu32(__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxud512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v16si)
+              _mm512_setzero_si512 (),
+              (__mmask16) -1);
+}
+
+static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
+_mm512_max_epi64(__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsq512_mask ((__v8di) __A,
+              (__v8di) __B,
+              (__v8di)
+              _mm512_setzero_si512 (),
+              (__mmask8) -1);
+}
+
+static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
+_mm512_max_epu64(__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxuq512_mask ((__v8di) __A,
+              (__v8di) __B,
+              (__v8di)
+              _mm512_setzero_si512 (),
+              (__mmask8) -1);
+}
+
+static  __inline__ __m512d __attribute__((__always_inline__, __nodebug__))
+_mm512_min_pd(__m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A,
+             (__v8df) __B,
+             (__v8df)
+             _mm512_setzero_pd (),
+             (__mmask8) -1,
+             _MM_FROUND_CUR_DIRECTION);
+}
+
+static  __inline__ __m512 __attribute__((__always_inline__, __nodebug__))
+_mm512_min_ps(__m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A,
+            (__v16sf) __B,
+            (__v16sf)
+            _mm512_setzero_ps (),
+            (__mmask16) -1,
+            _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline __m512i
+__attribute__ ((__always_inline__, __nodebug__))
+_mm512_min_epi32(__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsd512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v16si)
+              _mm512_setzero_si512 (),
+              (__mmask16) -1);
+}
+
+static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
+_mm512_min_epu32(__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminud512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v16si)
+              _mm512_setzero_si512 (),
+              (__mmask16) -1);
+}
+
+static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
+_mm512_min_epi64(__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsq512_mask ((__v8di) __A,
+              (__v8di) __B,
+              (__v8di)
+              _mm512_setzero_si512 (),
+              (__mmask8) -1);
+}
+
+static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
+_mm512_min_epu64(__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminuq512_mask ((__v8di) __A,
+              (__v8di) __B,
+              (__v8di)
+              _mm512_setzero_si512 (),
+              (__mmask8) -1);
+}
+
+static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
+_mm512_mul_epi32(__m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_pmuldq512_mask ((__v16si) __X,
+              (__v16si) __Y,
+              (__v8di)
+              _mm512_setzero_si512 (),
+              (__mmask8) -1);
+}
+
+static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
+_mm512_mul_epu32(__m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_pmuludq512_mask ((__v16si) __X,
+               (__v16si) __Y,
+               (__v8di)
+               _mm512_setzero_si512 (),
+               (__mmask8) -1);
+}
+
+static  __inline__ __m512d __attribute__((__always_inline__, __nodebug__))
+_mm512_sqrt_pd(__m512d a)
+{
+  return (__m512d)__builtin_ia32_sqrtpd512_mask((__v8df)a,
+                                                (__v8df) _mm512_setzero_pd (),
+                                                (__mmask8) -1,
+                                                _MM_FROUND_CUR_DIRECTION);
+}
+
+static  __inline__ __m512 __attribute__((__always_inline__, __nodebug__))
+_mm512_sqrt_ps(__m512 a)
+{
+  return (__m512)__builtin_ia32_sqrtps512_mask((__v16sf)a,
+                                               (__v16sf) _mm512_setzero_ps (),
+                                               (__mmask16) -1,
+                                               _MM_FROUND_CUR_DIRECTION);
+}
+
+static  __inline__ __m512d __attribute__((__always_inline__, __nodebug__))
+_mm512_rsqrt14_pd(__m512d __A)
+{
+  return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A,
+                 (__v8df)
+                 _mm512_setzero_pd (),
+                 (__mmask8) -1);
+}
+
+static  __inline__ __m512 __attribute__((__always_inline__, __nodebug__))
+_mm512_rsqrt14_ps(__m512 __A)
+{
+  return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A,
+                (__v16sf)
+                _mm512_setzero_ps (),
+                (__mmask16) -1);
+}
+
+static  __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_rsqrt14_ss(__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A,
+             (__v4sf) __B,
+             (__v4sf)
+             _mm_setzero_ps (),
+             (__mmask8) -1);
+}
+
+static  __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_rsqrt14_sd(__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_rsqrt14sd_mask ((__v2df) __A,
+              (__v2df) __B,
+              (__v2df)
+              _mm_setzero_pd (),
+              (__mmask8) -1);
+}
+
+static  __inline__ __m512d __attribute__((__always_inline__, __nodebug__))
+_mm512_rcp14_pd(__m512d __A)
+{
+  return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A,
+               (__v8df)
+               _mm512_setzero_pd (),
+               (__mmask8) -1);
+}
+
+static  __inline__ __m512 __attribute__((__always_inline__, __nodebug__))
+_mm512_rcp14_ps(__m512 __A)
+{
+  return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A,
+              (__v16sf)
+              _mm512_setzero_ps (),
+              (__mmask16) -1);
+}
+static  __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_rcp14_ss(__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A,
+                 (__v4sf) __B,
+                 (__v4sf)
+                 _mm_setzero_ps (),
+                 (__mmask8) -1);
+}
+
+static  __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_rcp14_sd(__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_rcp14sd_mask ((__v2df) __A,
+            (__v2df) __B,
+            (__v2df)
+            _mm_setzero_pd (),
+            (__mmask8) -1);
+}
+
+static __inline __m512 __attribute__ ((__always_inline__, __nodebug__))
+_mm512_floor_ps(__m512 __A)
+{
+  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
+                                                  _MM_FROUND_FLOOR,
+                                                  (__v16sf) __A, -1,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline __m512d __attribute__ ((__always_inline__, __nodebug__))
+_mm512_floor_pd(__m512d __A)
+{
+  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
+                                                   _MM_FROUND_FLOOR,
+                                                   (__v8df) __A, -1,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline __m512 __attribute__ ((__always_inline__, __nodebug__))
+_mm512_ceil_ps(__m512 __A)
+{
+  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
+                                                  _MM_FROUND_CEIL,
+                                                  (__v16sf) __A, -1,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline __m512d __attribute__ ((__always_inline__, __nodebug__))
+_mm512_ceil_pd(__m512d __A)
+{
+  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
+                                                   _MM_FROUND_CEIL,
+                                                   (__v8df) __A, -1,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline __m512i __attribute__ (( __always_inline__, __nodebug__))
+_mm512_abs_epi64(__m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsq512_mask ((__v8di) __A,
+             (__v8di)
+             _mm512_setzero_si512 (),
+             (__mmask8) -1);
+}
+
+static __inline __m512i __attribute__ (( __always_inline__, __nodebug__))
+_mm512_abs_epi32(__m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsd512_mask ((__v16si) __A,
+             (__v16si)
+             _mm512_setzero_si512 (),
+             (__mmask16) -1);
+}
+
+#define _mm512_roundscale_ps(A, B) __extension__ ({ \
+  (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(A), (B), (__v16sf)(A), \
+                                         -1, _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_roundscale_pd(A, B) __extension__ ({ \
+  (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(A), (B), (__v8df)(A), \
+                                          -1, _MM_FROUND_CUR_DIRECTION); })
+
+static __inline__ __m512d __attribute__((__always_inline__, __nodebug__))
+_mm512_fmadd_pd(__m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d)
+    __builtin_ia32_vfmaddpd512_mask(__A,
+                                    __B,
+                                    __C,
+                                    (__mmask8) -1,
+                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __attribute__((__always_inline__, __nodebug__))
+_mm512_fmsub_pd(__m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d)
+    __builtin_ia32_vfmsubpd512_mask(__A,
+                                    __B,
+                                    __C,
+                                    (__mmask8) -1,
+                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __attribute__((__always_inline__, __nodebug__))
+_mm512_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d)
+    __builtin_ia32_vfnmaddpd512_mask(__A,
+                                     __B,
+                                     __C,
+                                     (__mmask8) -1,
+                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __attribute__((__always_inline__, __nodebug__))
+_mm512_fmadd_ps(__m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512)
+    __builtin_ia32_vfmaddps512_mask(__A,
+                                    __B,
+                                    __C,
+                                    (__mmask16) -1,
+                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __attribute__((__always_inline__, __nodebug__))
+_mm512_fmsub_ps(__m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512)
+    __builtin_ia32_vfmsubps512_mask(__A,
+                                    __B,
+                                    __C,
+                                    (__mmask16) -1,
+                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __attribute__((__always_inline__, __nodebug__))
+_mm512_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512)
+    __builtin_ia32_vfnmaddps512_mask(__A,
+                                     __B,
+                                     __C,
+                                     (__mmask16) -1,
+                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+/* Vector permutations */
+
+static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
+_mm512_permutex2var_epi32(__m512i __A, __m512i __I, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermt2vard512_mask ((__v16si) __I
+                                                       /* idx */ ,
+                                                       (__v16si) __A,
+                                                       (__v16si) __B,
+                                                       (__mmask16) -1);
+}
+static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
+_mm512_permutex2var_epi64(__m512i __A, __m512i __I, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermt2varq512_mask ((__v8di) __I
+                                                       /* idx */ ,
+                                                       (__v8di) __A,
+                                                       (__v8di) __B,
+                                                       (__mmask8) -1);
+}
+
+static __inline __m512d __attribute__ ((__always_inline__, __nodebug__))
+_mm512_permutex2var_pd(__m512d __A, __m512i __I, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_vpermt2varpd512_mask ((__v8di) __I
+                                                        /* idx */ ,
+                                                        (__v8df) __A,
+                                                        (__v8df) __B,
+                                                        (__mmask8) -1);
+}
+static __inline __m512 __attribute__ ((__always_inline__, __nodebug__))
+_mm512_permutex2var_ps(__m512 __A, __m512i __I, __m512 __B)
+{
+  return (__m512) __builtin_ia32_vpermt2varps512_mask ((__v16si) __I
+                                                       /* idx */ ,
+                                                       (__v16sf) __A,
+                                                       (__v16sf) __B,
+                                                       (__mmask16) -1);
+}
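+
+/* Illustrative note, not part of the original header: the permutex2var
+ * intrinsics build each destination lane from the concatenation of __A and
+ * __B, using the matching lane of the index vector __I (the bit above the
+ * lane-index bits selects __B).  For example, assuming a, idx and b are
+ * __m512i values,
+ *   __m512i r = _mm512_permutex2var_epi32(a, idx, b);
+ * gathers 32-bit lanes from {a, b} according to idx. */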
+
+#define _mm512_alignr_epi64(A, B, I) __extension__ ({ \
+  (__m512i)__builtin_ia32_alignq512_mask((__v8di)(__m512i)(A), \
+                                         (__v8di)(__m512i)(B), \
+                                         (I), (__v8di)_mm512_setzero_si512(), \
+                                         (__mmask8)-1); })
+
+#define _mm512_alignr_epi32(A, B, I) __extension__ ({ \
+  (__m512i)__builtin_ia32_alignd512_mask((__v16si)(__m512i)(A), \
+                                         (__v16si)(__m512i)(B), \
+                                         (I), (__v16si)_mm512_setzero_si512(), \
+                                         (__mmask16)-1); })
+
+/* Vector Extract */
+
+#define _mm512_extractf64x4_pd(A, I) __extension__ ({                    \
+      __m512d __A = (A);                                                 \
+      (__m256d)                                                          \
+        __builtin_ia32_extractf64x4_mask((__v8df)__A,                    \
+                                         (I),                            \
+                                         (__v4df)_mm256_setzero_si256(), \
+                                         (__mmask8) -1); })
+
+#define _mm512_extractf32x4_ps(A, I) __extension__ ({                    \
+      __m512 __A = (A);                                                  \
+      (__m128)                                                           \
+        __builtin_ia32_extractf32x4_mask((__v16sf)__A,                   \
+                                         (I),                            \
+                                         (__v4sf)_mm_setzero_ps(),       \
+                                         (__mmask8) -1); })
+
+/* Vector Blend */
+
+static __inline __m512d __attribute__ ((__always_inline__, __nodebug__))
+_mm512_mask_blend_pd(__mmask8 __U, __m512d __A, __m512d __W)
+{
+  return (__m512d) __builtin_ia32_blendmpd_512_mask ((__v8df) __A,
+                 (__v8df) __W,
+                 (__mmask8) __U);
+}
+
+static __inline __m512 __attribute__ ((__always_inline__, __nodebug__))
+_mm512_mask_blend_ps(__mmask16 __U, __m512 __A, __m512 __W)
+{
+  return (__m512) __builtin_ia32_blendmps_512_mask ((__v16sf) __A,
+                (__v16sf) __W,
+                (__mmask16) __U);
+}
+
+static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
+_mm512_mask_blend_epi64(__mmask8 __U, __m512i __A, __m512i __W)
+{
+  return (__m512i) __builtin_ia32_blendmq_512_mask ((__v8di) __A,
+                (__v8di) __W,
+                (__mmask8) __U);
+}
+
+static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
+_mm512_mask_blend_epi32(__mmask16 __U, __m512i __A, __m512i __W)
+{
+  return (__m512i) __builtin_ia32_blendmd_512_mask ((__v16si) __A,
+                (__v16si) __W,
+                (__mmask16) __U);
+}
+
+/* Compare */
+
+#define _mm512_cmp_round_ps_mask(A, B, P, R) __extension__ ({ \
+  (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)(__m512)(A), \
+                                          (__v16sf)(__m512)(B), \
+                                          (P), (__mmask16)-1, (R)); })
+
+#define _mm512_mask_cmp_round_ps_mask(U, A, B, P, R) __extension__ ({ \
+  (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)(__m512)(A), \
+                                          (__v16sf)(__m512)(B), \
+                                          (P), (__mmask16)(U), (R)); })
+
+#define _mm512_cmp_ps_mask(A, B, P) \
+  _mm512_cmp_round_ps_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_cmp_ps_mask(U, A, B, P) \
+  _mm512_mask_cmp_round_ps_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_cmp_round_pd_mask(A, B, P, R) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)(__m512d)(A), \
+                                         (__v8df)(__m512d)(B), \
+                                         (P), (__mmask8)-1, (R)); })
+
+#define _mm512_mask_cmp_round_pd_mask(U, A, B, P, R) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)(__m512d)(A), \
+                                         (__v8df)(__m512d)(B), \
+                                         (P), (__mmask8)(U), (R)); })
+
+#define _mm512_cmp_pd_mask(A, B, P) \
+  _mm512_cmp_round_pd_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_cmp_pd_mask(U, A, B, P) \
+  _mm512_mask_cmp_round_pd_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)
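+
+/* Illustrative note, not part of the original header: P is one of the _CMP_*
+ * predicates (available via <immintrin.h>, which this header requires), and
+ * each bit of the returned mask reflects one lane.  For example,
+ *   __mmask8 m = _mm512_cmp_pd_mask(a, b, _CMP_LT_OS);
+ * sets bit i when a[i] < b[i] (ordered, signaling), assuming a and b are
+ * __m512d values. */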
+
+/* Conversion */
+
+static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
+_mm512_cvttps_epu32(__m512 __A)
+{
+  return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
+                  (__v16si)
+                  _mm512_setzero_si512 (),
+                  (__mmask16) -1,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundepi32_ps(A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(A), \
+                                          (__v16sf)_mm512_setzero_ps(), \
+                                          (__mmask16)-1, (R)); })
+
+#define _mm512_cvt_roundepu32_ps(A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(A), \
+                                           (__v16sf)_mm512_setzero_ps(), \
+                                           (__mmask16)-1, (R)); })
+
+static __inline __m512d __attribute__ (( __always_inline__, __nodebug__))
+_mm512_cvtepi32_pd(__m256i __A)
+{
+  return (__m512d) __builtin_ia32_cvtdq2pd512_mask ((__v8si) __A,
+                (__v8df)
+                _mm512_setzero_pd (),
+                (__mmask8) -1);
+}
+
+static __inline __m512d __attribute__ (( __always_inline__, __nodebug__))
+_mm512_cvtepu32_pd(__m256i __A)
+{
+  return (__m512d) __builtin_ia32_cvtudq2pd512_mask ((__v8si) __A,
+                (__v8df)
+                _mm512_setzero_pd (),
+                (__mmask8) -1);
+}
+
+#define _mm512_cvt_roundpd_ps(A, R) __extension__ ({ \
+  (__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(A), \
+                                          (__v8sf)_mm256_setzero_ps(), \
+                                          (__mmask8)-1, (R)); })
+
+#define _mm512_cvtps_ph(A, I) __extension__ ({ \
+  (__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(A), (I), \
+                                            (__v16hi)_mm256_setzero_si256(), \
+                                            -1); })
+
+static  __inline __m512 __attribute__ ((__always_inline__, __nodebug__))
+_mm512_cvtph_ps(__m256i __A)
+{
+  return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
+                (__v16sf)
+                _mm512_setzero_ps (),
+                (__mmask16) -1,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline __m512i __attribute__((__always_inline__, __nodebug__))
+_mm512_cvttps_epi32(__m512 a)
+{
+  return (__m512i)
+    __builtin_ia32_cvttps2dq512_mask((__v16sf) a,
+                                     (__v16si) _mm512_setzero_si512 (),
+                                     (__mmask16) -1, _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm512_cvttpd_epi32(__m512d a)
+{
+  return (__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df) a,
+                                                   (__v8si)_mm256_setzero_si256(),
+                                                   (__mmask8) -1,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvtt_roundpd_epi32(A, R) __extension__ ({ \
+  (__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(A), \
+                                            (__v8si)_mm256_setzero_si256(), \
+                                            (__mmask8)-1, (R)); })
+
+#define _mm512_cvtt_roundps_epi32(A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(A), \
+                                            (__v16si)_mm512_setzero_si512(), \
+                                            (__mmask16)-1, (R)); })
+
+#define _mm512_cvt_roundps_epi32(A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(A), \
+                                           (__v16si)_mm512_setzero_si512(), \
+                                           (__mmask16)-1, (R)); })
+
+#define _mm512_cvt_roundpd_epi32(A, R) __extension__ ({ \
+  (__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(A), \
+                                           (__v8si)_mm256_setzero_si256(), \
+                                           (__mmask8)-1, (R)); })
+
+#define _mm512_cvt_roundps_epu32(A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(A), \
+                                            (__v16si)_mm512_setzero_si512(), \
+                                            (__mmask16)-1, (R)); })
+
+#define _mm512_cvt_roundpd_epu32(A, R) __extension__ ({ \
+  (__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(A), \
+                                            (__v8si)_mm256_setzero_si256(), \
+                                            (__mmask8) -1, (R)); })
+
+/* Unpack and Interleave */
+static __inline __m512d __attribute__((__always_inline__, __nodebug__))
+_mm512_unpackhi_pd(__m512d __a, __m512d __b)
+{
+  return __builtin_shufflevector(__a, __b, 1, 9, 1+2, 9+2, 1+4, 9+4, 1+6, 9+6);
+}
+
+static __inline __m512d __attribute__((__always_inline__, __nodebug__))
+_mm512_unpacklo_pd(__m512d __a, __m512d __b)
+{
+  return __builtin_shufflevector(__a, __b, 0, 8, 0+2, 8+2, 0+4, 8+4, 0+6, 8+6);
+}
+
+static __inline __m512 __attribute__((__always_inline__, __nodebug__))
+_mm512_unpackhi_ps(__m512 __a, __m512 __b)
+{
+  return __builtin_shufflevector(__a, __b,
+                                 2,    18,    3,    19,
+                                 2+4,  18+4,  3+4,  19+4,
+                                 2+8,  18+8,  3+8,  19+8,
+                                 2+12, 18+12, 3+12, 19+12);
+}
+
+static __inline __m512 __attribute__((__always_inline__, __nodebug__))
+_mm512_unpacklo_ps(__m512 __a, __m512 __b)
+{
+  return __builtin_shufflevector(__a, __b,
+                                 0,    16,    1,    17,
+                                 0+4,  16+4,  1+4,  17+4,
+                                 0+8,  16+8,  1+8,  17+8,
+                                 0+12, 16+12, 1+12, 17+12);
+}
+
+/* Bit Test */
+
+static __inline __mmask16 __attribute__ ((__always_inline__, __nodebug__))
+_mm512_test_epi32_mask(__m512i __A, __m512i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestmd512 ((__v16si) __A,
+            (__v16si) __B,
+            (__mmask16) -1);
+}
+
+static __inline __mmask8 __attribute__ ((__always_inline__, __nodebug__))
+_mm512_test_epi64_mask(__m512i __A, __m512i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestmq512 ((__v8di) __A,
+                 (__v8di) __B,
+                 (__mmask8) -1);
+}
+
+/* SIMD load ops */
+
+static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
+_mm512_maskz_loadu_epi32(__mmask16 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_loaddqusi512_mask ((const __v16si *)__P,
+                                                     (__v16si)
+                                                     _mm512_setzero_si512 (),
+                                                     (__mmask16) __U);
+}
+
+static __inline __m512i __attribute__ ((__always_inline__, __nodebug__))
+_mm512_maskz_loadu_epi64(__mmask8 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_loaddqudi512_mask ((const __v8di *)__P,
+                                                     (__v8di)
+                                                     _mm512_setzero_si512 (),
+                                                     (__mmask8) __U);
+}
+
+static __inline __m512 __attribute__ ((__always_inline__, __nodebug__))
+_mm512_maskz_loadu_ps(__mmask16 __U, void const *__P)
+{
+  return (__m512) __builtin_ia32_loadups512_mask ((const __v16sf *)__P,
+                                                  (__v16sf)
+                                                  _mm512_setzero_ps (),
+                                                  (__mmask16) __U);
+}
+
+static __inline __m512d __attribute__ ((__always_inline__, __nodebug__))
+_mm512_maskz_loadu_pd(__mmask8 __U, void const *__P)
+{
+  return (__m512d) __builtin_ia32_loadupd512_mask ((const __v8df *)__P,
+                                                   (__v8df)
+                                                   _mm512_setzero_pd (),
+                                                   (__mmask8) __U);
+}
+
+static __inline __m512 __attribute__ ((__always_inline__, __nodebug__))
+_mm512_maskz_load_ps(__mmask16 __U, void const *__P)
+{
+  return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *)__P,
+                                                  (__v16sf)
+                                                  _mm512_setzero_ps (),
+                                                  (__mmask16) __U);
+}
+
+static __inline __m512d __attribute__ ((__always_inline__, __nodebug__))
+_mm512_maskz_load_pd(__mmask8 __U, void const *__P)
+{
+  return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *)__P,
+                                                   (__v8df)
+                                                   _mm512_setzero_pd (),
+                                                   (__mmask8) __U);
+}
+
+static __inline __m512d __attribute__((__always_inline__, __nodebug__))
+_mm512_loadu_pd(double const *__p)
+{
+  struct __loadu_pd {
+    __m512d __v;
+  } __attribute__((__packed__, __may_alias__));
+  return ((struct __loadu_pd*)__p)->__v;
+}
+
+static __inline __m512 __attribute__((__always_inline__, __nodebug__))
+_mm512_loadu_ps(float const *__p)
+{
+  struct __loadu_ps {
+    __m512 __v;
+  } __attribute__((__packed__, __may_alias__));
+  return ((struct __loadu_ps*)__p)->__v;
+}
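+
+/* Illustrative note, not part of the original header: the __packed__,
+ * __may_alias__ wrapper structs above let _mm512_loadu_pd/_mm512_loadu_ps read
+ * through an arbitrarily aligned pointer without violating alignment or
+ * strict-aliasing rules; the member read lowers to an unaligned vector load.
+ * For example, assuming buf is a double array of at least 8 elements with no
+ * particular alignment,
+ *   __m512d v = _mm512_loadu_pd(buf);
+ * performs an unaligned 512-bit load. */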
+
+static __inline __m512 __attribute__((__always_inline__, __nodebug__))
+_mm512_load_ps(float const *__p)
+{
+  return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *)__p,
+                                                  (__v16sf)
+                                                  _mm512_setzero_ps (),
+                                                  (__mmask16) -1);
+}
+
+static __inline __m512d __attribute__((__always_inline__, __nodebug__))
+_mm512_load_pd(double const *__p)
+{
+  return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *)__p,
+                                                   (__v8df)
+                                                   _mm512_setzero_pd (),
+                                                   (__mmask8) -1);
+}
+
+/* SIMD store ops */
+
+static __inline void __attribute__ ((__always_inline__, __nodebug__))
+_mm512_mask_storeu_epi64(void *__P, __mmask8 __U, __m512i __A)
+{
+  __builtin_ia32_storedqudi512_mask ((__v8di *)__P, (__v8di) __A,
+                                     (__mmask8) __U);
+}
+
+static __inline void __attribute__ ((__always_inline__, __nodebug__))
+_mm512_mask_storeu_epi32(void *__P, __mmask16 __U, __m512i __A)
+{
+  __builtin_ia32_storedqusi512_mask ((__v16si *)__P, (__v16si) __A,
+                                     (__mmask16) __U);
+}
+
+static __inline void __attribute__ ((__always_inline__, __nodebug__))
+_mm512_mask_storeu_pd(void *__P, __mmask8 __U, __m512d __A)
+{
+  __builtin_ia32_storeupd512_mask ((__v8df *)__P, (__v8df) __A, (__mmask8) __U);
+}
+
+static __inline void __attribute__ ((__always_inline__, __nodebug__))
+_mm512_storeu_pd(void *__P, __m512d __A)
+{
+  __builtin_ia32_storeupd512_mask((__v8df *)__P, (__v8df)__A, (__mmask8)-1);
+}
+
+static __inline void __attribute__ ((__always_inline__, __nodebug__))
+_mm512_mask_storeu_ps(void *__P, __mmask16 __U, __m512 __A)
+{
+  __builtin_ia32_storeups512_mask ((__v16sf *)__P, (__v16sf) __A,
+                                   (__mmask16) __U);
+}
+
+static __inline void __attribute__ ((__always_inline__, __nodebug__))
+_mm512_storeu_ps(void *__P, __m512 __A)
+{
+  __builtin_ia32_storeups512_mask((__v16sf *)__P, (__v16sf)__A, (__mmask16)-1);
+}
+
+static __inline void __attribute__ ((__always_inline__, __nodebug__))
+_mm512_mask_store_pd(void *__P, __mmask8 __U, __m512d __A)
+{
+  __builtin_ia32_storeapd512_mask ((__v8df *)__P, (__v8df) __A, (__mmask8) __U);
+}
+
+static __inline void __attribute__ ((__always_inline__, __nodebug__))
+_mm512_store_pd(void *__P, __m512d __A)
+{
+  *(__m512d*)__P = __A;
+}
+
+static __inline void __attribute__ ((__always_inline__, __nodebug__))
+_mm512_mask_store_ps(void *__P, __mmask16 __U, __m512 __A)
+{
+  __builtin_ia32_storeaps512_mask ((__v16sf *)__P, (__v16sf) __A,
+                                   (__mmask16) __U);
+}
+
+static __inline void __attribute__ ((__always_inline__, __nodebug__))
+_mm512_store_ps(void *__P, __m512 __A)
+{
+  *(__m512*)__P = __A;
+}
+
+/* Mask ops */
+
+static __inline __mmask16 __attribute__ ((__always_inline__, __nodebug__))
+_mm512_knot(__mmask16 __M)
+{
+  return __builtin_ia32_knothi(__M);
+}
+
+/* Integer compare */
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmpeq_epi32_mask(__m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_pcmpeqd512_mask((__v16si)__a, (__v16si)__b,
+                                                   (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmpeq_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_pcmpeqd512_mask((__v16si)__a, (__v16si)__b,
+                                                   __u);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmpeq_epu32_mask(__m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 0,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmpeq_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 0,
+                                                 __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmpeq_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_pcmpeqq512_mask((__v8di)__a, (__v8di)__b,
+                                                  __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmpeq_epi64_mask(__m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_pcmpeqq512_mask((__v8di)__a, (__v8di)__b,
+                                                  (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmpeq_epu64_mask(__m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 0,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmpeq_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 0,
+                                                __u);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmpge_epi32_mask(__m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 5,
+                                                (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmpge_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 5,
+                                                __u);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmpge_epu32_mask(__m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 5,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmpge_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 5,
+                                                 __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmpge_epi64_mask(__m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 5,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmpge_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 5,
+                                               __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmpge_epu64_mask(__m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 5,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmpge_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 5,
+                                                __u);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmpgt_epi32_mask(__m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_pcmpgtd512_mask((__v16si)__a, (__v16si)__b,
+                                                   (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmpgt_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_pcmpgtd512_mask((__v16si)__a, (__v16si)__b,
+                                                   __u);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmpgt_epu32_mask(__m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 6,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmpgt_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 6,
+                                                 __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmpgt_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_pcmpgtq512_mask((__v8di)__a, (__v8di)__b,
+                                                  __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmpgt_epi64_mask(__m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_pcmpgtq512_mask((__v8di)__a, (__v8di)__b,
+                                                  (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmpgt_epu64_mask(__m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 6,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmpgt_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 6,
+                                                __u);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmple_epi32_mask(__m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 2,
+                                                (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmple_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 2,
+                                                __u);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmple_epu32_mask(__m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 2,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmple_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 2,
+                                                 __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmple_epi64_mask(__m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 2,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmple_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 2,
+                                               __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmple_epu64_mask(__m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 2,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmple_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 2,
+                                                __u);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmplt_epi32_mask(__m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 1,
+                                                (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmplt_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 1,
+                                                __u);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmplt_epu32_mask(__m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 1,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmplt_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 1,
+                                                 __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmplt_epi64_mask(__m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 1,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmplt_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 1,
+                                               __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmplt_epu64_mask(__m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 1,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmplt_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 1,
+                                                __u);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmpneq_epi32_mask(__m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 4,
+                                                (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmpneq_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 4,
+                                                __u);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmpneq_epu32_mask(__m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 4,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmpneq_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 4,
+                                                 __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmpneq_epi64_mask(__m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 4,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmpneq_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 4,
+                                               __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm512_cmpneq_epu64_mask(__m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 4,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm512_mask_cmpneq_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 4,
+                                                __u);
+}
+
+#define _mm512_cmp_epi32_mask(a, b, p) __extension__ ({ \
+  __m512i __a = (a); \
+  __m512i __b = (b); \
+  (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, (p), \
+                                         (__mmask16)-1); })
+
+#define _mm512_cmp_epu32_mask(a, b, p) __extension__ ({ \
+  __m512i __a = (a); \
+  __m512i __b = (b); \
+  (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, (p), \
+                                          (__mmask16)-1); })
+
+#define _mm512_cmp_epi64_mask(a, b, p) __extension__ ({ \
+  __m512i __a = (a); \
+  __m512i __b = (b); \
+  (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, (p), \
+                                        (__mmask8)-1); })
+
+#define _mm512_cmp_epu64_mask(a, b, p) __extension__ ({ \
+  __m512i __a = (a); \
+  __m512i __b = (b); \
+  (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, (p), \
+                                         (__mmask8)-1); })
+
+#define _mm512_mask_cmp_epi32_mask(m, a, b, p) __extension__ ({ \
+  __m512i __a = (a); \
+  __m512i __b = (b); \
+  (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, (p), \
+                                         (__mmask16)(m)); })
+
+#define _mm512_mask_cmp_epu32_mask(m, a, b, p) __extension__ ({ \
+  __m512i __a = (a); \
+  __m512i __b = (b); \
+  (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, (p), \
+                                          (__mmask16)(m)); })
+
+#define _mm512_mask_cmp_epi64_mask(m, a, b, p) __extension__ ({ \
+  __m512i __a = (a); \
+  __m512i __b = (b); \
+  (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, (p), \
+                                        (__mmask8)(m)); })
+
+#define _mm512_mask_cmp_epu64_mask(m, a, b, p) __extension__ ({ \
+  __m512i __a = (a); \
+  __m512i __b = (b); \
+  (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, (p), \
+                                         (__mmask8)(m)); })
+#endif // __AVX512FINTRIN_H
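
For context, a minimal usage sketch of the 512-bit mask-returning compares defined above. This is illustrative only and not part of the patch; it assumes a toolchain with AVX-512F enabled (e.g. clang -mavx512f) and includes <immintrin.h> rather than this header directly.

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  __m512i a = _mm512_set1_epi32(3);      /* sixteen copies of 3 */
  __m512i b = _mm512_setzero_si512();    /* sixteen zeros       */

  /* Bit i of the result is set when the predicate holds for lane i. */
  __mmask16 gt = _mm512_cmpgt_epi32_mask(a, b);   /* expected 0xffff */
  __mmask16 eq = _mm512_cmpeq_epi32_mask(a, b);   /* expected 0x0000 */

  /* The _mask_ variants AND the compare result with an input mask,
     so only the selected lanes can produce a set bit. */
  __mmask16 low8 = _mm512_mask_cmpgt_epi32_mask((__mmask16)0x00ff, a, b);

  printf("gt=0x%04x eq=0x%04x low8=0x%04x\n",
         (unsigned)gt, (unsigned)eq, (unsigned)low8);
  return 0;
}

The generic _mm512_cmp_epi32_mask(a, b, p) style macros take the predicate as an immediate (0 == eq, 1 == lt, 2 == le, 4 == ne, 5 == ge, 6 == gt), which is exactly what the named wrappers above pass to the underlying builtins.
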
diff --git a/23.0.3/clang-include/avx512vlbwintrin.h b/23.0.3/clang-include/avx512vlbwintrin.h
new file mode 100644
index 0000000..0746f43
--- /dev/null
+++ b/23.0.3/clang-include/avx512vlbwintrin.h
@@ -0,0 +1,689 @@
+/*===---- avx512vlbwintrin.h - AVX512VL and AVX512BW intrinsics ----------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512vlbwintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512VLBWINTRIN_H
+#define __AVX512VLBWINTRIN_H
+
+/* Integer compare */
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpeq_epi8_mask(__m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_pcmpeqb128_mask((__v16qi)__a, (__v16qi)__b,
+                                                   (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmpeq_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_pcmpeqb128_mask((__v16qi)__a, (__v16qi)__b,
+                                                   __u);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpeq_epu8_mask(__m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 0,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmpeq_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 0,
+                                                 __u);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpeq_epi8_mask(__m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_pcmpeqb256_mask((__v32qi)__a, (__v32qi)__b,
+                                                   (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmpeq_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_pcmpeqb256_mask((__v32qi)__a, (__v32qi)__b,
+                                                   __u);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpeq_epu8_mask(__m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 0,
+                                                 (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmpeq_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 0,
+                                                 __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpeq_epi16_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_pcmpeqw128_mask((__v8hi)__a, (__v8hi)__b,
+                                                  (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmpeq_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_pcmpeqw128_mask((__v8hi)__a, (__v8hi)__b,
+                                                  __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpeq_epu16_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 0,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmpeq_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 0,
+                                                __u);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpeq_epi16_mask(__m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_pcmpeqw256_mask((__v16hi)__a, (__v16hi)__b,
+                                                   (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmpeq_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_pcmpeqw256_mask((__v16hi)__a, (__v16hi)__b,
+                                                   __u);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpeq_epu16_mask(__m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 0,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmpeq_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 0,
+                                                 __u);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpge_epi8_mask(__m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 5,
+                                                (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmpge_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 5,
+                                                __u);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpge_epu8_mask(__m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 5,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmpge_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 5,
+                                                 __u);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpge_epi8_mask(__m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 5,
+                                                (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmpge_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 5,
+                                                __u);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpge_epu8_mask(__m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 5,
+                                                 (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmpge_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 5,
+                                                 __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpge_epi16_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 5,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmpge_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 5,
+                                               __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpge_epu16_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 5,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmpge_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 5,
+                                                __u);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpge_epi16_mask(__m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 5,
+                                                (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmpge_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 5,
+                                                __u);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpge_epu16_mask(__m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 5,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmpge_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 5,
+                                                 __u);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpgt_epi8_mask(__m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_pcmpgtb128_mask((__v16qi)__a, (__v16qi)__b,
+                                                   (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmpgt_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_pcmpgtb128_mask((__v16qi)__a, (__v16qi)__b,
+                                                   __u);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpgt_epu8_mask(__m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 6,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmpgt_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 6,
+                                                 __u);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpgt_epi8_mask(__m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_pcmpgtb256_mask((__v32qi)__a, (__v32qi)__b,
+                                                   (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmpgt_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_pcmpgtb256_mask((__v32qi)__a, (__v32qi)__b,
+                                                   __u);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpgt_epu8_mask(__m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 6,
+                                                 (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmpgt_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 6,
+                                                 __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpgt_epi16_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_pcmpgtw128_mask((__v8hi)__a, (__v8hi)__b,
+                                                  (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmpgt_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_pcmpgtw128_mask((__v8hi)__a, (__v8hi)__b,
+                                                  __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpgt_epu16_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 6,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmpgt_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 6,
+                                                __u);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpgt_epi16_mask(__m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_pcmpgtw256_mask((__v16hi)__a, (__v16hi)__b,
+                                                   (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmpgt_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_pcmpgtw256_mask((__v16hi)__a, (__v16hi)__b,
+                                                   __u);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpgt_epu16_mask(__m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 6,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmpgt_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 6,
+                                                 __u);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm_cmple_epi8_mask(__m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 2,
+                                                (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmple_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 2,
+                                                __u);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm_cmple_epu8_mask(__m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 2,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmple_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 2,
+                                                 __u);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmple_epi8_mask(__m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 2,
+                                                (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmple_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 2,
+                                                __u);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmple_epu8_mask(__m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 2,
+                                                 (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmple_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 2,
+                                                 __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmple_epi16_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 2,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmple_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 2,
+                                               __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmple_epu16_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 2,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmple_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 2,
+                                                __u);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmple_epi16_mask(__m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 2,
+                                                (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmple_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 2,
+                                                __u);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmple_epu16_mask(__m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 2,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmple_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 2,
+                                                 __u);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm_cmplt_epi8_mask(__m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 1,
+                                                (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmplt_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 1,
+                                                __u);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm_cmplt_epu8_mask(__m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 1,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmplt_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 1,
+                                                 __u);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmplt_epi8_mask(__m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 1,
+                                                (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmplt_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 1,
+                                                __u);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmplt_epu8_mask(__m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 1,
+                                                 (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmplt_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 1,
+                                                 __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmplt_epi16_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 1,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmplt_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 1,
+                                               __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmplt_epu16_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 1,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmplt_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 1,
+                                                __u);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmplt_epi16_mask(__m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 1,
+                                                (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmplt_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 1,
+                                                __u);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmplt_epu16_mask(__m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 1,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmplt_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 1,
+                                                 __u);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpneq_epi8_mask(__m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 4,
+                                                (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmpneq_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 4,
+                                                __u);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpneq_epu8_mask(__m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 4,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmpneq_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 4,
+                                                 __u);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpneq_epi8_mask(__m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 4,
+                                                (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmpneq_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 4,
+                                                __u);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpneq_epu8_mask(__m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 4,
+                                                 (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmpneq_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 4,
+                                                 __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpneq_epi16_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 4,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmpneq_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 4,
+                                               __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpneq_epu16_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 4,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmpneq_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 4,
+                                                __u);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpneq_epi16_mask(__m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 4,
+                                                (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmpneq_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 4,
+                                                __u);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpneq_epu16_mask(__m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 4,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmpneq_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 4,
+                                                 __u);
+}
+
+#define _mm_cmp_epi8_mask(a, b, p) __extension__ ({ \
+  (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)(__m128i)(a), \
+                                         (__v16qi)(__m128i)(b), \
+                                         (p), (__mmask16)-1); })
+
+#define _mm_mask_cmp_epi8_mask(m, a, b, p) __extension__ ({ \
+  (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)(__m128i)(a), \
+                                         (__v16qi)(__m128i)(b), \
+                                         (p), (__mmask16)(m)); })
+
+#define _mm_cmp_epu8_mask(a, b, p) __extension__ ({ \
+  (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)(__m128i)(a), \
+                                          (__v16qi)(__m128i)(b), \
+                                          (p), (__mmask16)-1); })
+
+#define _mm_mask_cmp_epu8_mask(m, a, b, p) __extension__ ({ \
+  (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)(__m128i)(a), \
+                                          (__v16qi)(__m128i)(b), \
+                                          (p), (__mmask16)(m)); })
+
+#define _mm256_cmp_epi8_mask(a, b, p) __extension__ ({ \
+  (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)(__m256i)(a), \
+                                         (__v32qi)(__m256i)(b), \
+                                         (p), (__mmask32)-1); })
+
+#define _mm256_mask_cmp_epi8_mask(m, a, b, p) __extension__ ({ \
+  (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)(__m256i)(a), \
+                                         (__v32qi)(__m256i)(b), \
+                                         (p), (__mmask32)(m)); })
+
+#define _mm256_cmp_epu8_mask(a, b, p) __extension__ ({ \
+  (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)(__m256i)(a), \
+                                          (__v32qi)(__m256i)(b), \
+                                          (p), (__mmask32)-1); })
+
+#define _mm256_mask_cmp_epu8_mask(m, a, b, p) __extension__ ({ \
+  (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)(__m256i)(a), \
+                                          (__v32qi)(__m256i)(b), \
+                                          (p), (__mmask32)(m)); })
+
+#define _mm_cmp_epi16_mask(a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)(__m128i)(a), \
+                                        (__v8hi)(__m128i)(b), \
+                                        (p), (__mmask8)-1); })
+
+#define _mm_mask_cmp_epi16_mask(m, a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)(__m128i)(a), \
+                                        (__v8hi)(__m128i)(b), \
+                                        (p), (__mmask8)(m)); })
+
+#define _mm_cmp_epu16_mask(a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)(__m128i)(a), \
+                                         (__v8hi)(__m128i)(b), \
+                                         (p), (__mmask8)-1); })
+
+#define _mm_mask_cmp_epu16_mask(m, a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)(__m128i)(a), \
+                                         (__v8hi)(__m128i)(b), \
+                                         (p), (__mmask8)(m)); })
+
+#define _mm256_cmp_epi16_mask(a, b, p) __extension__ ({ \
+  (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)(__m256i)(a), \
+                                         (__v16hi)(__m256i)(b), \
+                                         (p), (__mmask16)-1); })
+
+#define _mm256_mask_cmp_epi16_mask(m, a, b, p) __extension__ ({ \
+  (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)(__m256i)(a), \
+                                         (__v16hi)(__m256i)(b), \
+                                         (p), (__mmask16)(m)); })
+
+#define _mm256_cmp_epu16_mask(a, b, p) __extension__ ({ \
+  (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)(__m256i)(a), \
+                                          (__v16hi)(__m256i)(b), \
+                                          (p), (__mmask16)-1); })
+
+#define _mm256_mask_cmp_epu16_mask(m, a, b, p) __extension__ ({ \
+  (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)(__m256i)(a), \
+                                          (__v16hi)(__m256i)(b), \
+                                          (p), (__mmask16)(m)); })
+
+#endif /* __AVX512VLBWINTRIN_H */
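
As above, a small illustrative sketch (not part of the patch) of the 128-/256-bit byte and word compares; it assumes both AVX-512VL and AVX-512BW are enabled (e.g. clang -mavx512vl -mavx512bw) and goes through <immintrin.h>.

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  __m128i a = _mm_set1_epi8(5);
  __m128i b = _mm_set1_epi8(7);

  /* Sixteen byte lanes -> one 16-bit mask. */
  __mmask16 lt  = _mm_cmplt_epi8_mask(a, b);      /* expected 0xffff */
  __mmask16 neq = _mm_cmpneq_epi8_mask(a, b);     /* expected 0xffff */

  /* The generic form takes the predicate as an immediate; 2 is
     "less than or equal", matching _mm_cmple_epi8_mask above. */
  __mmask16 le  = _mm_cmp_epi8_mask(a, b, 2);     /* expected 0xffff */

  /* 256-bit word compares: signed vs. unsigned interpretation differs. */
  __m256i c = _mm256_set1_epi16(-1);
  __m256i d = _mm256_setzero_si256();
  __mmask16 ugt = _mm256_cmpgt_epu16_mask(c, d);  /* 0xffff (65535 > 0)  */
  __mmask16 sgt = _mm256_cmpgt_epi16_mask(c, d);  /* 0x0000 (-1 not > 0) */

  printf("lt=0x%04x neq=0x%04x le=0x%04x ugt=0x%04x sgt=0x%04x\n",
         (unsigned)lt, (unsigned)neq, (unsigned)le,
         (unsigned)ugt, (unsigned)sgt);
  return 0;
}
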
diff --git a/23.0.3/clang-include/avx512vlintrin.h b/23.0.3/clang-include/avx512vlintrin.h
new file mode 100644
index 0000000..b460992
--- /dev/null
+++ b/23.0.3/clang-include/avx512vlintrin.h
@@ -0,0 +1,693 @@
+/*===---- avx512vlintrin.h - AVX512VL intrinsics ---------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512vlintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512VLINTRIN_H
+#define __AVX512VLINTRIN_H
+
+/* Integer compare */
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpeq_epi32_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_pcmpeqd128_mask((__v4si)__a, (__v4si)__b,
+                                                  (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmpeq_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_pcmpeqd128_mask((__v4si)__a, (__v4si)__b,
+                                                  __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpeq_epu32_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 0,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmpeq_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 0,
+                                                __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpeq_epi32_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_pcmpeqd256_mask((__v8si)__a, (__v8si)__b,
+                                                  (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmpeq_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_pcmpeqd256_mask((__v8si)__a, (__v8si)__b,
+                                                  __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpeq_epu32_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 0,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmpeq_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 0,
+                                                __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpeq_epi64_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_pcmpeqq128_mask((__v2di)__a, (__v2di)__b,
+                                                  (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmpeq_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_pcmpeqq128_mask((__v2di)__a, (__v2di)__b,
+                                                  __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpeq_epu64_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 0,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmpeq_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 0,
+                                                __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpeq_epi64_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_pcmpeqq256_mask((__v4di)__a, (__v4di)__b,
+                                                  (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmpeq_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_pcmpeqq256_mask((__v4di)__a, (__v4di)__b,
+                                                  __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpeq_epu64_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 0,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmpeq_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 0,
+                                                __u);
+}
+
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpge_epi32_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 5,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmpge_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 5,
+                                               __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpge_epu32_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 5,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmpge_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 5,
+                                                __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpge_epi32_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 5,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmpge_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 5,
+                                               __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpge_epu32_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 5,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmpge_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 5,
+                                                __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpge_epi64_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 5,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmpge_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 5,
+                                               __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpge_epu64_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 5,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmpge_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 5,
+                                                __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpge_epi64_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 5,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmpge_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 5,
+                                               __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpge_epu64_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 5,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmpge_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 5,
+                                                __u);
+}
+
+
+
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpgt_epi32_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_pcmpgtd128_mask((__v4si)__a, (__v4si)__b,
+                                                  (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmpgt_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_pcmpgtd128_mask((__v4si)__a, (__v4si)__b,
+                                                  __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpgt_epu32_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 6,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmpgt_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 6,
+                                                __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpgt_epi32_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_pcmpgtd256_mask((__v8si)__a, (__v8si)__b,
+                                                  (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmpgt_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_pcmpgtd256_mask((__v8si)__a, (__v8si)__b,
+                                                  __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpgt_epu32_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 6,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmpgt_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 6,
+                                                __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpgt_epi64_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_pcmpgtq128_mask((__v2di)__a, (__v2di)__b,
+                                                  (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmpgt_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_pcmpgtq128_mask((__v2di)__a, (__v2di)__b,
+                                                  __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpgt_epu64_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 6,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmpgt_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 6,
+                                                __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpgt_epi64_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_pcmpgtq256_mask((__v4di)__a, (__v4di)__b,
+                                                  (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmpgt_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_pcmpgtq256_mask((__v4di)__a, (__v4di)__b,
+                                                  __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpgt_epu64_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 6,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmpgt_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 6,
+                                                __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmple_epi32_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 2,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmple_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 2,
+                                               __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmple_epu32_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 2,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmple_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 2,
+                                                __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmple_epi32_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 2,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmple_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 2,
+                                               __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmple_epu32_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 2,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmple_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 2,
+                                                __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmple_epi64_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 2,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmple_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 2,
+                                               __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmple_epu64_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 2,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmple_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 2,
+                                                __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmple_epi64_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 2,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmple_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 2,
+                                               __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmple_epu64_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 2,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmple_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 2,
+                                                __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmplt_epi32_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 1,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmplt_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 1,
+                                               __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmplt_epu32_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 1,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmplt_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 1,
+                                                __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmplt_epi32_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 1,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmplt_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 1,
+                                               __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmplt_epu32_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 1,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmplt_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 1,
+                                                __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmplt_epi64_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 1,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmplt_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 1,
+                                               __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmplt_epu64_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 1,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmplt_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 1,
+                                                __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmplt_epi64_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 1,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmplt_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 1,
+                                               __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmplt_epu64_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 1,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmplt_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 1,
+                                                __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpneq_epi32_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 4,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmpneq_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 4,
+                                               __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpneq_epu32_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 4,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmpneq_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 4,
+                                                __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpneq_epi32_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 4,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmpneq_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 4,
+                                               __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpneq_epu32_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 4,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmpneq_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 4,
+                                                __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpneq_epi64_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 4,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmpneq_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 4,
+                                               __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpneq_epu64_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 4,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm_mask_cmpneq_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 4,
+                                                __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpneq_epi64_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 4,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmpneq_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 4,
+                                               __u);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_cmpneq_epu64_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 4,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__))
+_mm256_mask_cmpneq_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 4,
+                                                __u);
+}
+
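+/* The named comparisons above hard-code the integer compare predicate
+ * (1=LT, 2=LE, 4=NE, 5=NLT/GE, 6=NLE/GT).  The _mm*_cmp_* macros below take
+ * the predicate explicitly; they are macros rather than inline functions
+ * because the builtin requires the predicate to be a constant expression.
+ * For example, _mm_cmp_epi32_mask(a, b, 2) is equivalent to
+ * _mm_cmple_epi32_mask(a, b). */
+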
+#define _mm_cmp_epi32_mask(a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)(__m128i)(a), \
+                                        (__v4si)(__m128i)(b), \
+                                        (p), (__mmask8)-1); })
+
+#define _mm_mask_cmp_epi32_mask(m, a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)(__m128i)(a), \
+                                        (__v4si)(__m128i)(b), \
+                                        (p), (__mmask8)(m)); })
+
+#define _mm_cmp_epu32_mask(a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)(__m128i)(a), \
+                                         (__v4si)(__m128i)(b), \
+                                         (p), (__mmask8)-1); })
+
+#define _mm_mask_cmp_epu32_mask(m, a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)(__m128i)(a), \
+                                         (__v4si)(__m128i)(b), \
+                                         (p), (__mmask8)(m)); })
+
+#define _mm256_cmp_epi32_mask(a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)(__m256i)(a), \
+                                        (__v8si)(__m256i)(b), \
+                                        (p), (__mmask8)-1); })
+
+#define _mm256_mask_cmp_epi32_mask(m, a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)(__m256i)(a), \
+                                        (__v8si)(__m256i)(b), \
+                                        (p), (__mmask8)(m)); })
+
+#define _mm256_cmp_epu32_mask(a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)(__m256i)(a), \
+                                         (__v8si)(__m256i)(b), \
+                                         (p), (__mmask8)-1); })
+
+#define _mm256_mask_cmp_epu32_mask(m, a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)(__m256i)(a), \
+                                         (__v8si)(__m256i)(b), \
+                                         (p), (__mmask8)(m)); })
+
+#define _mm_cmp_epi64_mask(a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)(__m128i)(a), \
+                                        (__v2di)(__m128i)(b), \
+                                        (p), (__mmask8)-1); })
+
+#define _mm_mask_cmp_epi64_mask(m, a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)(__m128i)(a), \
+                                        (__v2di)(__m128i)(b), \
+                                        (p), (__mmask8)(m)); })
+
+#define _mm_cmp_epu64_mask(a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)(__m128i)(a), \
+                                         (__v2di)(__m128i)(b), \
+                                         (p), (__mmask8)-1); })
+
+#define _mm_mask_cmp_epu64_mask(m, a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)(__m128i)(a), \
+                                         (__v2di)(__m128i)(b), \
+                                         (p), (__mmask8)(m)); })
+
+#define _mm256_cmp_epi64_mask(a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)(__m256i)(a), \
+                                        (__v4di)(__m256i)(b), \
+                                        (p), (__mmask8)-1); })
+
+#define _mm256_mask_cmp_epi64_mask(m, a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)(__m256i)(a), \
+                                        (__v4di)(__m256i)(b), \
+                                        (p), (__mmask8)(m)); })
+
+#define _mm256_cmp_epu64_mask(a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)(__m256i)(a), \
+                                         (__v4di)(__m256i)(b), \
+                                         (p), (__mmask8)-1); })
+
+#define _mm256_mask_cmp_epu64_mask(m, a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)(__m256i)(a), \
+                                         (__v4di)(__m256i)(b), \
+                                         (p), (__mmask8)(m)); })
+
+#endif /* __AVX512VLINTRIN_H */
diff --git a/23.0.3/clang-include/avxintrin.h b/23.0.3/clang-include/avxintrin.h
new file mode 100644
index 0000000..f30a5ad
--- /dev/null
+++ b/23.0.3/clang-include/avxintrin.h
@@ -0,0 +1,1273 @@
+/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVXINTRIN_H
+#define __AVXINTRIN_H
+
+typedef double __v4df __attribute__ ((__vector_size__ (32)));
+typedef float __v8sf __attribute__ ((__vector_size__ (32)));
+typedef long long __v4di __attribute__ ((__vector_size__ (32)));
+typedef int __v8si __attribute__ ((__vector_size__ (32)));
+typedef short __v16hi __attribute__ ((__vector_size__ (32)));
+typedef char __v32qi __attribute__ ((__vector_size__ (32)));
+
+typedef float __m256 __attribute__ ((__vector_size__ (32)));
+typedef double __m256d __attribute__((__vector_size__(32)));
+typedef long long __m256i __attribute__((__vector_size__(32)));
+
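+/* The __v* typedefs are element-typed helper vectors used for the builtins
+ * and for element access; __m256, __m256d and __m256i are the opaque public
+ * vector types that user code passes around. */
+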
+/* Arithmetic */
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_add_pd(__m256d __a, __m256d __b)
+{
+  return __a+__b;
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_add_ps(__m256 __a, __m256 __b)
+{
+  return __a+__b;
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_sub_pd(__m256d __a, __m256d __b)
+{
+  return __a-__b;
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_sub_ps(__m256 __a, __m256 __b)
+{
+  return __a-__b;
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_addsub_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b);
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_addsub_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b);
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_div_pd(__m256d __a, __m256d __b)
+{
+  return __a / __b;
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_div_ps(__m256 __a, __m256 __b)
+{
+  return __a / __b;
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_max_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b);
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_max_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b);
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_min_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b);
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_min_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b);
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_mul_pd(__m256d __a, __m256d __b)
+{
+  return __a * __b;
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_mul_ps(__m256 __a, __m256 __b)
+{
+  return __a * __b;
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_sqrt_pd(__m256d __a)
+{
+  return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_sqrt_ps(__m256 __a)
+{
+  return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_rsqrt_ps(__m256 __a)
+{
+  return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a);
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_rcp_ps(__m256 __a)
+{
+  return (__m256)__builtin_ia32_rcpps256((__v8sf)__a);
+}
+
+#define _mm256_round_pd(V, M) __extension__ ({ \
+    __m256d __V = (V); \
+    (__m256d)__builtin_ia32_roundpd256((__v4df)__V, (M)); })
+
+#define _mm256_round_ps(V, M) __extension__ ({ \
+  __m256 __V = (V); \
+  (__m256)__builtin_ia32_roundps256((__v8sf)__V, (M)); })
+
+#define _mm256_ceil_pd(V)  _mm256_round_pd((V), _MM_FROUND_CEIL)
+#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)
+#define _mm256_ceil_ps(V)  _mm256_round_ps((V), _MM_FROUND_CEIL)
+#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)
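+
+/* The _MM_FROUND_* rounding controls are defined in smmintrin.h; e.g.
+ * _mm256_floor_pd(v) rounds each element toward negative infinity. */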
+
+/* Logical */
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_and_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)((__v4di)__a & (__v4di)__b);
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_and_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)((__v8si)__a & (__v8si)__b);
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_andnot_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)(~(__v4di)__a & (__v4di)__b);
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_andnot_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)(~(__v8si)__a & (__v8si)__b);
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_or_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)((__v4di)__a | (__v4di)__b);
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_or_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)((__v8si)__a | (__v8si)__b);
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_xor_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)((__v4di)__a ^ (__v4di)__b);
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_xor_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)((__v8si)__a ^ (__v8si)__b);
+}
+
+/* Horizontal arithmetic */
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_hadd_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b);
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_hadd_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_hsub_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_hsub_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b);
+}
+
+/* Vector permutations */
+static __inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_permutevar_pd(__m128d __a, __m128i __c)
+{
+  return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c);
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_permutevar_pd(__m256d __a, __m256i __c)
+{
+  return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c);
+}
+
+static __inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_permutevar_ps(__m128 __a, __m128i __c)
+{
+  return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c);
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_permutevar_ps(__m256 __a, __m256i __c)
+{
+  return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c);
+}
+
+#define _mm_permute_pd(A, C) __extension__ ({ \
+  __m128d __A = (A); \
+  (__m128d)__builtin_shufflevector((__v2df)__A, (__v2df) _mm_setzero_pd(), \
+                                   (C) & 0x1, ((C) & 0x2) >> 1); })
+
+#define _mm256_permute_pd(A, C) __extension__ ({ \
+  __m256d __A = (A); \
+  (__m256d)__builtin_shufflevector((__v4df)__A, (__v4df) _mm256_setzero_pd(), \
+                                   (C) & 0x1, ((C) & 0x2) >> 1, \
+                                   2 + (((C) & 0x4) >> 2), \
+                                   2 + (((C) & 0x8) >> 3)); })
+
+#define _mm_permute_ps(A, C) __extension__ ({ \
+  __m128 __A = (A); \
+  (__m128)__builtin_shufflevector((__v4sf)__A, (__v4sf) _mm_setzero_ps(), \
+                                   (C) & 0x3, ((C) & 0xc) >> 2, \
+                                   ((C) & 0x30) >> 4, ((C) & 0xc0) >> 6); })
+
+#define _mm256_permute_ps(A, C) __extension__ ({ \
+  __m256 __A = (A); \
+  (__m256)__builtin_shufflevector((__v8sf)__A, (__v8sf) _mm256_setzero_ps(), \
+                                  (C) & 0x3, ((C) & 0xc) >> 2, \
+                                  ((C) & 0x30) >> 4, ((C) & 0xc0) >> 6, \
+                                  4 + (((C) & 0x03) >> 0), \
+                                  4 + (((C) & 0x0c) >> 2), \
+                                  4 + (((C) & 0x30) >> 4), \
+                                  4 + (((C) & 0xc0) >> 6)); })
+
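+/* The permute immediates select elements within each 128-bit lane; e.g.
+ * _mm256_permute_ps(v, 0x1B) reverses the four floats of each lane.  The
+ * permute2f128 macros below instead select whole 128-bit halves of the two
+ * sources. */
+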
+#define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ \
+  __m256d __V1 = (V1); \
+  __m256d __V2 = (V2); \
+  (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)__V1, (__v4df)__V2, (M)); })
+
+#define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \
+  __m256 __V1 = (V1); \
+  __m256 __V2 = (V2); \
+  (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)__V1, (__v8sf)__V2, (M)); })
+
+#define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \
+  __m256i __V1 = (V1); \
+  __m256i __V2 = (V2); \
+  (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)__V1, (__v8si)__V2, (M)); })
+
+/* Vector Blend */
+#define _mm256_blend_pd(V1, V2, M) __extension__ ({ \
+  __m256d __V1 = (V1); \
+  __m256d __V2 = (V2); \
+  (__m256d)__builtin_shufflevector((__v4df)__V1, (__v4df)__V2, \
+                                   (((M) & 0x01) ? 4 : 0), \
+                                   (((M) & 0x02) ? 5 : 1), \
+                                   (((M) & 0x04) ? 6 : 2), \
+                                   (((M) & 0x08) ? 7 : 3)); })
+
+#define _mm256_blend_ps(V1, V2, M) __extension__ ({ \
+  __m256 __V1 = (V1); \
+  __m256 __V2 = (V2); \
+  (__m256)__builtin_shufflevector((__v8sf)__V1, (__v8sf)__V2, \
+                                  (((M) & 0x01) ?  8 : 0), \
+                                  (((M) & 0x02) ?  9 : 1), \
+                                  (((M) & 0x04) ? 10 : 2), \
+                                  (((M) & 0x08) ? 11 : 3), \
+                                  (((M) & 0x10) ? 12 : 4), \
+                                  (((M) & 0x20) ? 13 : 5), \
+                                  (((M) & 0x40) ? 14 : 6), \
+                                  (((M) & 0x80) ? 15 : 7)); })
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
+{
+  return (__m256d)__builtin_ia32_blendvpd256(
+    (__v4df)__a, (__v4df)__b, (__v4df)__c);
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
+{
+  return (__m256)__builtin_ia32_blendvps256(
+    (__v8sf)__a, (__v8sf)__b, (__v8sf)__c);
+}
+
+/* Vector Dot Product */
+#define _mm256_dp_ps(V1, V2, M) __extension__ ({ \
+  __m256 __V1 = (V1); \
+  __m256 __V2 = (V2); \
+  (__m256)__builtin_ia32_dpps256((__v8sf)__V1, (__v8sf)__V2, (M)); })
+
+/* Vector shuffle */
+#define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \
+        __m256 __a = (a); \
+        __m256 __b = (b); \
+        (__m256)__builtin_shufflevector((__v8sf)__a, (__v8sf)__b, \
+        (mask) & 0x3,                ((mask) & 0xc) >> 2, \
+        (((mask) & 0x30) >> 4) + 8,  (((mask) & 0xc0) >> 6) + 8, \
+        ((mask) & 0x3) + 4,          (((mask) & 0xc) >> 2) + 4, \
+        (((mask) & 0x30) >> 4) + 12, (((mask) & 0xc0) >> 6) + 12); })
+
+#define _mm256_shuffle_pd(a, b, mask) __extension__ ({ \
+        __m256d __a = (a); \
+        __m256d __b = (b); \
+        (__m256d)__builtin_shufflevector((__v4df)__a, (__v4df)__b, \
+        (mask) & 0x1, \
+        (((mask) & 0x2) >> 1) + 4, \
+        (((mask) & 0x4) >> 2) + 2, \
+        (((mask) & 0x8) >> 3) + 6); })
+
+/* Compare */
+#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
+#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling)  */
+#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling)  */
+#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling)  */
+#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling)  */
+#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling)  */
+#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling)  */
+#define _CMP_ORD_Q    0x07 /* Ordered (non-signaling)  */
+#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling)  */
+#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unord, signaling)  */
+#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling)  */
+#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling)  */
+#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling)  */
+#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling)  */
+#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling)  */
+#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling)  */
+#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling)  */
+#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling)  */
+#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling)  */
+#define _CMP_UNORD_S  0x13 /* Unordered (signaling)  */
+#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling)  */
+#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling)  */
+#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unord, non-signaling)  */
+#define _CMP_ORD_S    0x17 /* Ordered (signaling)  */
+#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling)  */
+#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unord, non-sign)  */
+#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling)  */
+#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling)  */
+#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling)  */
+#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling)  */
+#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling)  */
+#define _CMP_TRUE_US  0x1f /* True (unordered, signaling)  */
+
+#define _mm_cmp_pd(a, b, c) __extension__ ({ \
+  __m128d __a = (a); \
+  __m128d __b = (b); \
+  (__m128d)__builtin_ia32_cmppd((__v2df)__a, (__v2df)__b, (c)); })
+
+#define _mm_cmp_ps(a, b, c) __extension__ ({ \
+  __m128 __a = (a); \
+  __m128 __b = (b); \
+  (__m128)__builtin_ia32_cmpps((__v4sf)__a, (__v4sf)__b, (c)); })
+
+#define _mm256_cmp_pd(a, b, c) __extension__ ({ \
+  __m256d __a = (a); \
+  __m256d __b = (b); \
+  (__m256d)__builtin_ia32_cmppd256((__v4df)__a, (__v4df)__b, (c)); })
+
+#define _mm256_cmp_ps(a, b, c) __extension__ ({ \
+  __m256 __a = (a); \
+  __m256 __b = (b); \
+  (__m256)__builtin_ia32_cmpps256((__v8sf)__a, (__v8sf)__b, (c)); })
+
+#define _mm_cmp_sd(a, b, c) __extension__ ({ \
+  __m128d __a = (a); \
+  __m128d __b = (b); \
+  (__m128d)__builtin_ia32_cmpsd((__v2df)__a, (__v2df)__b, (c)); })
+
+#define _mm_cmp_ss(a, b, c) __extension__ ({ \
+  __m128 __a = (a); \
+  __m128 __b = (b); \
+  (__m128)__builtin_ia32_cmpss((__v4sf)__a, (__v4sf)__b, (c)); })
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+_mm256_extract_epi32(__m256i __a, const int __imm)
+{
+  __v8si __b = (__v8si)__a;
+  return __b[__imm & 7];
+}
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+_mm256_extract_epi16(__m256i __a, const int __imm)
+{
+  __v16hi __b = (__v16hi)__a;
+  return __b[__imm & 15];
+}
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+_mm256_extract_epi8(__m256i __a, const int __imm)
+{
+  __v32qi __b = (__v32qi)__a;
+  return __b[__imm & 31];
+}
+
+#ifdef __x86_64__
+static __inline long long  __attribute__((__always_inline__, __nodebug__))
+_mm256_extract_epi64(__m256i __a, const int __imm)
+{
+  __v4di __b = (__v4di)__a;
+  return __b[__imm & 3];
+}
+#endif
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_insert_epi32(__m256i __a, int __b, int const __imm)
+{
+  __v8si __c = (__v8si)__a;
+  __c[__imm & 7] = __b;
+  return (__m256i)__c;
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_insert_epi16(__m256i __a, int __b, int const __imm)
+{
+  __v16hi __c = (__v16hi)__a;
+  __c[__imm & 15] = __b;
+  return (__m256i)__c;
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_insert_epi8(__m256i __a, int __b, int const __imm)
+{
+  __v32qi __c = (__v32qi)__a;
+  __c[__imm & 31] = __b;
+  return (__m256i)__c;
+}
+
+#ifdef __x86_64__
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_insert_epi64(__m256i __a, long long __b, int const __imm)
+{
+  __v4di __c = (__v4di)__a;
+  __c[__imm & 3] = __b;
+  return (__m256i)__c;
+}
+#endif
+
+/* Conversion */
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_cvtepi32_pd(__m128i __a)
+{
+  return (__m256d)__builtin_ia32_cvtdq2pd256((__v4si) __a);
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_cvtepi32_ps(__m256i __a)
+{
+  return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) __a);
+}
+
+static __inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm256_cvtpd_ps(__m256d __a)
+{
+  return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a);
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cvtps_epi32(__m256 __a)
+{
+  return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a);
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_cvtps_pd(__m128 __a)
+{
+  return (__m256d)__builtin_ia32_cvtps2pd256((__v4sf) __a);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm256_cvttpd_epi32(__m256d __a)
+{
+  return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm256_cvtpd_epi32(__m256d __a)
+{
+  return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cvttps_epi32(__m256 __a)
+{
+  return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a);
+}
+
+/* Vector replicate */
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_movehdup_ps(__m256 __a)
+{
+  return __builtin_shufflevector(__a, __a, 1, 1, 3, 3, 5, 5, 7, 7);
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_moveldup_ps(__m256 __a)
+{
+  return __builtin_shufflevector(__a, __a, 0, 0, 2, 2, 4, 4, 6, 6);
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_movedup_pd(__m256d __a)
+{
+  return __builtin_shufflevector(__a, __a, 0, 0, 2, 2);
+}
+
+/* Unpack and Interleave */
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_unpackhi_pd(__m256d __a, __m256d __b)
+{
+  return __builtin_shufflevector(__a, __b, 1, 5, 1+2, 5+2);
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_unpacklo_pd(__m256d __a, __m256d __b)
+{
+  return __builtin_shufflevector(__a, __b, 0, 4, 0+2, 4+2);
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_unpackhi_ps(__m256 __a, __m256 __b)
+{
+  return __builtin_shufflevector(__a, __b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_unpacklo_ps(__m256 __a, __m256 __b)
+{
+  return __builtin_shufflevector(__a, __b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
+}
+
+/* Bit Test */
+static __inline int __attribute__((__always_inline__, __nodebug__))
+_mm_testz_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b);
+}
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+_mm_testc_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b);
+}
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+_mm_testnzc_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b);
+}
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+_mm_testz_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b);
+}
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+_mm_testc_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b);
+}
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+_mm_testnzc_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b);
+}
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+_mm256_testz_pd(__m256d __a, __m256d __b)
+{
+  return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b);
+}
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+_mm256_testc_pd(__m256d __a, __m256d __b)
+{
+  return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b);
+}
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+_mm256_testnzc_pd(__m256d __a, __m256d __b)
+{
+  return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b);
+}
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+_mm256_testz_ps(__m256 __a, __m256 __b)
+{
+  return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b);
+}
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+_mm256_testc_ps(__m256 __a, __m256 __b)
+{
+  return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b);
+}
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+_mm256_testnzc_ps(__m256 __a, __m256 __b)
+{
+  return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b);
+}
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+_mm256_testz_si256(__m256i __a, __m256i __b)
+{
+  return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b);
+}
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+_mm256_testc_si256(__m256i __a, __m256i __b)
+{
+  return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b);
+}
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+_mm256_testnzc_si256(__m256i __a, __m256i __b)
+{
+  return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b);
+}
+
+/* Vector extract sign mask */
+static __inline int __attribute__((__always_inline__, __nodebug__))
+_mm256_movemask_pd(__m256d __a)
+{
+  return __builtin_ia32_movmskpd256((__v4df)__a);
+}
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+_mm256_movemask_ps(__m256 __a)
+{
+  return __builtin_ia32_movmskps256((__v8sf)__a);
+}
+
+/* Vector zero */
+static __inline void __attribute__((__always_inline__, __nodebug__))
+_mm256_zeroall(void)
+{
+  __builtin_ia32_vzeroall();
+}
+
+static __inline void __attribute__((__always_inline__, __nodebug__))
+_mm256_zeroupper(void)
+{
+  __builtin_ia32_vzeroupper();
+}
+
+/* Vector load with broadcast */
+static __inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_broadcast_ss(float const *__a)
+{
+  float __f = *__a;
+  return (__m128)(__v4sf){ __f, __f, __f, __f };
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_broadcast_sd(double const *__a)
+{
+  double __d = *__a;
+  return (__m256d)(__v4df){ __d, __d, __d, __d };
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_broadcast_ss(float const *__a)
+{
+  float __f = *__a;
+  return (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_broadcast_pd(__m128d const *__a)
+{
+  return (__m256d)__builtin_ia32_vbroadcastf128_pd256(__a);
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_broadcast_ps(__m128 const *__a)
+{
+  return (__m256)__builtin_ia32_vbroadcastf128_ps256(__a);
+}
+
+/* SIMD load ops */
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_load_pd(double const *__p)
+{
+  return *(__m256d *)__p;
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_load_ps(float const *__p)
+{
+  return *(__m256 *)__p;
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_loadu_pd(double const *__p)
+{
+  struct __loadu_pd {
+    __m256d __v;
+  } __attribute__((__packed__, __may_alias__));
+  return ((struct __loadu_pd*)__p)->__v;
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_loadu_ps(float const *__p)
+{
+  struct __loadu_ps {
+    __m256 __v;
+  } __attribute__((__packed__, __may_alias__));
+  return ((struct __loadu_ps*)__p)->__v;
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_load_si256(__m256i const *__p)
+{
+  return *__p;
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_loadu_si256(__m256i const *__p)
+{
+  struct __loadu_si256 {
+    __m256i __v;
+  } __attribute__((__packed__, __may_alias__));
+  return ((struct __loadu_si256*)__p)->__v;
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_lddqu_si256(__m256i const *__p)
+{
+  return (__m256i)__builtin_ia32_lddqu256((char const *)__p);
+}
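+
+/* The _mm256_load/store forms require 32-byte alignment; the loadu/storeu
+ * forms express unaligned accesses (the loads via a packed, may_alias
+ * struct).  _mm256_lddqu_si256 behaves like _mm256_loadu_si256 but maps to
+ * the vlddqu instruction. */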
+
+/* SIMD store ops */
+static __inline void __attribute__((__always_inline__, __nodebug__))
+_mm256_store_pd(double *__p, __m256d __a)
+{
+  *(__m256d *)__p = __a;
+}
+
+static __inline void __attribute__((__always_inline__, __nodebug__))
+_mm256_store_ps(float *__p, __m256 __a)
+{
+  *(__m256 *)__p = __a;
+}
+
+static __inline void __attribute__((__always_inline__, __nodebug__))
+_mm256_storeu_pd(double *__p, __m256d __a)
+{
+  __builtin_ia32_storeupd256(__p, (__v4df)__a);
+}
+
+static __inline void __attribute__((__always_inline__, __nodebug__))
+_mm256_storeu_ps(float *__p, __m256 __a)
+{
+  __builtin_ia32_storeups256(__p, (__v8sf)__a);
+}
+
+static __inline void __attribute__((__always_inline__, __nodebug__))
+_mm256_store_si256(__m256i *__p, __m256i __a)
+{
+  *__p = __a;
+}
+
+static __inline void __attribute__((__always_inline__, __nodebug__))
+_mm256_storeu_si256(__m256i *__p, __m256i __a)
+{
+  __builtin_ia32_storedqu256((char *)__p, (__v32qi)__a);
+}
+
+/* Conditional load ops */
+static __inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_maskload_pd(double const *__p, __m128d __m)
+{
+  return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2df)__m);
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_maskload_pd(double const *__p, __m256d __m)
+{
+  return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p,
+                                               (__v4df)__m);
+}
+
+static __inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_maskload_ps(float const *__p, __m128 __m)
+{
+  return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4sf)__m);
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_maskload_ps(float const *__p, __m256 __m)
+{
+  return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8sf)__m);
+}
+
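+/* Only elements whose mask element has its most-significant bit set take
+ * part in a masked load or store; a masked load zeroes the other elements,
+ * and a masked store leaves them untouched in memory. */
+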
+/* Conditional store ops */
+static __inline void __attribute__((__always_inline__, __nodebug__))
+_mm256_maskstore_ps(float *__p, __m256 __m, __m256 __a)
+{
+  __builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8sf)__m, (__v8sf)__a);
+}
+
+static __inline void __attribute__((__always_inline__, __nodebug__))
+_mm_maskstore_pd(double *__p, __m128d __m, __m128d __a)
+{
+  __builtin_ia32_maskstorepd((__v2df *)__p, (__v2df)__m, (__v2df)__a);
+}
+
+static __inline void __attribute__((__always_inline__, __nodebug__))
+_mm256_maskstore_pd(double *__p, __m256d __m, __m256d __a)
+{
+  __builtin_ia32_maskstorepd256((__v4df *)__p, (__v4df)__m, (__v4df)__a);
+}
+
+static __inline void __attribute__((__always_inline__, __nodebug__))
+_mm_maskstore_ps(float *__p, __m128 __m, __m128 __a)
+{
+  __builtin_ia32_maskstoreps((__v4sf *)__p, (__v4sf)__m, (__v4sf)__a);
+}
+
+/* Cacheability support ops */
+static __inline void __attribute__((__always_inline__, __nodebug__))
+_mm256_stream_si256(__m256i *__a, __m256i __b)
+{
+  __builtin_ia32_movntdq256((__v4di *)__a, (__v4di)__b);
+}
+
+static __inline void __attribute__((__always_inline__, __nodebug__))
+_mm256_stream_pd(double *__a, __m256d __b)
+{
+  __builtin_ia32_movntpd256(__a, (__v4df)__b);
+}
+
+static __inline void __attribute__((__always_inline__, __nodebug__))
+_mm256_stream_ps(float *__p, __m256 __a)
+{
+  __builtin_ia32_movntps256(__p, (__v8sf)__a);
+}
+
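+/* The stream stores above are non-temporal: they write to memory with a
+ * cache-bypassing hint and require 32-byte-aligned destinations. */
+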
+/* Create vectors */
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_set_pd(double __a, double __b, double __c, double __d)
+{
+  return (__m256d){ __d, __c, __b, __a };
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_set_ps(float __a, float __b, float __c, float __d,
+              float __e, float __f, float __g, float __h)
+{
+  return (__m256){ __h, __g, __f, __e, __d, __c, __b, __a };
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
+                 int __i4, int __i5, int __i6, int __i7)
+{
+  return (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 };
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
+                 short __w11, short __w10, short __w09, short __w08,
+                 short __w07, short __w06, short __w05, short __w04,
+                 short __w03, short __w02, short __w01, short __w00)
+{
+  return (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06,
+    __w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 };
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
+                char __b27, char __b26, char __b25, char __b24,
+                char __b23, char __b22, char __b21, char __b20,
+                char __b19, char __b18, char __b17, char __b16,
+                char __b15, char __b14, char __b13, char __b12,
+                char __b11, char __b10, char __b09, char __b08,
+                char __b07, char __b06, char __b05, char __b04,
+                char __b03, char __b02, char __b01, char __b00)
+{
+  return (__m256i)(__v32qi){
+    __b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
+    __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
+    __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
+    __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31
+  };
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
+{
+  return (__m256i)(__v4di){ __d, __c, __b, __a };
+}
+
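+/* _mm256_set_* takes its arguments from the highest element down to
+ * element 0; the _mm256_setr_* variants below take them in memory order, so
+ * _mm256_set_epi32(7,6,5,4,3,2,1,0) and _mm256_setr_epi32(0,1,2,3,4,5,6,7)
+ * build the same vector. */
+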
+/* Create vectors with elements in reverse order */
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_setr_pd(double __a, double __b, double __c, double __d)
+{
+  return (__m256d){ __a, __b, __c, __d };
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_setr_ps(float __a, float __b, float __c, float __d,
+               float __e, float __f, float __g, float __h)
+{
+  return (__m256){ __a, __b, __c, __d, __e, __f, __g, __h };
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
+                  int __i4, int __i5, int __i6, int __i7)
+{
+  return (__m256i)(__v8si){ __i0, __i1, __i2, __i3, __i4, __i5, __i6, __i7 };
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
+       short __w11, short __w10, short __w09, short __w08,
+       short __w07, short __w06, short __w05, short __w04,
+       short __w03, short __w02, short __w01, short __w00)
+{
+  return (__m256i)(__v16hi){ __w15, __w14, __w13, __w12, __w11, __w10, __w09,
+    __w08, __w07, __w06, __w05, __w04, __w03, __w02, __w01, __w00 };
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
+                 char __b27, char __b26, char __b25, char __b24,
+                 char __b23, char __b22, char __b21, char __b20,
+                 char __b19, char __b18, char __b17, char __b16,
+                 char __b15, char __b14, char __b13, char __b12,
+                 char __b11, char __b10, char __b09, char __b08,
+                 char __b07, char __b06, char __b05, char __b04,
+                 char __b03, char __b02, char __b01, char __b00)
+{
+  return (__m256i)(__v32qi){
+    __b31, __b30, __b29, __b28, __b27, __b26, __b25, __b24,
+    __b23, __b22, __b21, __b20, __b19, __b18, __b17, __b16,
+    __b15, __b14, __b13, __b12, __b11, __b10, __b09, __b08,
+    __b07, __b06, __b05, __b04, __b03, __b02, __b01, __b00 };
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
+{
+  return (__m256i)(__v4di){ __a, __b, __c, __d };
+}
+
+/* Create vectors with repeated elements */
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_set1_pd(double __w)
+{
+  return (__m256d){ __w, __w, __w, __w };
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_set1_ps(float __w)
+{
+  return (__m256){ __w, __w, __w, __w, __w, __w, __w, __w };
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_set1_epi32(int __i)
+{
+  return (__m256i)(__v8si){ __i, __i, __i, __i, __i, __i, __i, __i };
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_set1_epi16(short __w)
+{
+  return (__m256i)(__v16hi){ __w, __w, __w, __w, __w, __w, __w, __w, __w, __w,
+    __w, __w, __w, __w, __w, __w };
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_set1_epi8(char __b)
+{
+  return (__m256i)(__v32qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
+    __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
+    __b, __b, __b, __b, __b, __b, __b };
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_set1_epi64x(long long __q)
+{
+  return (__m256i)(__v4di){ __q, __q, __q, __q };
+}
+
+/* Create zeroed vectors */
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_setzero_pd(void)
+{
+  return (__m256d){ 0, 0, 0, 0 };
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_setzero_ps(void)
+{
+  return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_setzero_si256(void)
+{
+  return (__m256i){ 0LL, 0LL, 0LL, 0LL };
+}
+
+/* Cast between vector types */
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_castpd_ps(__m256d __a)
+{
+  return (__m256)__a;
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_castpd_si256(__m256d __a)
+{
+  return (__m256i)__a;
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_castps_pd(__m256 __a)
+{
+  return (__m256d)__a;
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_castps_si256(__m256 __a)
+{
+  return (__m256i)__a;
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_castsi256_ps(__m256i __a)
+{
+  return (__m256)__a;
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_castsi256_pd(__m256i __a)
+{
+  return (__m256d)__a;
+}
+
+static __inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm256_castpd256_pd128(__m256d __a)
+{
+  return __builtin_shufflevector(__a, __a, 0, 1);
+}
+
+static __inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm256_castps256_ps128(__m256 __a)
+{
+  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3);
+}
+
+static __inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm256_castsi256_si128(__m256i __a)
+{
+  return __builtin_shufflevector(__a, __a, 0, 1);
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_castpd128_pd256(__m128d __a)
+{
+  return __builtin_shufflevector(__a, __a, 0, 1, -1, -1);
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_castps128_ps256(__m128 __a)
+{
+  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, -1, -1, -1, -1);
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_castsi128_si256(__m128i __a)
+{
+  return __builtin_shufflevector(__a, __a, 0, 1, -1, -1);
+}
+
+/* 
+   Vector insert.
+   We use macros rather than inlines because we only want to accept
+   invocations where the immediate M is a constant expression.
+*/
+#define _mm256_insertf128_ps(V1, V2, M) __extension__ ({ \
+  (__m256)__builtin_shufflevector( \
+    (__v8sf)(V1), \
+    (__v8sf)_mm256_castps128_ps256((__m128)(V2)), \
+    (((M) & 1) ?  0 :  8), \
+    (((M) & 1) ?  1 :  9), \
+    (((M) & 1) ?  2 : 10), \
+    (((M) & 1) ?  3 : 11), \
+    (((M) & 1) ?  8 :  4), \
+    (((M) & 1) ?  9 :  5), \
+    (((M) & 1) ? 10 :  6), \
+    (((M) & 1) ? 11 :  7) );})
+
+#define _mm256_insertf128_pd(V1, V2, M) __extension__ ({ \
+  (__m256d)__builtin_shufflevector( \
+    (__v4df)(V1), \
+    (__v4df)_mm256_castpd128_pd256((__m128d)(V2)), \
+    (((M) & 1) ? 0 : 4), \
+    (((M) & 1) ? 1 : 5), \
+    (((M) & 1) ? 4 : 2), \
+    (((M) & 1) ? 5 : 3) );})
+
+#define _mm256_insertf128_si256(V1, V2, M) __extension__ ({ \
+  (__m256i)__builtin_shufflevector( \
+    (__v4di)(V1), \
+    (__v4di)_mm256_castsi128_si256((__m128i)(V2)), \
+    (((M) & 1) ? 0 : 4), \
+    (((M) & 1) ? 1 : 5), \
+    (((M) & 1) ? 4 : 2), \
+    (((M) & 1) ? 5 : 3) );})
+
+/* 
+   Vector extract.
+   We use macros rather than inlines because we only want to accept
+   invocations where the immediate M is a constant expression.
+*/
+#define _mm256_extractf128_ps(V, M) __extension__ ({ \
+  (__m128)__builtin_shufflevector( \
+    (__v8sf)(V), \
+    (__v8sf)(_mm256_setzero_ps()), \
+    (((M) & 1) ? 4 : 0), \
+    (((M) & 1) ? 5 : 1), \
+    (((M) & 1) ? 6 : 2), \
+    (((M) & 1) ? 7 : 3) );})
+
+#define _mm256_extractf128_pd(V, M) __extension__ ({ \
+  (__m128d)__builtin_shufflevector( \
+    (__v4df)(V), \
+    (__v4df)(_mm256_setzero_pd()), \
+    (((M) & 1) ? 2 : 0), \
+    (((M) & 1) ? 3 : 1) );})
+
+#define _mm256_extractf128_si256(V, M) __extension__ ({ \
+  (__m128i)__builtin_shufflevector( \
+    (__v4di)(V), \
+    (__v4di)(_mm256_setzero_si256()), \
+    (((M) & 1) ? 2 : 0), \
+    (((M) & 1) ? 3 : 1) );})
+
+/* SIMD load ops (unaligned) */
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
+{
+  struct __loadu_ps {
+    __m128 __v;
+  } __attribute__((__packed__, __may_alias__));
+
+  __m256 __v256 = _mm256_castps128_ps256(((struct __loadu_ps*)__addr_lo)->__v);
+  return _mm256_insertf128_ps(__v256, ((struct __loadu_ps*)__addr_hi)->__v, 1);
+}
+
+static __inline __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
+{
+  struct __loadu_pd {
+    __m128d __v;
+  } __attribute__((__packed__, __may_alias__));
+  
+  __m256d __v256 = _mm256_castpd128_pd256(((struct __loadu_pd*)__addr_lo)->__v);
+  return _mm256_insertf128_pd(__v256, ((struct __loadu_pd*)__addr_hi)->__v, 1);
+}
+
+static __inline __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_loadu2_m128i(__m128i const *__addr_hi, __m128i const *__addr_lo)
+{
+  struct __loadu_si128 {
+    __m128i __v;
+  } __attribute__((__packed__, __may_alias__));
+  __m256i __v256 = _mm256_castsi128_si256(
+    ((struct __loadu_si128*)__addr_lo)->__v);
+  return _mm256_insertf128_si256(__v256,
+                                 ((struct __loadu_si128*)__addr_hi)->__v, 1);
+}
+
+/* SIMD store ops (unaligned) */
+static __inline void __attribute__((__always_inline__, __nodebug__))
+_mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
+{
+  __m128 __v128;
+
+  __v128 = _mm256_castps256_ps128(__a);
+  __builtin_ia32_storeups(__addr_lo, __v128);
+  __v128 = _mm256_extractf128_ps(__a, 1);
+  __builtin_ia32_storeups(__addr_hi, __v128);
+}
+
+static __inline void __attribute__((__always_inline__, __nodebug__))
+_mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
+{
+  __m128d __v128;
+
+  __v128 = _mm256_castpd256_pd128(__a);
+  __builtin_ia32_storeupd(__addr_lo, __v128);
+  __v128 = _mm256_extractf128_pd(__a, 1);
+  __builtin_ia32_storeupd(__addr_hi, __v128);
+}
+
+static __inline void __attribute__((__always_inline__, __nodebug__))
+_mm256_storeu2_m128i(__m128i *__addr_hi, __m128i *__addr_lo, __m256i __a)
+{
+  __m128i __v128;
+
+  __v128 = _mm256_castsi256_si128(__a);
+  __builtin_ia32_storedqu((char *)__addr_lo, (__v16qi)__v128);
+  __v128 = _mm256_extractf128_si256(__a, 1);
+  __builtin_ia32_storedqu((char *)__addr_hi, (__v16qi)__v128);
+}
+
+#endif /* __AVXINTRIN_H */
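/* Illustrative usage sketch (not part of the prebuilt header diff above): one
 * way the loadu2/storeu2 helpers and the 128-bit insert/extract macros from
 * avxintrin.h are typically combined. The macro argument M must be a constant
 * expression, per the comment above the macros. Assumes a host compiler run
 * with -mavx so __AVX__ is defined and <immintrin.h> pulls this header in. */
#include <immintrin.h>

static void add_halves(float *dst, const float *lo, const float *hi)
{
    /* Build a 256-bit vector from two unaligned 128-bit halves. */
    __m256 v = _mm256_loadu2_m128(hi, lo);
    /* Extract each half again; the immediate selects the lane. */
    __m128 low  = _mm256_extractf128_ps(v, 0);
    __m128 high = _mm256_extractf128_ps(v, 1);
    __m128 sum  = _mm_add_ps(low, high);
    _mm_storeu_ps(dst, sum);
}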
diff --git a/23.0.3/clang-include/bmi2intrin.h b/23.0.3/clang-include/bmi2intrin.h
new file mode 100644
index 0000000..a05cfad
--- /dev/null
+++ b/23.0.3/clang-include/bmi2intrin.h
@@ -0,0 +1,94 @@
+/*===---- bmi2intrin.h - BMI2 intrinsics -----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
+#error "Never use <bmi2intrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __BMI2__
+# error "BMI2 instruction set not enabled"
+#endif /* __BMI2__ */
+
+#ifndef __BMI2INTRIN_H
+#define __BMI2INTRIN_H
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+_bzhi_u32(unsigned int __X, unsigned int __Y)
+{
+  return __builtin_ia32_bzhi_si(__X, __Y);
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+_pdep_u32(unsigned int __X, unsigned int __Y)
+{
+  return __builtin_ia32_pdep_si(__X, __Y);
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+_pext_u32(unsigned int __X, unsigned int __Y)
+{
+  return __builtin_ia32_pext_si(__X, __Y);
+}
+
+#ifdef  __x86_64__
+
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+_bzhi_u64(unsigned long long __X, unsigned long long __Y)
+{
+  return __builtin_ia32_bzhi_di(__X, __Y);
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+_pdep_u64(unsigned long long __X, unsigned long long __Y)
+{
+  return __builtin_ia32_pdep_di(__X, __Y);
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+_pext_u64(unsigned long long __X, unsigned long long __Y)
+{
+  return __builtin_ia32_pext_di(__X, __Y);
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+_mulx_u64 (unsigned long long __X, unsigned long long __Y,
+	   unsigned long long *__P)
+{
+  unsigned __int128 __res = (unsigned __int128) __X * __Y;
+  *__P = (unsigned long long) (__res >> 64);
+  return (unsigned long long) __res;
+}
+
+#else /* !__x86_64__ */
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+_mulx_u32 (unsigned int __X, unsigned int __Y, unsigned int *__P)
+{
+  unsigned long long __res = (unsigned long long) __X * __Y;
+  *__P = (unsigned int) (__res >> 32);
+  return (unsigned int) __res;
+}
+
+#endif /* !__x86_64__  */
+
+#endif /* __BMI2INTRIN_H */
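/* Illustrative usage sketch (not part of the prebuilt header diff above): the
 * parallel bit extract wrapper and _mulx_u64 as declared in bmi2intrin.h.
 * _mulx_u64 returns the low 64 bits of the 128-bit product and stores the
 * high half through its pointer argument. Assumes x86-64 with -mbmi2. */
#include <x86intrin.h>

static unsigned long long widen_mul_and_pack(unsigned long long a,
                                             unsigned long long b)
{
    unsigned long long hi;
    unsigned long long lo = _mulx_u64(a, b, &hi);     /* full 128-bit product */
    /* Keep every other bit of the low half, packed to the right. */
    return _pext_u64(lo, 0x5555555555555555ULL) ^ hi;
}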
diff --git a/23.0.3/clang-include/bmiintrin.h b/23.0.3/clang-include/bmiintrin.h
new file mode 100644
index 0000000..0e5fd55
--- /dev/null
+++ b/23.0.3/clang-include/bmiintrin.h
@@ -0,0 +1,148 @@
+/*===---- bmiintrin.h - BMI intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
+#error "Never use <bmiintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __BMI__
+# error "BMI instruction set not enabled"
+#endif /* __BMI__ */
+
+#ifndef __BMIINTRIN_H
+#define __BMIINTRIN_H
+
+#define _tzcnt_u16(a)     (__tzcnt_u16((a)))
+#define _andn_u32(a, b)   (__andn_u32((a), (b)))
+/* _bextr_u32 != __bextr_u32 */
+#define _blsi_u32(a)      (__blsi_u32((a)))
+#define _blsmsk_u32(a)    (__blsmsk_u32((a)))
+#define _blsr_u32(a)      (__blsr_u32((a)))
+#define _tzcnt_u32(a)     (__tzcnt_u32((a)))
+
+static __inline__ unsigned short __attribute__((__always_inline__, __nodebug__))
+__tzcnt_u16(unsigned short __X)
+{
+  return __X ? __builtin_ctzs(__X) : 16;
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+__andn_u32(unsigned int __X, unsigned int __Y)
+{
+  return ~__X & __Y;
+}
+
+/* AMD-specified, double-leading-underscore version of BEXTR */
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+__bextr_u32(unsigned int __X, unsigned int __Y)
+{
+  return __builtin_ia32_bextr_u32(__X, __Y);
+}
+
+/* Intel-specified, single-leading-underscore version of BEXTR */
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+_bextr_u32(unsigned int __X, unsigned int __Y, unsigned int __Z)
+{
+  return __builtin_ia32_bextr_u32 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+__blsi_u32(unsigned int __X)
+{
+  return __X & -__X;
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+__blsmsk_u32(unsigned int __X)
+{
+  return __X ^ (__X - 1);
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+__blsr_u32(unsigned int __X)
+{
+  return __X & (__X - 1);
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+__tzcnt_u32(unsigned int __X)
+{
+  return __X ? __builtin_ctz(__X) : 32;
+}
+
+#ifdef __x86_64__
+
+#define _andn_u64(a, b)   (__andn_u64((a), (b)))
+/* _bextr_u64 != __bextr_u64 */
+#define _blsi_u64(a)      (__blsi_u64((a)))
+#define _blsmsk_u64(a)    (__blsmsk_u64((a)))
+#define _blsr_u64(a)      (__blsr_u64((a)))
+#define _tzcnt_u64(a)     (__tzcnt_u64((a)))
+
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+__andn_u64 (unsigned long long __X, unsigned long long __Y)
+{
+  return ~__X & __Y;
+}
+
+/* AMD-specified, double-leading-underscore version of BEXTR */
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+__bextr_u64(unsigned long long __X, unsigned long long __Y)
+{
+  return __builtin_ia32_bextr_u64(__X, __Y);
+}
+
+/* Intel-specified, single-leading-underscore version of BEXTR */
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+_bextr_u64(unsigned long long __X, unsigned int __Y, unsigned int __Z)
+{
+  return __builtin_ia32_bextr_u64 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+__blsi_u64(unsigned long long __X)
+{
+  return __X & -__X;
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+__blsmsk_u64(unsigned long long __X)
+{
+  return __X ^ (__X - 1);
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+__blsr_u64(unsigned long long __X)
+{
+  return __X & (__X - 1);
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+__tzcnt_u64(unsigned long long __X)
+{
+  return __X ? __builtin_ctzll(__X) : 64;
+}
+
+#endif /* __x86_64__ */
+
+#endif /* __BMIINTRIN_H */
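/* Illustrative usage sketch (not part of the prebuilt header diff above): the
 * two BEXTR spellings that bmiintrin.h keeps distinct. The double-underscore
 * AMD form takes start and length packed into one control word (start in bits
 * 7:0, length in bits 15:8); the single-underscore Intel form takes them as
 * separate arguments. Assumes a compiler run with -mbmi. */
#include <x86intrin.h>

static unsigned int low_byte_both_ways(unsigned int x)
{
    unsigned int amd   = __bextr_u32(x, (8u << 8) | 0u); /* start 0, length 8 */
    unsigned int intel = _bextr_u32(x, 0, 8);             /* same extraction */
    return amd == intel ? __tzcnt_u32(amd) : 0;
}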
diff --git a/23.0.3/clang-include/cpuid.h b/23.0.3/clang-include/cpuid.h
new file mode 100644
index 0000000..5da02e0
--- /dev/null
+++ b/23.0.3/clang-include/cpuid.h
@@ -0,0 +1,209 @@
+/*===---- cpuid.h - X86 cpu model detection --------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !(__x86_64__ || __i386__)
+#error this header is for x86 only
+#endif
+
+/* Responses to identification request with %eax 0 */
+/* AMD:     "AuthenticAMD" */
+#define signature_AMD_ebx 0x68747541
+#define signature_AMD_edx 0x69746e65
+#define signature_AMD_ecx 0x444d4163
+/* CENTAUR: "CentaurHauls" */
+#define signature_CENTAUR_ebx 0x746e6543
+#define signature_CENTAUR_edx 0x48727561
+#define signature_CENTAUR_ecx 0x736c7561
+/* CYRIX:   "CyrixInstead" */
+#define signature_CYRIX_ebx 0x69727943
+#define signature_CYRIX_edx 0x736e4978
+#define signature_CYRIX_ecx 0x64616574
+/* INTEL:   "GenuineIntel" */
+#define signature_INTEL_ebx 0x756e6547
+#define signature_INTEL_edx 0x49656e69
+#define signature_INTEL_ecx 0x6c65746e
+/* TM1:     "TransmetaCPU" */
+#define signature_TM1_ebx 0x6e617254
+#define signature_TM1_edx 0x74656d73
+#define signature_TM1_ecx 0x55504361
+/* TM2:     "GenuineTMx86" */
+#define signature_TM2_ebx 0x756e6547
+#define signature_TM2_edx 0x54656e69
+#define signature_TM2_ecx 0x3638784d
+/* NSC:     "Geode by NSC" */
+#define signature_NSC_ebx 0x646f6547
+#define signature_NSC_edx 0x43534e20
+#define signature_NSC_ecx 0x79622065
+/* NEXGEN:  "NexGenDriven" */
+#define signature_NEXGEN_ebx 0x4778654e
+#define signature_NEXGEN_edx 0x72446e65
+#define signature_NEXGEN_ecx 0x6e657669
+/* RISE:    "RiseRiseRise" */
+#define signature_RISE_ebx 0x65736952
+#define signature_RISE_edx 0x65736952
+#define signature_RISE_ecx 0x65736952
+/* SIS:     "SiS SiS SiS " */
+#define signature_SIS_ebx 0x20536953
+#define signature_SIS_edx 0x20536953
+#define signature_SIS_ecx 0x20536953
+/* UMC:     "UMC UMC UMC " */
+#define signature_UMC_ebx 0x20434d55
+#define signature_UMC_edx 0x20434d55
+#define signature_UMC_ecx 0x20434d55
+/* VIA:     "VIA VIA VIA " */
+#define signature_VIA_ebx 0x20414956
+#define signature_VIA_edx 0x20414956
+#define signature_VIA_ecx 0x20414956
+/* VORTEX:  "Vortex86 SoC" */
+#define signature_VORTEX_ebx 0x74726f56
+#define signature_VORTEX_edx 0x36387865
+#define signature_VORTEX_ecx 0x436f5320
+
+/* Features in %ecx for level 1 */
+#define bit_SSE3        0x00000001
+#define bit_PCLMULQDQ   0x00000002
+#define bit_DTES64      0x00000004
+#define bit_MONITOR     0x00000008
+#define bit_DSCPL       0x00000010
+#define bit_VMX         0x00000020
+#define bit_SMX         0x00000040
+#define bit_EIST        0x00000080
+#define bit_TM2         0x00000100
+#define bit_SSSE3       0x00000200
+#define bit_CNXTID      0x00000400
+#define bit_FMA         0x00001000
+#define bit_CMPXCHG16B  0x00002000
+#define bit_xTPR        0x00004000
+#define bit_PDCM        0x00008000
+#define bit_PCID        0x00020000
+#define bit_DCA         0x00040000
+#define bit_SSE41       0x00080000
+#define bit_SSE42       0x00100000
+#define bit_x2APIC      0x00200000
+#define bit_MOVBE       0x00400000
+#define bit_POPCNT      0x00800000
+#define bit_TSCDeadline 0x01000000
+#define bit_AESNI       0x02000000
+#define bit_XSAVE       0x04000000
+#define bit_OSXSAVE     0x08000000
+#define bit_AVX         0x10000000
+#define bit_RDRND       0x40000000
+
+/* Features in %edx for level 1 */
+#define bit_FPU         0x00000001
+#define bit_VME         0x00000002
+#define bit_DE          0x00000004
+#define bit_PSE         0x00000008
+#define bit_TSC         0x00000010
+#define bit_MSR         0x00000020
+#define bit_PAE         0x00000040
+#define bit_MCE         0x00000080
+#define bit_CX8         0x00000100
+#define bit_APIC        0x00000200
+#define bit_SEP         0x00000800
+#define bit_MTRR        0x00001000
+#define bit_PGE         0x00002000
+#define bit_MCA         0x00004000
+#define bit_CMOV        0x00008000
+#define bit_PAT         0x00010000
+#define bit_PSE36       0x00020000
+#define bit_PSN         0x00040000
+#define bit_CLFSH       0x00080000
+#define bit_DS          0x00200000
+#define bit_ACPI        0x00400000
+#define bit_MMX         0x00800000
+#define bit_FXSR        0x01000000
+#define bit_FXSAVE      bit_FXSR    /* for gcc compat */
+#define bit_SSE         0x02000000
+#define bit_SSE2        0x04000000
+#define bit_SS          0x08000000
+#define bit_HTT         0x10000000
+#define bit_TM          0x20000000
+#define bit_PBE         0x80000000
+
+/* Features in %ebx for level 7 sub-leaf 0 */
+#define bit_FSGSBASE    0x00000001
+#define bit_SMEP        0x00000080
+#define bit_ENH_MOVSB   0x00000200
+
+#if __i386__
+#define __cpuid(__level, __eax, __ebx, __ecx, __edx) \
+    __asm("cpuid" : "=a"(__eax), "=b" (__ebx), "=c"(__ecx), "=d"(__edx) \
+                  : "0"(__level))
+
+#define __cpuid_count(__level, __count, __eax, __ebx, __ecx, __edx) \
+    __asm("cpuid" : "=a"(__eax), "=b" (__ebx), "=c"(__ecx), "=d"(__edx) \
+                  : "0"(__level), "2"(__count))
+#else
+/* x86-64 uses %rbx as the base register, so preserve it. */
+#define __cpuid(__level, __eax, __ebx, __ecx, __edx) \
+    __asm("  xchgq  %%rbx,%q1\n" \
+          "  cpuid\n" \
+          "  xchgq  %%rbx,%q1" \
+        : "=a"(__eax), "=r" (__ebx), "=c"(__ecx), "=d"(__edx) \
+        : "0"(__level))
+
+#define __cpuid_count(__level, __count, __eax, __ebx, __ecx, __edx) \
+    __asm("  xchgq  %%rbx,%q1\n" \
+          "  cpuid\n" \
+          "  xchgq  %%rbx,%q1" \
+        : "=a"(__eax), "=r" (__ebx), "=c"(__ecx), "=d"(__edx) \
+        : "0"(__level), "2"(__count))
+#endif
+
+static __inline int __get_cpuid (unsigned int __level, unsigned int *__eax,
+                                 unsigned int *__ebx, unsigned int *__ecx,
+                                 unsigned int *__edx) {
+    __cpuid(__level, *__eax, *__ebx, *__ecx, *__edx);
+    return 1;
+}
+
+static __inline int __get_cpuid_max (unsigned int __level, unsigned int *__sig)
+{
+    unsigned int __eax, __ebx, __ecx, __edx;
+#if __i386__
+    int __cpuid_supported;
+
+    __asm("  pushfl\n"
+          "  popl   %%eax\n"
+          "  movl   %%eax,%%ecx\n"
+          "  xorl   $0x00200000,%%eax\n"
+          "  pushl  %%eax\n"
+          "  popfl\n"
+          "  pushfl\n"
+          "  popl   %%eax\n"
+          "  movl   $0,%0\n"
+          "  cmpl   %%eax,%%ecx\n"
+          "  je     1f\n"
+          "  movl   $1,%0\n"
+          "1:"
+        : "=r" (__cpuid_supported) : : "eax", "ecx");
+    if (!__cpuid_supported)
+        return 0;
+#endif
+
+    __cpuid(__level, __eax, __ebx, __ecx, __edx);
+    if (__sig)
+        *__sig = __ebx;
+    return __eax;
+}
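/* Illustrative usage sketch (not part of the prebuilt header diff above):
 * probing a feature bit with the helpers this cpuid.h provides. Note that
 * __get_cpuid in this version does not validate __level against
 * __get_cpuid_max, so a caller that cares should check the maximum supported
 * leaf itself, as sketched here. */
#include <cpuid.h>

static int have_sse42(void)
{
    unsigned int eax, ebx, ecx, edx;
    if (__get_cpuid_max(0, 0) < 1)
        return 0;                        /* leaf 1 not supported */
    __get_cpuid(1, &eax, &ebx, &ecx, &edx);
    return (ecx & bit_SSE42) != 0;
}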
diff --git a/23.0.3/clang-include/emmintrin.h b/23.0.3/clang-include/emmintrin.h
new file mode 100644
index 0000000..c764d68
--- /dev/null
+++ b/23.0.3/clang-include/emmintrin.h
@@ -0,0 +1,1475 @@
+/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __EMMINTRIN_H
+#define __EMMINTRIN_H
+
+#ifndef __SSE2__
+#error "SSE2 instruction set not enabled"
+#else
+
+#include <xmmintrin.h>
+
+typedef double __m128d __attribute__((__vector_size__(16)));
+typedef long long __m128i __attribute__((__vector_size__(16)));
+
+/* Type defines.  */
+typedef double __v2df __attribute__ ((__vector_size__ (16)));
+typedef long long __v2di __attribute__ ((__vector_size__ (16)));
+typedef short __v8hi __attribute__((__vector_size__(16)));
+typedef char __v16qi __attribute__((__vector_size__(16)));
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_add_sd(__m128d __a, __m128d __b)
+{
+  __a[0] += __b[0];
+  return __a;
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_add_pd(__m128d __a, __m128d __b)
+{
+  return __a + __b;
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_sub_sd(__m128d __a, __m128d __b)
+{
+  __a[0] -= __b[0];
+  return __a;
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_sub_pd(__m128d __a, __m128d __b)
+{
+  return __a - __b;
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_mul_sd(__m128d __a, __m128d __b)
+{
+  __a[0] *= __b[0];
+  return __a;
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_mul_pd(__m128d __a, __m128d __b)
+{
+  return __a * __b;
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_div_sd(__m128d __a, __m128d __b)
+{
+  __a[0] /= __b[0];
+  return __a;
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_div_pd(__m128d __a, __m128d __b)
+{
+  return __a / __b;
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_sqrt_sd(__m128d __a, __m128d __b)
+{
+  __m128d __c = __builtin_ia32_sqrtsd(__b);
+  return (__m128d) { __c[0], __a[1] };
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_sqrt_pd(__m128d __a)
+{
+  return __builtin_ia32_sqrtpd(__a);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_min_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_minsd(__a, __b);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_min_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_minpd(__a, __b);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_max_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_maxsd(__a, __b);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_max_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_maxpd(__a, __b);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_and_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)((__v4si)__a & (__v4si)__b);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_andnot_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)(~(__v4si)__a & (__v4si)__b);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_or_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)((__v4si)__a | (__v4si)__b);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_xor_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)((__v4si)__a ^ (__v4si)__b);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpeq_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpeqpd(__a, __b);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmplt_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpltpd(__a, __b);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmple_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmplepd(__a, __b);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpgt_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpltpd(__b, __a);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpge_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmplepd(__b, __a);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpord_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpordpd(__a, __b);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpunord_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpunordpd(__a, __b);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpneq_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpneqpd(__a, __b);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpnlt_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpnltpd(__a, __b);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpnle_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpnlepd(__a, __b);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpngt_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpnltpd(__b, __a);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpnge_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpnlepd(__b, __a);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpeq_sd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpeqsd(__a, __b);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmplt_sd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpltsd(__a, __b);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmple_sd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmplesd(__a, __b);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpgt_sd(__m128d __a, __m128d __b)
+{
+  __m128d __c = __builtin_ia32_cmpltsd(__b, __a);
+  return (__m128d) { __c[0], __a[1] };
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpge_sd(__m128d __a, __m128d __b)
+{
+  __m128d __c = __builtin_ia32_cmplesd(__b, __a);
+  return (__m128d) { __c[0], __a[1] };
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpord_sd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpordsd(__a, __b);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpunord_sd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpunordsd(__a, __b);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpneq_sd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpneqsd(__a, __b);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpnlt_sd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpnltsd(__a, __b);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpnle_sd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpnlesd(__a, __b);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpngt_sd(__m128d __a, __m128d __b)
+{
+  __m128d __c = __builtin_ia32_cmpnltsd(__b, __a);
+  return (__m128d) { __c[0], __a[1] };
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpnge_sd(__m128d __a, __m128d __b)
+{
+  __m128d __c = __builtin_ia32_cmpnlesd(__b, __a);
+  return (__m128d) { __c[0], __a[1] };
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_comieq_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_comisdeq(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_comilt_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_comisdlt(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_comile_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_comisdle(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_comigt_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_comisdgt(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_comige_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_comisdge(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_comineq_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_comisdneq(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_ucomieq_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_ucomisdeq(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_ucomilt_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_ucomisdlt(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_ucomile_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_ucomisdle(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_ucomigt_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_ucomisdgt(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_ucomige_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_ucomisdge(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_ucomineq_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_ucomisdneq(__a, __b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtpd_ps(__m128d __a)
+{
+  return __builtin_ia32_cvtpd2ps(__a);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cvtps_pd(__m128 __a)
+{
+  return __builtin_ia32_cvtps2pd(__a);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cvtepi32_pd(__m128i __a)
+{
+  return __builtin_ia32_cvtdq2pd((__v4si)__a);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cvtpd_epi32(__m128d __a)
+{
+  return __builtin_ia32_cvtpd2dq(__a);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_cvtsd_si32(__m128d __a)
+{
+  return __builtin_ia32_cvtsd2si(__a);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtsd_ss(__m128 __a, __m128d __b)
+{
+  __a[0] = __b[0];
+  return __a;
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cvtsi32_sd(__m128d __a, int __b)
+{
+  __a[0] = __b;
+  return __a;
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cvtss_sd(__m128d __a, __m128 __b)
+{
+  __a[0] = __b[0];
+  return __a;
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cvttpd_epi32(__m128d __a)
+{
+  return (__m128i)__builtin_ia32_cvttpd2dq(__a);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_cvttsd_si32(__m128d __a)
+{
+  return __a[0];
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtpd_pi32(__m128d __a)
+{
+  return (__m64)__builtin_ia32_cvtpd2pi(__a);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_cvttpd_pi32(__m128d __a)
+{
+  return (__m64)__builtin_ia32_cvttpd2pi(__a);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cvtpi32_pd(__m64 __a)
+{
+  return __builtin_ia32_cvtpi2pd((__v2si)__a);
+}
+
+static __inline__ double __attribute__((__always_inline__, __nodebug__))
+_mm_cvtsd_f64(__m128d __a)
+{
+  return __a[0];
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_load_pd(double const *__dp)
+{
+  return *(__m128d*)__dp;
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_load1_pd(double const *__dp)
+{
+  struct __mm_load1_pd_struct {
+    double __u;
+  } __attribute__((__packed__, __may_alias__));
+  double __u = ((struct __mm_load1_pd_struct*)__dp)->__u;
+  return (__m128d){ __u, __u };
+}
+
+#define        _mm_load_pd1(dp)        _mm_load1_pd(dp)
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_loadr_pd(double const *__dp)
+{
+  __m128d __u = *(__m128d*)__dp;
+  return __builtin_shufflevector(__u, __u, 1, 0);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_loadu_pd(double const *__dp)
+{
+  struct __loadu_pd {
+    __m128d __v;
+  } __attribute__((__packed__, __may_alias__));
+  return ((struct __loadu_pd*)__dp)->__v;
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_load_sd(double const *__dp)
+{
+  struct __mm_load_sd_struct {
+    double __u;
+  } __attribute__((__packed__, __may_alias__));
+  double __u = ((struct __mm_load_sd_struct*)__dp)->__u;
+  return (__m128d){ __u, 0 };
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_loadh_pd(__m128d __a, double const *__dp)
+{
+  struct __mm_loadh_pd_struct {
+    double __u;
+  } __attribute__((__packed__, __may_alias__));
+  double __u = ((struct __mm_loadh_pd_struct*)__dp)->__u;
+  return (__m128d){ __a[0], __u };
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_loadl_pd(__m128d __a, double const *__dp)
+{
+  struct __mm_loadl_pd_struct {
+    double __u;
+  } __attribute__((__packed__, __may_alias__));
+  double __u = ((struct __mm_loadl_pd_struct*)__dp)->__u;
+  return (__m128d){ __u, __a[1] };
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_set_sd(double __w)
+{
+  return (__m128d){ __w, 0 };
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_set1_pd(double __w)
+{
+  return (__m128d){ __w, __w };
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_set_pd(double __w, double __x)
+{
+  return (__m128d){ __x, __w };
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_setr_pd(double __w, double __x)
+{
+  return (__m128d){ __w, __x };
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_setzero_pd(void)
+{
+  return (__m128d){ 0, 0 };
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_move_sd(__m128d __a, __m128d __b)
+{
+  return (__m128d){ __b[0], __a[1] };
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_store_sd(double *__dp, __m128d __a)
+{
+  struct __mm_store_sd_struct {
+    double __u;
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __mm_store_sd_struct*)__dp)->__u = __a[0];
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_store1_pd(double *__dp, __m128d __a)
+{
+  struct __mm_store1_pd_struct {
+    double __u[2];
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __mm_store1_pd_struct*)__dp)->__u[0] = __a[0];
+  ((struct __mm_store1_pd_struct*)__dp)->__u[1] = __a[0];
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_store_pd(double *__dp, __m128d __a)
+{
+  *(__m128d *)__dp = __a;
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_storeu_pd(double *__dp, __m128d __a)
+{
+  __builtin_ia32_storeupd(__dp, __a);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_storer_pd(double *__dp, __m128d __a)
+{
+  __a = __builtin_shufflevector(__a, __a, 1, 0);
+  *(__m128d *)__dp = __a;
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_storeh_pd(double *__dp, __m128d __a)
+{
+  struct __mm_storeh_pd_struct {
+    double __u;
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1];
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_storel_pd(double *__dp, __m128d __a)
+{
+  struct __mm_storeh_pd_struct {
+    double __u;
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0];
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_add_epi8(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v16qi)__a + (__v16qi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_add_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v8hi)__a + (__v8hi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_add_epi32(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v4si)__a + (__v4si)__b);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_add_si64(__m64 __a, __m64 __b)
+{
+  return __a + __b;
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_add_epi64(__m128i __a, __m128i __b)
+{
+  return __a + __b;
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_adds_epi8(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_adds_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_adds_epu8(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_adds_epu16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_avg_epu8(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_avg_epu16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_madd_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_max_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_max_epu8(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_min_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_min_epu8(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_mulhi_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_mulhi_epu16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_mullo_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v8hi)__a * (__v8hi)__b);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_mul_su32(__m64 __a, __m64 __b)
+{
+  return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_mul_epu32(__m128i __a, __m128i __b)
+{
+  return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sad_epu8(__m128i __a, __m128i __b)
+{
+  return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sub_epi8(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v16qi)__a - (__v16qi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sub_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v8hi)__a - (__v8hi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sub_epi32(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v4si)__a - (__v4si)__b);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_sub_si64(__m64 __a, __m64 __b)
+{
+  return __a - __b;
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sub_epi64(__m128i __a, __m128i __b)
+{
+  return __a - __b;
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_subs_epi8(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_psubsb128((__v16qi)__a, (__v16qi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_subs_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_psubsw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_subs_epu8(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_psubusb128((__v16qi)__a, (__v16qi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_subs_epu16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_psubusw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_and_si128(__m128i __a, __m128i __b)
+{
+  return __a & __b;
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_andnot_si128(__m128i __a, __m128i __b)
+{
+  return ~__a & __b;
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_or_si128(__m128i __a, __m128i __b)
+{
+  return __a | __b;
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_xor_si128(__m128i __a, __m128i __b)
+{
+  return __a ^ __b;
+}
+
+#define _mm_slli_si128(a, imm) __extension__ ({                         \
+  (__m128i)__builtin_shufflevector((__v16qi)_mm_setzero_si128(),        \
+                                   (__v16qi)(__m128i)(a),               \
+                                   ((imm)&0xF0) ? 0 : 16 - ((imm)&0xF), \
+                                   ((imm)&0xF0) ? 0 : 17 - ((imm)&0xF), \
+                                   ((imm)&0xF0) ? 0 : 18 - ((imm)&0xF), \
+                                   ((imm)&0xF0) ? 0 : 19 - ((imm)&0xF), \
+                                   ((imm)&0xF0) ? 0 : 20 - ((imm)&0xF), \
+                                   ((imm)&0xF0) ? 0 : 21 - ((imm)&0xF), \
+                                   ((imm)&0xF0) ? 0 : 22 - ((imm)&0xF), \
+                                   ((imm)&0xF0) ? 0 : 23 - ((imm)&0xF), \
+                                   ((imm)&0xF0) ? 0 : 24 - ((imm)&0xF), \
+                                   ((imm)&0xF0) ? 0 : 25 - ((imm)&0xF), \
+                                   ((imm)&0xF0) ? 0 : 26 - ((imm)&0xF), \
+                                   ((imm)&0xF0) ? 0 : 27 - ((imm)&0xF), \
+                                   ((imm)&0xF0) ? 0 : 28 - ((imm)&0xF), \
+                                   ((imm)&0xF0) ? 0 : 29 - ((imm)&0xF), \
+                                   ((imm)&0xF0) ? 0 : 30 - ((imm)&0xF), \
+                                   ((imm)&0xF0) ? 0 : 31 - ((imm)&0xF)); })
+
+#define _mm_bslli_si128(a, imm) \
+  _mm_slli_si128((a), (imm))
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_slli_epi16(__m128i __a, int __count)
+{
+  return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sll_epi16(__m128i __a, __m128i __count)
+{
+  return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_slli_epi32(__m128i __a, int __count)
+{
+  return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sll_epi32(__m128i __a, __m128i __count)
+{
+  return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_slli_epi64(__m128i __a, int __count)
+{
+  return __builtin_ia32_psllqi128(__a, __count);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sll_epi64(__m128i __a, __m128i __count)
+{
+  return __builtin_ia32_psllq128(__a, __count);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_srai_epi16(__m128i __a, int __count)
+{
+  return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sra_epi16(__m128i __a, __m128i __count)
+{
+  return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_srai_epi32(__m128i __a, int __count)
+{
+  return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sra_epi32(__m128i __a, __m128i __count)
+{
+  return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
+}
+
+#define _mm_srli_si128(a, imm) __extension__ ({                          \
+  (__m128i)__builtin_shufflevector((__v16qi)(__m128i)(a),                \
+                                   (__v16qi)_mm_setzero_si128(),         \
+                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 0,  \
+                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 1,  \
+                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 2,  \
+                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 3,  \
+                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 4,  \
+                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 5,  \
+                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 6,  \
+                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 7,  \
+                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 8,  \
+                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 9,  \
+                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 10, \
+                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 11, \
+                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 12, \
+                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 13, \
+                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 14, \
+                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 15); })
+
+#define _mm_bsrli_si128(a, imm) \
+  _mm_srli_si128((a), (imm))
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_srli_epi16(__m128i __a, int __count)
+{
+  return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_srl_epi16(__m128i __a, __m128i __count)
+{
+  return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_srli_epi32(__m128i __a, int __count)
+{
+  return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_srl_epi32(__m128i __a, __m128i __count)
+{
+  return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_srli_epi64(__m128i __a, int __count)
+{
+  return __builtin_ia32_psrlqi128(__a, __count);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_srl_epi64(__m128i __a, __m128i __count)
+{
+  return __builtin_ia32_psrlq128(__a, __count);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cmpeq_epi8(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v16qi)__a == (__v16qi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cmpeq_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v8hi)__a == (__v8hi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cmpeq_epi32(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v4si)__a == (__v4si)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cmpgt_epi8(__m128i __a, __m128i __b)
+{
+  /* This function always performs a signed comparison, but __v16qi is a char
+     which may be signed or unsigned. */
+  typedef signed char __v16qs __attribute__((__vector_size__(16)));
+  return (__m128i)((__v16qs)__a > (__v16qs)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cmpgt_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v8hi)__a > (__v8hi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cmpgt_epi32(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v4si)__a > (__v4si)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cmplt_epi8(__m128i __a, __m128i __b)
+{
+  return _mm_cmpgt_epi8(__b, __a);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cmplt_epi16(__m128i __a, __m128i __b)
+{
+  return _mm_cmpgt_epi16(__b, __a);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cmplt_epi32(__m128i __a, __m128i __b)
+{
+  return _mm_cmpgt_epi32(__b, __a);
+}
+
+#ifdef __x86_64__
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cvtsi64_sd(__m128d __a, long long __b)
+{
+  __a[0] = __b;
+  return __a;
+}
+
+static __inline__ long long __attribute__((__always_inline__, __nodebug__))
+_mm_cvtsd_si64(__m128d __a)
+{
+  return __builtin_ia32_cvtsd2si64(__a);
+}
+
+static __inline__ long long __attribute__((__always_inline__, __nodebug__))
+_mm_cvttsd_si64(__m128d __a)
+{
+  return __a[0];
+}
+#endif
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtepi32_ps(__m128i __a)
+{
+  return __builtin_ia32_cvtdq2ps((__v4si)__a);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cvtps_epi32(__m128 __a)
+{
+  return (__m128i)__builtin_ia32_cvtps2dq(__a);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cvttps_epi32(__m128 __a)
+{
+  return (__m128i)__builtin_ia32_cvttps2dq(__a);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cvtsi32_si128(int __a)
+{
+  return (__m128i)(__v4si){ __a, 0, 0, 0 };
+}
+
+#ifdef __x86_64__
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cvtsi64_si128(long long __a)
+{
+  return (__m128i){ __a, 0 };
+}
+#endif
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_cvtsi128_si32(__m128i __a)
+{
+  __v4si __b = (__v4si)__a;
+  return __b[0];
+}
+
+#ifdef __x86_64__
+static __inline__ long long __attribute__((__always_inline__, __nodebug__))
+_mm_cvtsi128_si64(__m128i __a)
+{
+  return __a[0];
+}
+#endif
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_load_si128(__m128i const *__p)
+{
+  return *__p;
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_loadu_si128(__m128i const *__p)
+{
+  struct __loadu_si128 {
+    __m128i __v;
+  } __attribute__((__packed__, __may_alias__));
+  return ((struct __loadu_si128*)__p)->__v;
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_loadl_epi64(__m128i const *__p)
+{
+  struct __mm_loadl_epi64_struct {
+    long long __u;
+  } __attribute__((__packed__, __may_alias__));
+  return (__m128i) { ((struct __mm_loadl_epi64_struct*)__p)->__u, 0};
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_set_epi64x(long long q1, long long q0)
+{
+  return (__m128i){ q0, q1 };
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_set_epi64(__m64 q1, __m64 q0)
+{
+  return (__m128i){ (long long)q0, (long long)q1 };
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_set_epi32(int i3, int i2, int i1, int i0)
+{
+  return (__m128i)(__v4si){ i0, i1, i2, i3};
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_set_epi16(short w7, short w6, short w5, short w4, short w3, short w2, short w1, short w0)
+{
+  return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0)
+{
+  return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_set1_epi64x(long long __q)
+{
+  return (__m128i){ __q, __q };
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_set1_epi64(__m64 __q)
+{
+  return (__m128i){ (long long)__q, (long long)__q };
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_set1_epi32(int __i)
+{
+  return (__m128i)(__v4si){ __i, __i, __i, __i };
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_set1_epi16(short __w)
+{
+  return (__m128i)(__v8hi){ __w, __w, __w, __w, __w, __w, __w, __w };
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_set1_epi8(char __b)
+{
+  return (__m128i)(__v16qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b };
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_setr_epi64(__m64 q0, __m64 q1)
+{
+  return (__m128i){ (long long)q0, (long long)q1 };
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_setr_epi32(int i0, int i1, int i2, int i3)
+{
+  return (__m128i)(__v4si){ i0, i1, i2, i3};
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7)
+{
+  return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_setr_epi8(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15)
+{
+  return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_setzero_si128(void)
+{
+  return (__m128i){ 0LL, 0LL };
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_store_si128(__m128i *__p, __m128i __b)
+{
+  *__p = __b;
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_storeu_si128(__m128i *__p, __m128i __b)
+{
+  __builtin_ia32_storedqu((char *)__p, (__v16qi)__b);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
+{
+  __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_storel_epi64(__m128i *__p, __m128i __a)
+{
+  struct __mm_storel_epi64_struct {
+    long long __u;
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __mm_storel_epi64_struct*)__p)->__u = __a[0];
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_stream_pd(double *__p, __m128d __a)
+{
+  __builtin_ia32_movntpd(__p, __a);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_stream_si128(__m128i *__p, __m128i __a)
+{
+  __builtin_ia32_movntdq(__p, __a);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_stream_si32(int *__p, int __a)
+{
+  __builtin_ia32_movnti(__p, __a);
+}
+
+#ifdef __x86_64__
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_stream_si64(long long *__p, long long __a)
+{
+  __builtin_ia32_movnti64(__p, __a);
+}
+#endif
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_clflush(void const *__p)
+{
+  __builtin_ia32_clflush(__p);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_lfence(void)
+{
+  __builtin_ia32_lfence();
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_mfence(void)
+{
+  __builtin_ia32_mfence();
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_packs_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_packs_epi32(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_packus_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_extract_epi16(__m128i __a, int __imm)
+{
+  __v8hi __b = (__v8hi)__a;
+  return (unsigned short)__b[__imm & 7];
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_insert_epi16(__m128i __a, int __b, int __imm)
+{
+  __v8hi __c = (__v8hi)__a;
+  __c[__imm & 7] = __b;
+  return (__m128i)__c;
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_movemask_epi8(__m128i __a)
+{
+  return __builtin_ia32_pmovmskb128((__v16qi)__a);
+}
+
+#define _mm_shuffle_epi32(a, imm) __extension__ ({ \
+  (__m128i)__builtin_shufflevector((__v4si)(__m128i)(a), \
+                                   (__v4si)_mm_set1_epi32(0), \
+                                   (imm) & 0x3, ((imm) & 0xc) >> 2, \
+                                   ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6); })
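+
+/* Each 2-bit field of imm selects one of the four 32-bit source elements,
+   starting with the low result element; e.g. _mm_shuffle_epi32(a, 0x1B)
+   reverses the element order.  The 16-bit lo/hi shuffles below use the same
+   2-bit encoding within their half. */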
+
+#define _mm_shufflelo_epi16(a, imm) __extension__ ({ \
+  (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \
+                                   (__v8hi)_mm_set1_epi16(0), \
+                                   (imm) & 0x3, ((imm) & 0xc) >> 2, \
+                                   ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \
+                                   4, 5, 6, 7); })
+
+#define _mm_shufflehi_epi16(a, imm) __extension__ ({ \
+  (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \
+                                   (__v8hi)_mm_set1_epi16(0), \
+                                   0, 1, 2, 3, \
+                                   4 + (((imm) & 0x03) >> 0), \
+                                   4 + (((imm) & 0x0c) >> 2), \
+                                   4 + (((imm) & 0x30) >> 4), \
+                                   4 + (((imm) & 0xc0) >> 6)); })
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_unpackhi_epi8(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_unpackhi_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_unpackhi_epi32(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_unpackhi_epi64(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_shufflevector(__a, __b, 1, 2+1);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_unpacklo_epi8(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_unpacklo_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_unpacklo_epi32(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_unpacklo_epi64(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_shufflevector(__a, __b, 0, 2+0);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_movepi64_pi64(__m128i __a)
+{
+  return (__m64)__a[0];
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_movpi64_epi64(__m64 __a)
+{
+  return (__m128i){ (long long)__a, 0 };
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_move_epi64(__m128i __a)
+{
+  return __builtin_shufflevector(__a, (__m128i){ 0 }, 0, 2);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_unpackhi_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_shufflevector(__a, __b, 1, 2+1);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_unpacklo_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_shufflevector(__a, __b, 0, 2+0);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_movemask_pd(__m128d __a)
+{
+  return __builtin_ia32_movmskpd(__a);
+}
+
+#define _mm_shuffle_pd(a, b, i) __extension__ ({ \
+  __builtin_shufflevector((__m128d)(a), (__m128d)(b), \
+                          (i) & 1, (((i) & 2) >> 1) + 2); })
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_castpd_ps(__m128d __a)
+{
+  return (__m128)__a;
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_castpd_si128(__m128d __a)
+{
+  return (__m128i)__a;
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_castps_pd(__m128 __a)
+{
+  return (__m128d)__a;
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_castps_si128(__m128 __a)
+{
+  return (__m128i)__a;
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_castsi128_ps(__m128i __a)
+{
+  return (__m128)__a;
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_castsi128_pd(__m128i __a)
+{
+  return (__m128d)__a;
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_pause(void)
+{
+  __asm__ volatile ("pause");
+}
+
+#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
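+
+/* _MM_SHUFFLE2 builds the immediate for _mm_shuffle_pd above: the second
+   argument selects result element 0 from a, the first selects result
+   element 1 from b.  For example, _mm_shuffle_pd(a, b, _MM_SHUFFLE2(1, 0))
+   yields { a[0], b[1] }. */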
+
+#endif /* __SSE2__ */
+
+#endif /* __EMMINTRIN_H */
diff --git a/23.0.3/clang-include/f16cintrin.h b/23.0.3/clang-include/f16cintrin.h
new file mode 100644
index 0000000..f3614c0
--- /dev/null
+++ b/23.0.3/clang-include/f16cintrin.h
@@ -0,0 +1,58 @@
+/*===---- f16cintrin.h - F16C intrinsics -----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
+#error "Never use <f16cintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __F16C__
+# error "F16C instruction is not enabled"
+#endif /* __F16C__ */
+
+#ifndef __F16CINTRIN_H
+#define __F16CINTRIN_H
+
+typedef float __v8sf __attribute__ ((__vector_size__ (32)));
+typedef float __m256 __attribute__ ((__vector_size__ (32)));
+
+#define _mm_cvtps_ph(a, imm) __extension__ ({ \
+  __m128 __a = (a); \
+ (__m128i)__builtin_ia32_vcvtps2ph((__v4sf)__a, (imm)); })
+
+#define _mm256_cvtps_ph(a, imm) __extension__ ({ \
+  __m256 __a = (a); \
+ (__m128i)__builtin_ia32_vcvtps2ph256((__v8sf)__a, (imm)); })
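+
+/* The imm argument is the VCVTPS2PH rounding-control immediate: 0 requests
+   round-to-nearest-even, and setting bit 2 (e.g. 4) uses the current MXCSR
+   rounding mode instead. */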
+
+static __inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtph_ps(__m128i __a)
+{
+  return (__m128)__builtin_ia32_vcvtph2ps((__v8hi)__a);
+}
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_cvtph_ps(__m128i __a)
+{
+  return (__m256)__builtin_ia32_vcvtph2ps256((__v8hi)__a);
+}
+
+#endif /* __F16CINTRIN_H */
diff --git a/23.0.3/clang-include/float.h b/23.0.3/clang-include/float.h
new file mode 100644
index 0000000..238cf76
--- /dev/null
+++ b/23.0.3/clang-include/float.h
@@ -0,0 +1,124 @@
+/*===---- float.h - Characteristics of floating point types ----------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __FLOAT_H
+#define __FLOAT_H
+
+/* If we're on MinGW, fall back to the system's float.h, which might have
+ * additional definitions provided for Windows.
+ * For more details see http://msdn.microsoft.com/en-us/library/y0ybw9fy.aspx
+ */
+#if (defined(__MINGW32__) || defined(_MSC_VER)) && __STDC_HOSTED__ && \
+    __has_include_next(<float.h>)
+#  include_next <float.h>
+
+/* Undefine anything that we'll be redefining below. */
+#  undef FLT_EVAL_METHOD
+#  undef FLT_ROUNDS
+#  undef FLT_RADIX
+#  undef FLT_MANT_DIG
+#  undef DBL_MANT_DIG
+#  undef LDBL_MANT_DIG
+#  undef DECIMAL_DIG
+#  undef FLT_DIG
+#  undef DBL_DIG
+#  undef LDBL_DIG
+#  undef FLT_MIN_EXP
+#  undef DBL_MIN_EXP
+#  undef LDBL_MIN_EXP
+#  undef FLT_MIN_10_EXP
+#  undef DBL_MIN_10_EXP
+#  undef LDBL_MIN_10_EXP
+#  undef FLT_MAX_EXP
+#  undef DBL_MAX_EXP
+#  undef LDBL_MAX_EXP
+#  undef FLT_MAX_10_EXP
+#  undef DBL_MAX_10_EXP
+#  undef LDBL_MAX_10_EXP
+#  undef FLT_MAX
+#  undef DBL_MAX
+#  undef LDBL_MAX
+#  undef FLT_EPSILON
+#  undef DBL_EPSILON
+#  undef LDBL_EPSILON
+#  undef FLT_MIN
+#  undef DBL_MIN
+#  undef LDBL_MIN
+#  if __STDC_VERSION__ >= 201112L || !defined(__STRICT_ANSI__)
+#    undef FLT_TRUE_MIN
+#    undef DBL_TRUE_MIN
+#    undef LDBL_TRUE_MIN
+#  endif
+#endif
+
+/* Characteristics of floating point types, C99 5.2.4.2.2 */
+
+#define FLT_EVAL_METHOD __FLT_EVAL_METHOD__
+#define FLT_ROUNDS (__builtin_flt_rounds())
+#define FLT_RADIX __FLT_RADIX__
+
+#define FLT_MANT_DIG __FLT_MANT_DIG__
+#define DBL_MANT_DIG __DBL_MANT_DIG__
+#define LDBL_MANT_DIG __LDBL_MANT_DIG__
+
+#define DECIMAL_DIG __DECIMAL_DIG__
+
+#define FLT_DIG __FLT_DIG__
+#define DBL_DIG __DBL_DIG__
+#define LDBL_DIG __LDBL_DIG__
+
+#define FLT_MIN_EXP __FLT_MIN_EXP__
+#define DBL_MIN_EXP __DBL_MIN_EXP__
+#define LDBL_MIN_EXP __LDBL_MIN_EXP__
+
+#define FLT_MIN_10_EXP __FLT_MIN_10_EXP__
+#define DBL_MIN_10_EXP __DBL_MIN_10_EXP__
+#define LDBL_MIN_10_EXP __LDBL_MIN_10_EXP__
+
+#define FLT_MAX_EXP __FLT_MAX_EXP__
+#define DBL_MAX_EXP __DBL_MAX_EXP__
+#define LDBL_MAX_EXP __LDBL_MAX_EXP__
+
+#define FLT_MAX_10_EXP __FLT_MAX_10_EXP__
+#define DBL_MAX_10_EXP __DBL_MAX_10_EXP__
+#define LDBL_MAX_10_EXP __LDBL_MAX_10_EXP__
+
+#define FLT_MAX __FLT_MAX__
+#define DBL_MAX __DBL_MAX__
+#define LDBL_MAX __LDBL_MAX__
+
+#define FLT_EPSILON __FLT_EPSILON__
+#define DBL_EPSILON __DBL_EPSILON__
+#define LDBL_EPSILON __LDBL_EPSILON__
+
+#define FLT_MIN __FLT_MIN__
+#define DBL_MIN __DBL_MIN__
+#define LDBL_MIN __LDBL_MIN__
+
+#if __STDC_VERSION__ >= 201112L || !defined(__STRICT_ANSI__)
+#  define FLT_TRUE_MIN __FLT_DENORM_MIN__
+#  define DBL_TRUE_MIN __DBL_DENORM_MIN__
+#  define LDBL_TRUE_MIN __LDBL_DENORM_MIN__
+#endif
+
+#endif /* __FLOAT_H */
diff --git a/23.0.3/clang-include/fma4intrin.h b/23.0.3/clang-include/fma4intrin.h
new file mode 100644
index 0000000..c30920d
--- /dev/null
+++ b/23.0.3/clang-include/fma4intrin.h
@@ -0,0 +1,231 @@
+/*===---- fma4intrin.h - FMA4 intrinsics -----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __X86INTRIN_H
+#error "Never use <fma4intrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __FMA4INTRIN_H
+#define __FMA4INTRIN_H
+
+#ifndef __FMA4__
+# error "FMA4 instruction set is not enabled"
+#else
+
+#include <pmmintrin.h>
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_macc_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddps(__A, __B, __C);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_macc_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddpd(__A, __B, __C);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_macc_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddss(__A, __B, __C);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_macc_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddsd(__A, __B, __C);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_msub_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmsubps(__A, __B, __C);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_msub_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmsubpd(__A, __B, __C);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_msub_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmsubss(__A, __B, __C);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_msub_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmsubsd(__A, __B, __C);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_nmacc_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfnmaddps(__A, __B, __C);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_nmacc_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfnmaddpd(__A, __B, __C);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_nmacc_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfnmaddss(__A, __B, __C);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_nmacc_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfnmaddsd(__A, __B, __C);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_nmsub_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfnmsubps(__A, __B, __C);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_nmsub_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfnmsubpd(__A, __B, __C);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_nmsub_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfnmsubss(__A, __B, __C);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_nmsub_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfnmsubsd(__A, __B, __C);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_maddsub_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddsubps(__A, __B, __C);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_maddsub_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddsubpd(__A, __B, __C);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_msubadd_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmsubaddps(__A, __B, __C);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_msubadd_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmsubaddpd(__A, __B, __C);
+}
+
+static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_macc_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmaddps256(__A, __B, __C);
+}
+
+static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_macc_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmaddpd256(__A, __B, __C);
+}
+
+static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_msub_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmsubps256(__A, __B, __C);
+}
+
+static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_msub_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmsubpd256(__A, __B, __C);
+}
+
+static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_nmacc_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfnmaddps256(__A, __B, __C);
+}
+
+static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_nmacc_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfnmaddpd256(__A, __B, __C);
+}
+
+static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_nmsub_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfnmsubps256(__A, __B, __C);
+}
+
+static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_nmsub_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfnmsubpd256(__A, __B, __C);
+}
+
+static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_maddsub_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmaddsubps256(__A, __B, __C);
+}
+
+static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_maddsub_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmaddsubpd256(__A, __B, __C);
+}
+
+static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_msubadd_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmsubaddps256(__A, __B, __C);
+}
+
+static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_msubadd_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmsubaddpd256(__A, __B, __C);
+}
+
+#endif /* __FMA4__ */
+
+#endif /* __FMA4INTRIN_H */
diff --git a/23.0.3/clang-include/fmaintrin.h b/23.0.3/clang-include/fmaintrin.h
new file mode 100644
index 0000000..6bfd5a8
--- /dev/null
+++ b/23.0.3/clang-include/fmaintrin.h
@@ -0,0 +1,229 @@
+/*===---- fmaintrin.h - FMA intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <fmaintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __FMAINTRIN_H
+#define __FMAINTRIN_H
+
+#ifndef __FMA__
+# error "FMA instruction set is not enabled"
+#else
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddps(__A, __B, __C);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddpd(__A, __B, __C);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddss(__A, __B, __C);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddsd(__A, __B, __C);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmsubps(__A, __B, __C);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmsubpd(__A, __B, __C);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmsubss(__A, __B, __C);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmsubsd(__A, __B, __C);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfnmaddps(__A, __B, __C);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfnmaddpd(__A, __B, __C);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfnmaddss(__A, __B, __C);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfnmaddsd(__A, __B, __C);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfnmsubps(__A, __B, __C);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfnmsubpd(__A, __B, __C);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfnmsubss(__A, __B, __C);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfnmsubsd(__A, __B, __C);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddsubps(__A, __B, __C);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddsubpd(__A, __B, __C);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmsubaddps(__A, __B, __C);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmsubaddpd(__A, __B, __C);
+}
+
+static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmaddps256(__A, __B, __C);
+}
+
+static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmaddpd256(__A, __B, __C);
+}
+
+static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmsubps256(__A, __B, __C);
+}
+
+static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmsubpd256(__A, __B, __C);
+}
+
+static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfnmaddps256(__A, __B, __C);
+}
+
+static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfnmaddpd256(__A, __B, __C);
+}
+
+static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfnmsubps256(__A, __B, __C);
+}
+
+static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfnmsubpd256(__A, __B, __C);
+}
+
+static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmaddsubps256(__A, __B, __C);
+}
+
+static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmaddsubpd256(__A, __B, __C);
+}
+
+static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmsubaddps256(__A, __B, __C);
+}
+
+static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmsubaddpd256(__A, __B, __C);
+}
+
+#endif /* __FMA__ */
+
+#endif /* __FMAINTRIN_H */
diff --git a/23.0.3/clang-include/htmintrin.h b/23.0.3/clang-include/htmintrin.h
new file mode 100644
index 0000000..0088c7c
--- /dev/null
+++ b/23.0.3/clang-include/htmintrin.h
@@ -0,0 +1,226 @@
+/*===---- htmintrin.h - Standard header for PowerPC HTM ---------------===*\
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+\*===----------------------------------------------------------------------===*/
+
+#ifndef __HTMINTRIN_H
+#define __HTMINTRIN_H
+
+#ifndef __HTM__
+#error "HTM instruction set not enabled"
+#endif
+
+#ifdef __powerpc__
+
+#include <stdint.h>
+
+typedef uint64_t texasr_t;
+typedef uint32_t texasru_t;
+typedef uint32_t texasrl_t;
+typedef uintptr_t tfiar_t;
+typedef uintptr_t tfhar_t;
+
+#define _HTM_STATE(CR0) ((CR0 >> 1) & 0x3)
+#define _HTM_NONTRANSACTIONAL 0x0
+#define _HTM_SUSPENDED        0x1
+#define _HTM_TRANSACTIONAL    0x2
+
+#define _TEXASR_EXTRACT_BITS(TEXASR,BITNUM,SIZE) \
+  (((TEXASR) >> (63-(BITNUM))) & ((1<<(SIZE))-1))
+#define _TEXASRU_EXTRACT_BITS(TEXASR,BITNUM,SIZE) \
+  (((TEXASR) >> (31-(BITNUM))) & ((1<<(SIZE))-1))
+
+#define _TEXASR_FAILURE_CODE(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 7, 8)
+#define _TEXASRU_FAILURE_CODE(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 7, 8)
+
+#define _TEXASR_FAILURE_PERSISTENT(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 7, 1)
+#define _TEXASRU_FAILURE_PERSISTENT(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 7, 1)
+
+#define _TEXASR_DISALLOWED(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 8, 1)
+#define _TEXASRU_DISALLOWED(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 8, 1)
+
+#define _TEXASR_NESTING_OVERFLOW(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 9, 1)
+#define _TEXASRU_NESTING_OVERFLOW(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 9, 1)
+
+#define _TEXASR_FOOTPRINT_OVERFLOW(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 10, 1)
+#define _TEXASRU_FOOTPRINT_OVERFLOW(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 10, 1)
+
+#define _TEXASR_SELF_INDUCED_CONFLICT(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 11, 1)
+#define _TEXASRU_SELF_INDUCED_CONFLICT(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 11, 1)
+
+#define _TEXASR_NON_TRANSACTIONAL_CONFLICT(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 12, 1)
+#define _TEXASRU_NON_TRANSACTIONAL_CONFLICT(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 12, 1)
+
+#define _TEXASR_TRANSACTION_CONFLICT(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 13, 1)
+#define _TEXASRU_TRANSACTION_CONFLICT(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 13, 1)
+
+#define _TEXASR_TRANSLATION_INVALIDATION_CONFLICT(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 14, 1)
+#define _TEXASRU_TRANSLATION_INVALIDATION_CONFLICT(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 14, 1)
+
+#define _TEXASR_IMPLEMENTAION_SPECIFIC(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 15, 1)
+#define _TEXASRU_IMPLEMENTAION_SPECIFIC(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 15, 1)
+
+#define _TEXASR_INSTRUCTION_FETCH_CONFLICT(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 16, 1)
+#define _TEXASRU_INSTRUCTION_FETCH_CONFLICT(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 16, 1)
+
+#define _TEXASR_ABORT(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 31, 1)
+#define _TEXASRU_ABORT(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 31, 1)
+
+
+#define _TEXASR_SUSPENDED(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 32, 1)
+
+#define _TEXASR_PRIVILEGE(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 35, 2)
+
+#define _TEXASR_FAILURE_SUMMARY(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 36, 1)
+
+#define _TEXASR_TFIAR_EXACT(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 37, 1)
+
+#define _TEXASR_ROT(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 38, 1)
+
+#define _TEXASR_TRANSACTION_LEVEL(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 63, 12)
+
+#endif /* __powerpc__ */
+
+#ifdef __s390__
+
+/* Condition codes generated by tbegin  */
+#define _HTM_TBEGIN_STARTED       0
+#define _HTM_TBEGIN_INDETERMINATE 1
+#define _HTM_TBEGIN_TRANSIENT     2
+#define _HTM_TBEGIN_PERSISTENT    3
+
+/* The abort codes below this threshold are reserved for machine use.  */
+#define _HTM_FIRST_USER_ABORT_CODE 256
+
+/* The transaction diagnostic block as it is defined in the Principles
+   of Operation, chapter 5-91.  */
+
+struct __htm_tdb {
+  unsigned char format;                /*   0 */
+  unsigned char flags;
+  unsigned char reserved1[4];
+  unsigned short nesting_depth;
+  unsigned long long abort_code;       /*   8 */
+  unsigned long long conflict_token;   /*  16 */
+  unsigned long long atia;             /*  24 */
+  unsigned char eaid;                  /*  32 */
+  unsigned char dxc;
+  unsigned char reserved2[2];
+  unsigned int program_int_id;
+  unsigned long long exception_id;     /*  40 */
+  unsigned long long bea;              /*  48 */
+  unsigned char reserved3[72];         /*  56 */
+  unsigned long long gprs[16];         /* 128 */
+} __attribute__((__packed__, __aligned__ (8)));
+
+
+/* Helper intrinsics to retry tbegin in case of transient failure.  */
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+__builtin_tbegin_retry_null (int retry)
+{
+  int cc, i = 0;
+
+  while ((cc = __builtin_tbegin(0)) == _HTM_TBEGIN_TRANSIENT
+         && i++ < retry)
+    __builtin_tx_assist(i);
+
+  return cc;
+}
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+__builtin_tbegin_retry_tdb (void *tdb, int retry)
+{
+  int cc, i = 0;
+
+  while ((cc = __builtin_tbegin(tdb)) == _HTM_TBEGIN_TRANSIENT
+         && i++ < retry)
+    __builtin_tx_assist(i);
+
+  return cc;
+}
+
+#define __builtin_tbegin_retry(tdb, retry) \
+  (__builtin_constant_p(tdb == 0) && tdb == 0 ? \
+   __builtin_tbegin_retry_null(retry) : \
+   __builtin_tbegin_retry_tdb(tdb, retry))
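+
+/* Minimal usage sketch for the retry helper above: attempt the transaction,
+   allowing a few retries on transient failures, then fall back to a lock.
+
+     if (__builtin_tbegin_retry (0, 5) == _HTM_TBEGIN_STARTED)
+       {
+         ... transactional work ...
+         __builtin_tend ();
+       }
+     else
+       ... locked fallback path ...
+*/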
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+__builtin_tbegin_retry_nofloat_null (int retry)
+{
+  int cc, i = 0;
+
+  while ((cc = __builtin_tbegin_nofloat(0)) == _HTM_TBEGIN_TRANSIENT
+         && i++ < retry)
+    __builtin_tx_assist(i);
+
+  return cc;
+}
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+__builtin_tbegin_retry_nofloat_tdb (void *tdb, int retry)
+{
+  int cc, i = 0;
+
+  while ((cc = __builtin_tbegin_nofloat(tdb)) == _HTM_TBEGIN_TRANSIENT
+         && i++ < retry)
+    __builtin_tx_assist(i);
+
+  return cc;
+}
+
+#define __builtin_tbegin_retry_nofloat(tdb, retry) \
+  (__builtin_constant_p(tdb == 0) && tdb == 0 ? \
+   __builtin_tbegin_retry_nofloat_null(retry) : \
+   __builtin_tbegin_retry_nofloat_tdb(tdb, retry))
+
+#endif /* __s390__ */
+
+#endif /* __HTMINTRIN_H */
diff --git a/23.0.3/clang-include/htmxlintrin.h b/23.0.3/clang-include/htmxlintrin.h
new file mode 100644
index 0000000..30f524d
--- /dev/null
+++ b/23.0.3/clang-include/htmxlintrin.h
@@ -0,0 +1,363 @@
+/*===---- htmxlintrin.h - XL compiler HTM execution intrinsics-------------===*\
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+\*===----------------------------------------------------------------------===*/
+
+#ifndef __HTMXLINTRIN_H
+#define __HTMXLINTRIN_H
+
+#ifndef __HTM__
+#error "HTM instruction set not enabled"
+#endif
+
+#include <htmintrin.h>
+
+#ifdef __powerpc__
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define _TEXASR_PTR(TM_BUF) \
+  ((texasr_t *)((TM_BUF)+0))
+#define _TEXASRU_PTR(TM_BUF) \
+  ((texasru_t *)((TM_BUF)+0))
+#define _TEXASRL_PTR(TM_BUF) \
+  ((texasrl_t *)((TM_BUF)+4))
+#define _TFIAR_PTR(TM_BUF) \
+  ((tfiar_t *)((TM_BUF)+8))
+
+typedef char TM_buff_type[16];
+
+/* This macro can be used to determine whether a transaction was successfully
+   started by the __TM_begin() and __TM_simple_begin() intrinsic functions
+   below.  */
+#define _HTM_TBEGIN_STARTED     1
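+
+/* Minimal usage sketch: enter the transactional path only when __TM_begin()
+   reports _HTM_TBEGIN_STARTED, and consult the diagnostic helpers further
+   below on failure.
+
+     TM_buff_type buf;
+     if (__TM_begin (buf) == _HTM_TBEGIN_STARTED)
+       {
+         ... transactional work ...
+         __TM_end ();
+       }
+     else if (__TM_is_failure_persistent (buf))
+       ... take the non-transactional fallback ...
+*/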
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_simple_begin (void)
+{
+  if (__builtin_expect (__builtin_tbegin (0), 1))
+    return _HTM_TBEGIN_STARTED;
+  return 0;
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_begin (void* const TM_buff)
+{
+  *_TEXASRL_PTR (TM_buff) = 0;
+  if (__builtin_expect (__builtin_tbegin (0), 1))
+    return _HTM_TBEGIN_STARTED;
+#ifdef __powerpc64__
+  *_TEXASR_PTR (TM_buff) = __builtin_get_texasr ();
+#else
+  *_TEXASRU_PTR (TM_buff) = __builtin_get_texasru ();
+  *_TEXASRL_PTR (TM_buff) = __builtin_get_texasr ();
+#endif
+  *_TFIAR_PTR (TM_buff) = __builtin_get_tfiar ();
+  return 0;
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_end (void)
+{
+  if (__builtin_expect (__builtin_tend (0), 1))
+    return 1;
+  return 0;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_abort (void)
+{
+  __builtin_tabort (0);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_named_abort (unsigned char const code)
+{
+  __builtin_tabort (code);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_resume (void)
+{
+  __builtin_tresume ();
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_suspend (void)
+{
+  __builtin_tsuspend ();
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_is_user_abort (void* const TM_buff)
+{
+  texasru_t texasru = *_TEXASRU_PTR (TM_buff);
+  return _TEXASRU_ABORT (texasru);
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_is_named_user_abort (void* const TM_buff, unsigned char *code)
+{
+  texasru_t texasru = *_TEXASRU_PTR (TM_buff);
+
+  *code = _TEXASRU_FAILURE_CODE (texasru);
+  return _TEXASRU_ABORT (texasru);
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_is_illegal (void* const TM_buff)
+{
+  texasru_t texasru = *_TEXASRU_PTR (TM_buff);
+  return _TEXASRU_DISALLOWED (texasru);
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_is_footprint_exceeded (void* const TM_buff)
+{
+  texasru_t texasru = *_TEXASRU_PTR (TM_buff);
+  return _TEXASRU_FOOTPRINT_OVERFLOW (texasru);
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_nesting_depth (void* const TM_buff)
+{
+  texasrl_t texasrl;
+
+  if (_HTM_STATE (__builtin_ttest ()) == _HTM_NONTRANSACTIONAL)
+    {
+      texasrl = *_TEXASRL_PTR (TM_buff);
+      if (!_TEXASR_FAILURE_SUMMARY (texasrl))
+        texasrl = 0;
+    }
+  else
+    texasrl = (texasrl_t) __builtin_get_texasr ();
+
+  return _TEXASR_TRANSACTION_LEVEL (texasrl);
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_is_nested_too_deep(void* const TM_buff)
+{
+  texasru_t texasru = *_TEXASRU_PTR (TM_buff);
+  return _TEXASRU_NESTING_OVERFLOW (texasru);
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_is_conflict(void* const TM_buff)
+{
+  texasru_t texasru = *_TEXASRU_PTR (TM_buff);
+  /* Return TEXASR bits 11 (Self-Induced Conflict) through
+     14 (Translation Invalidation Conflict).  */
+  return (_TEXASRU_EXTRACT_BITS (texasru, 14, 4)) ? 1 : 0;
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_is_failure_persistent(void* const TM_buff)
+{
+  texasru_t texasru = *_TEXASRU_PTR (TM_buff);
+  return _TEXASRU_FAILURE_PERSISTENT (texasru);
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_failure_address(void* const TM_buff)
+{
+  return *_TFIAR_PTR (TM_buff);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_failure_code(void* const TM_buff)
+{
+  return *_TEXASR_PTR (TM_buff);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __powerpc__ */
+
+#ifdef __s390__
+
+#include <stdint.h>
+
+/* These intrinsics are being made available for compatibility with
+   the IBM XL compiler.  For documentation please see the "z/OS XL
+   C/C++ Programming Guide" publicly available on the web.  */
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_simple_begin ()
+{
+  return __builtin_tbegin_nofloat (0);
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_begin (void* const tdb)
+{
+  return __builtin_tbegin_nofloat (tdb);
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_end ()
+{
+  return __builtin_tend ();
+}
+
+static __inline void __attribute__((__always_inline__))
+__TM_abort ()
+{
+  return __builtin_tabort (_HTM_FIRST_USER_ABORT_CODE);
+}
+
+static __inline void __attribute__((__always_inline__, __nodebug__))
+__TM_named_abort (unsigned char const code)
+{
+  return __builtin_tabort ((int)_HTM_FIRST_USER_ABORT_CODE + code);
+}
+
+static __inline void __attribute__((__always_inline__, __nodebug__))
+__TM_non_transactional_store (void* const addr, long long const value)
+{
+  __builtin_non_tx_store ((uint64_t*)addr, (uint64_t)value);
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_nesting_depth (void* const tdb_ptr)
+{
+  int depth = __builtin_tx_nesting_depth ();
+  struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr;
+
+  if (depth != 0)
+    return depth;
+
+  if (tdb->format != 1)
+    return 0;
+  return tdb->nesting_depth;
+}
+
+/* Transaction failure diagnostics */
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_is_user_abort (void* const tdb_ptr)
+{
+  struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr;
+
+  if (tdb->format != 1)
+    return 0;
+
+  return !!(tdb->abort_code >= _HTM_FIRST_USER_ABORT_CODE);
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_is_named_user_abort (void* const tdb_ptr, unsigned char* code)
+{
+  struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr;
+
+  if (tdb->format != 1)
+    return 0;
+
+  if (tdb->abort_code >= _HTM_FIRST_USER_ABORT_CODE)
+    {
+      *code = tdb->abort_code - _HTM_FIRST_USER_ABORT_CODE;
+      return 1;
+    }
+  return 0;
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_is_illegal (void* const tdb_ptr)
+{
+  struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr;
+
+  return (tdb->format == 1
+	  && (tdb->abort_code == 4 /* unfiltered program interruption */
+	      || tdb->abort_code == 11 /* restricted instruction */));
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_is_footprint_exceeded (void* const tdb_ptr)
+{
+  struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr;
+
+  return (tdb->format == 1
+	  && (tdb->abort_code == 7 /* fetch overflow */
+	      || tdb->abort_code == 8 /* store overflow */));
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_is_nested_too_deep (void* const tdb_ptr)
+{
+  struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr;
+
+  return tdb->format == 1 && tdb->abort_code == 13; /* depth exceeded */
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_is_conflict (void* const tdb_ptr)
+{
+  struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr;
+
+  return (tdb->format == 1
+	  && (tdb->abort_code == 9 /* fetch conflict */
+	      || tdb->abort_code == 10 /* store conflict */));
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_is_failure_persistent (long const result)
+{
+  return result == _HTM_TBEGIN_PERSISTENT;
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_failure_address (void* const tdb_ptr)
+{
+  struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr;
+  return tdb->atia;
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_failure_code (void* const tdb_ptr)
+{
+  struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr;
+
+  return tdb->abort_code;
+}
+
+#endif /* __s390__ */
+
+#endif /* __HTMXLINTRIN_H  */
diff --git a/23.0.3/clang-include/ia32intrin.h b/23.0.3/clang-include/ia32intrin.h
new file mode 100644
index 0000000..5adf3f1
--- /dev/null
+++ b/23.0.3/clang-include/ia32intrin.h
@@ -0,0 +1,101 @@
+/* ===-------- ia32intrin.h ---------------------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __X86INTRIN_H
+#error "Never use <ia32intrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __IA32INTRIN_H
+#define __IA32INTRIN_H
+
+#ifdef __x86_64__
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+__readeflags(void)
+{
+  unsigned long long __res = 0;
+  __asm__ __volatile__ ("pushf\n\t"
+                        "popq %0\n"
+                        :"=r"(__res)
+                        :
+                        :
+                       );
+  return __res;
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+__writeeflags(unsigned long long __f)
+{
+  __asm__ __volatile__ ("pushq %0\n\t"
+                        "popf\n"
+                        :
+                        :"r"(__f)
+                        :"flags"
+                       );
+}
+
+#else /* !__x86_64__ */
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+__readeflags(void)
+{
+  unsigned int __res = 0;
+  __asm__ __volatile__ ("pushf\n\t"
+                        "popl %0\n"
+                        :"=r"(__res)
+                        :
+                        :
+                       );
+  return __res;
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+__writeeflags(unsigned int __f)
+{
+  __asm__ __volatile__ ("pushl %0\n\t"
+                        "popf\n"
+                        :
+                        :"r"(__f)
+                        :"flags"
+                       );
+}
+#endif /* !__x86_64__ */
+
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+__rdpmc(int __A) {
+  return __builtin_ia32_rdpmc(__A);
+}
+
+/* __rdtsc */
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+__rdtsc(void) {
+  return __builtin_ia32_rdtsc();
+}
+
+/* __rdtscp */
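+/* In addition to returning the time-stamp counter, stores the IA32_TSC_AUX
+   value (typically a processor id) through __A. */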
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+__rdtscp(unsigned int *__A) {
+  return __builtin_ia32_rdtscp(__A);
+}
+
+#define _rdtsc() __rdtsc()
+
+#endif /* __IA32INTRIN_H */
diff --git a/23.0.3/clang-include/immintrin.h b/23.0.3/clang-include/immintrin.h
new file mode 100644
index 0000000..2400fea
--- /dev/null
+++ b/23.0.3/clang-include/immintrin.h
@@ -0,0 +1,194 @@
+/*===---- immintrin.h - Intel intrinsics -----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#define __IMMINTRIN_H
+
+#ifdef __MMX__
+#include <mmintrin.h>
+#endif
+
+#ifdef __SSE__
+#include <xmmintrin.h>
+#endif
+
+#ifdef __SSE2__
+#include <emmintrin.h>
+#endif
+
+#ifdef __SSE3__
+#include <pmmintrin.h>
+#endif
+
+#ifdef __SSSE3__
+#include <tmmintrin.h>
+#endif
+
+#if defined (__SSE4_2__) || defined (__SSE4_1__)
+#include <smmintrin.h>
+#endif
+
+#if defined (__AES__) || defined (__PCLMUL__)
+#include <wmmintrin.h>
+#endif
+
+#ifdef __AVX__
+#include <avxintrin.h>
+#endif
+
+#ifdef __AVX2__
+#include <avx2intrin.h>
+#endif
+
+#ifdef __BMI__
+#include <bmiintrin.h>
+#endif
+
+#ifdef __BMI2__
+#include <bmi2intrin.h>
+#endif
+
+#ifdef __LZCNT__
+#include <lzcntintrin.h>
+#endif
+
+#ifdef __FMA__
+#include <fmaintrin.h>
+#endif
+
+#ifdef __AVX512F__
+#include <avx512fintrin.h>
+#endif
+
+#ifdef __AVX512VL__
+#include <avx512vlintrin.h>
+#endif
+
+#ifdef __AVX512BW__
+#include <avx512bwintrin.h>
+#endif
+
+#if defined (__AVX512VL__) && defined (__AVX512BW__)
+#include <avx512vlbwintrin.h>
+#endif
+
+#ifdef __AVX512ER__
+#include <avx512erintrin.h>
+#endif
+
+#ifdef __RDRND__
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_rdrand16_step(unsigned short *__p)
+{
+  return __builtin_ia32_rdrand16_step(__p);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_rdrand32_step(unsigned int *__p)
+{
+  return __builtin_ia32_rdrand32_step(__p);
+}
+
+#ifdef __x86_64__
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_rdrand64_step(unsigned long long *__p)
+{
+  return __builtin_ia32_rdrand64_step(__p);
+}
+#endif
+#endif /* __RDRND__ */
+
+#ifdef __FSGSBASE__
+#ifdef __x86_64__
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+_readfsbase_u32(void)
+{
+  return __builtin_ia32_rdfsbase32();
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+_readfsbase_u64(void)
+{
+  return __builtin_ia32_rdfsbase64();
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+_readgsbase_u32(void)
+{
+  return __builtin_ia32_rdgsbase32();
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+_readgsbase_u64(void)
+{
+  return __builtin_ia32_rdgsbase64();
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_writefsbase_u32(unsigned int __V)
+{
+  return __builtin_ia32_wrfsbase32(__V);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_writefsbase_u64(unsigned long long __V)
+{
+  return __builtin_ia32_wrfsbase64(__V);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_writegsbase_u32(unsigned int __V)
+{
+  return __builtin_ia32_wrgsbase32(__V);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_writegsbase_u64(unsigned long long __V)
+{
+  return __builtin_ia32_wrgsbase64(__V);
+}
+#endif
+#endif /* __FSGSBASE__ */
+
+#ifdef __RTM__
+#include <rtmintrin.h>
+#endif
+
+/* FIXME: check __HLE__ as well when HLE is supported. */
+#if defined (__RTM__)
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_xtest(void)
+{
+  return __builtin_ia32_xtest();
+}
+#endif
+
+#ifdef __SHA__
+#include <shaintrin.h>
+#endif
+
+/* Some intrinsics inside adxintrin.h are available only if __ADX__ defined,
+ * whereas others are also available if __ADX__ undefined */
+#include <adxintrin.h>
+
+#endif /* __IMMINTRIN_H */
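/* Usage sketch (illustrative), assuming the translation unit is built with -mrdrnd so
 * __RDRND__ is defined: the customary bounded retry loop around _rdrand32_step, which
 * returns 1 on success and 0 when the hardware entropy source is momentarily drained. */
#include <immintrin.h>

static int rdrand32_retry(unsigned int *out) {
  for (int i = 0; i < 10; ++i)
    if (_rdrand32_step(out))
      return 1;                                /* *out now holds a hardware random value */
  return 0;                                    /* give up after a bounded number of attempts */
}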
diff --git a/23.0.3/clang-include/iso646.h b/23.0.3/clang-include/iso646.h
new file mode 100644
index 0000000..dca13c5
--- /dev/null
+++ b/23.0.3/clang-include/iso646.h
@@ -0,0 +1,43 @@
+/*===---- iso646.h - Standard header for alternate spellings of operators---===
+ *
+ * Copyright (c) 2008 Eli Friedman
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __ISO646_H
+#define __ISO646_H
+
+#ifndef __cplusplus
+#define and    &&
+#define and_eq &=
+#define bitand &
+#define bitor  |
+#define compl  ~
+#define not    !
+#define not_eq !=
+#define or     ||
+#define or_eq  |=
+#define xor    ^
+#define xor_eq ^=
+#endif
+
+#endif /* __ISO646_H */
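/* A short illustration (not part of the header): in C these spellings are ordinary macros,
 * so the condition below is token-for-token identical to (x >= 0 && x < 100) after
 * preprocessing; in C++ the same spellings are built-in keywords and the header adds nothing. */
#include <iso646.h>

static int in_range(int x) {
  return x >= 0 and x < 100;
}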
diff --git a/23.0.3/clang-include/limits.h b/23.0.3/clang-include/limits.h
new file mode 100644
index 0000000..f04187c
--- /dev/null
+++ b/23.0.3/clang-include/limits.h
@@ -0,0 +1,118 @@
+/*===---- limits.h - Standard header for integer sizes --------------------===*\
+ *
+ * Copyright (c) 2009 Chris Lattner
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+\*===----------------------------------------------------------------------===*/
+
+#ifndef __CLANG_LIMITS_H
+#define __CLANG_LIMITS_H
+
+/* The system's limits.h may, in turn, try to #include_next GCC's limits.h.
+   Avert this #include_next madness. */
+#if defined __GNUC__ && !defined _GCC_LIMITS_H_
+#define _GCC_LIMITS_H_
+#endif
+
+/* System headers include a number of constants from POSIX in <limits.h>.
+   Include it if we're hosted. */
+#if __STDC_HOSTED__ && __has_include_next(<limits.h>)
+#include_next <limits.h>
+#endif
+
+/* Many system headers try to "help us out" by defining these.  No really, we
+   know how big each datatype is. */
+#undef  SCHAR_MIN
+#undef  SCHAR_MAX
+#undef  UCHAR_MAX
+#undef  SHRT_MIN
+#undef  SHRT_MAX
+#undef  USHRT_MAX
+#undef  INT_MIN
+#undef  INT_MAX
+#undef  UINT_MAX
+#undef  LONG_MIN
+#undef  LONG_MAX
+#undef  ULONG_MAX
+
+#undef  CHAR_BIT
+#undef  CHAR_MIN
+#undef  CHAR_MAX
+
+/* C90/99 5.2.4.2.1 */
+#define SCHAR_MAX __SCHAR_MAX__
+#define SHRT_MAX  __SHRT_MAX__
+#define INT_MAX   __INT_MAX__
+#define LONG_MAX  __LONG_MAX__
+
+#define SCHAR_MIN (-__SCHAR_MAX__-1)
+#define SHRT_MIN  (-__SHRT_MAX__ -1)
+#define INT_MIN   (-__INT_MAX__  -1)
+#define LONG_MIN  (-__LONG_MAX__ -1L)
+
+#define UCHAR_MAX (__SCHAR_MAX__*2  +1)
+#define USHRT_MAX (__SHRT_MAX__ *2  +1)
+#define UINT_MAX  (__INT_MAX__  *2U +1U)
+#define ULONG_MAX (__LONG_MAX__ *2UL+1UL)
+
+#ifndef MB_LEN_MAX
+#define MB_LEN_MAX 1
+#endif
+
+#define CHAR_BIT  __CHAR_BIT__
+
+#ifdef __CHAR_UNSIGNED__  /* -funsigned-char */
+#define CHAR_MIN 0
+#define CHAR_MAX UCHAR_MAX
+#else
+#define CHAR_MIN SCHAR_MIN
+#define CHAR_MAX __SCHAR_MAX__
+#endif
+
+/* C99 5.2.4.2.1: Added long long.
+   C++11 18.3.3.2: same contents as the Standard C Library header <limits.h>.
+ */
+#if __STDC_VERSION__ >= 199901L || __cplusplus >= 201103L
+
+#undef  LLONG_MIN
+#undef  LLONG_MAX
+#undef  ULLONG_MAX
+
+#define LLONG_MAX  __LONG_LONG_MAX__
+#define LLONG_MIN  (-__LONG_LONG_MAX__-1LL)
+#define ULLONG_MAX (__LONG_LONG_MAX__*2ULL+1ULL)
+#endif
+
+/* LONG_LONG_MIN/LONG_LONG_MAX/ULONG_LONG_MAX are a GNU extension.  It's too bad
+   that we don't have something like #pragma poison that could be used to
+   deprecate a macro - the code should just use LLONG_MAX and friends.
+ */
+#if defined(__GNU_LIBRARY__) ? defined(__USE_GNU) : !defined(__STRICT_ANSI__)
+
+#undef   LONG_LONG_MIN
+#undef   LONG_LONG_MAX
+#undef   ULONG_LONG_MAX
+
+#define LONG_LONG_MAX  __LONG_LONG_MAX__
+#define LONG_LONG_MIN  (-__LONG_LONG_MAX__-1LL)
+#define ULONG_LONG_MAX (__LONG_LONG_MAX__*2ULL+1ULL)
+#endif
+
+#endif /* __CLANG_LIMITS_H */
diff --git a/23.0.3/clang-include/lzcntintrin.h b/23.0.3/clang-include/lzcntintrin.h
new file mode 100644
index 0000000..35d6659
--- /dev/null
+++ b/23.0.3/clang-include/lzcntintrin.h
@@ -0,0 +1,67 @@
+/*===---- lzcntintrin.h - LZCNT intrinsics ---------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
+#error "Never use <lzcntintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __LZCNT__
+# error "LZCNT instruction is not enabled"
+#endif /* __LZCNT__ */
+
+#ifndef __LZCNTINTRIN_H
+#define __LZCNTINTRIN_H
+
+static __inline__ unsigned short __attribute__((__always_inline__, __nodebug__))
+__lzcnt16(unsigned short __X)
+{
+  return __X ? __builtin_clzs(__X) : 16;
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+__lzcnt32(unsigned int __X)
+{
+  return __X ? __builtin_clz(__X) : 32;
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+_lzcnt_u32(unsigned int __X)
+{
+  return __X ? __builtin_clz(__X) : 32;
+}
+
+#ifdef __x86_64__
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+__lzcnt64(unsigned long long __X)
+{
+  return __X ? __builtin_clzll(__X) : 64;
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+_lzcnt_u64(unsigned long long __X)
+{
+  return __X ? __builtin_clzll(__X) : 64;
+}
+#endif
+
+#endif /* __LZCNTINTRIN_H */
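/* Usage sketch (illustrative), assuming -mlzcnt so __LZCNT__ is defined: note that the
 * wrappers above return the operand width (16/32/64) for a zero input, unlike a raw
 * __builtin_clz, whose result is undefined at zero. */
#include <x86intrin.h>

static unsigned int bits_needed(unsigned int v) {
  return 32 - _lzcnt_u32(v);                   /* 0 -> 0, 1 -> 1, 255 -> 8 */
}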
diff --git a/23.0.3/clang-include/mm3dnow.h b/23.0.3/clang-include/mm3dnow.h
new file mode 100644
index 0000000..5242d99
--- /dev/null
+++ b/23.0.3/clang-include/mm3dnow.h
@@ -0,0 +1,162 @@
+/*===---- mm3dnow.h - 3DNow! intrinsics ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef _MM3DNOW_H_INCLUDED
+#define _MM3DNOW_H_INCLUDED
+
+#include <mmintrin.h>
+#include <prfchwintrin.h>
+
+typedef float __v2sf __attribute__((__vector_size__(8)));
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_m_femms() {
+  __builtin_ia32_femms();
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pavgusb(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pavgusb((__v8qi)__m1, (__v8qi)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pf2id(__m64 __m) {
+  return (__m64)__builtin_ia32_pf2id((__v2sf)__m);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pfacc(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfacc((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pfadd(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfadd((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pfcmpeq(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfcmpeq((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pfcmpge(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfcmpge((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pfcmpgt(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfcmpgt((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pfmax(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfmax((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pfmin(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfmin((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pfmul(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfmul((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pfrcp(__m64 __m) {
+  return (__m64)__builtin_ia32_pfrcp((__v2sf)__m);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pfrcpit1(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfrcpit1((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pfrcpit2(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfrcpit2((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pfrsqrt(__m64 __m) {
+  return (__m64)__builtin_ia32_pfrsqrt((__v2sf)__m);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pfrsqrtit1(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfrsqit1((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pfsub(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfsub((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pfsubr(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfsubr((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pi2fd(__m64 __m) {
+  return (__m64)__builtin_ia32_pi2fd((__v2si)__m);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pmulhrw(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pmulhrw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pf2iw(__m64 __m) {
+  return (__m64)__builtin_ia32_pf2iw((__v2sf)__m);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pfnacc(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfnacc((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pfpnacc(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfpnacc((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pi2fw(__m64 __m) {
+  return (__m64)__builtin_ia32_pi2fw((__v2si)__m);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pswapdsf(__m64 __m) {
+  return (__m64)__builtin_ia32_pswapdsf((__v2sf)__m);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_m_pswapdsi(__m64 __m) {
+  return (__m64)__builtin_ia32_pswapdsi((__v2si)__m);
+}
+
+#endif
diff --git a/23.0.3/clang-include/mm_malloc.h b/23.0.3/clang-include/mm_malloc.h
new file mode 100644
index 0000000..305afd3
--- /dev/null
+++ b/23.0.3/clang-include/mm_malloc.h
@@ -0,0 +1,75 @@
+/*===---- mm_malloc.h - Allocating and Freeing Aligned Memory Blocks -------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __MM_MALLOC_H
+#define __MM_MALLOC_H
+
+#include <stdlib.h>
+
+#ifdef _WIN32
+#include <malloc.h>
+#else
+#ifndef __cplusplus
+extern int posix_memalign(void **__memptr, size_t __alignment, size_t __size);
+#else
+// Some systems (e.g. those with GNU libc) declare posix_memalign with an
+// exception specifier. Via an "egregious workaround" in
+// Sema::CheckEquivalentExceptionSpec, Clang accepts the following as a valid
+// redeclaration of glibc's declaration.
+extern "C" int posix_memalign(void **__memptr, size_t __alignment, size_t __size);
+#endif
+#endif
+
+#if !(defined(_WIN32) && defined(_mm_malloc))
+static __inline__ void *__attribute__((__always_inline__, __nodebug__,
+                                       __malloc__))
+_mm_malloc(size_t __size, size_t __align)
+{
+  if (__align == 1) {
+    return malloc(__size);
+  }
+
+  if (!(__align & (__align - 1)) && __align < sizeof(void *))
+    __align = sizeof(void *);
+
+  void *__mallocedMemory;
+#if defined(__MINGW32__)
+  __mallocedMemory = __mingw_aligned_malloc(__size, __align);
+#elif defined(_WIN32)
+  __mallocedMemory = _aligned_malloc(__size, __align);
+#else
+  if (posix_memalign(&__mallocedMemory, __align, __size))
+    return 0;
+#endif
+
+  return __mallocedMemory;
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_free(void *__p)
+{
+  free(__p);
+}
+#endif
+
+#endif /* __MM_MALLOC_H */
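/* Usage sketch (illustrative): a 64-byte-aligned buffer obtained and released through the
 * helpers above; on non-Windows targets the allocation goes through posix_memalign. */
#include <stdint.h>
#include <stdio.h>
#include <mm_malloc.h>

int main(void) {
  float *buf = (float *)_mm_malloc(1024 * sizeof(float), 64);
  if (!buf)
    return 1;
  printf("64-byte aligned: %d\n", (int)(((uintptr_t)buf % 64) == 0));
  _mm_free(buf);                               /* pair with _mm_malloc, not plain free() */
  return 0;
}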
diff --git a/23.0.3/clang-include/mmintrin.h b/23.0.3/clang-include/mmintrin.h
new file mode 100644
index 0000000..986870a
--- /dev/null
+++ b/23.0.3/clang-include/mmintrin.h
@@ -0,0 +1,503 @@
+/*===---- mmintrin.h - MMX intrinsics --------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __MMINTRIN_H
+#define __MMINTRIN_H
+
+#ifndef __MMX__
+#error "MMX instruction set not enabled"
+#else
+
+typedef long long __m64 __attribute__((__vector_size__(8)));
+
+typedef int __v2si __attribute__((__vector_size__(8)));
+typedef short __v4hi __attribute__((__vector_size__(8)));
+typedef char __v8qi __attribute__((__vector_size__(8)));
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_empty(void)
+{
+    __builtin_ia32_emms();
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtsi32_si64(int __i)
+{
+    return (__m64)__builtin_ia32_vec_init_v2si(__i, 0);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_cvtsi64_si32(__m64 __m)
+{
+    return __builtin_ia32_vec_ext_v2si((__v2si)__m, 0);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtsi64_m64(long long __i)
+{
+    return (__m64)__i;
+}
+
+static __inline__ long long __attribute__((__always_inline__, __nodebug__))
+_mm_cvtm64_si64(__m64 __m)
+{
+    return (long long)__m;
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_packs_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_packsswb((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_packs_pi32(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_packssdw((__v2si)__m1, (__v2si)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_packs_pu16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_packuswb((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_unpackhi_pi8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_punpckhbw((__v8qi)__m1, (__v8qi)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_punpckhwd((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_unpackhi_pi32(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_punpckhdq((__v2si)__m1, (__v2si)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_punpcklbw((__v8qi)__m1, (__v8qi)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_punpcklwd((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_unpacklo_pi32(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_punpckldq((__v2si)__m1, (__v2si)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_add_pi8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_paddb((__v8qi)__m1, (__v8qi)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_add_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_paddw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_add_pi32(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_paddd((__v2si)__m1, (__v2si)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_adds_pi8(__m64 __m1, __m64 __m2) 
+{
+    return (__m64)__builtin_ia32_paddsb((__v8qi)__m1, (__v8qi)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_adds_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_paddsw((__v4hi)__m1, (__v4hi)__m2);    
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_adds_pu8(__m64 __m1, __m64 __m2) 
+{
+    return (__m64)__builtin_ia32_paddusb((__v8qi)__m1, (__v8qi)__m2);
+}
+ 
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_adds_pu16(__m64 __m1, __m64 __m2) 
+{
+    return (__m64)__builtin_ia32_paddusw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_sub_pi8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_psubb((__v8qi)__m1, (__v8qi)__m2);
+}
+ 
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_sub_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_psubw((__v4hi)__m1, (__v4hi)__m2);
+}
+ 
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_sub_pi32(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_psubd((__v2si)__m1, (__v2si)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_subs_pi8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_psubsb((__v8qi)__m1, (__v8qi)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_subs_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_psubsw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_subs_pu8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_psubusb((__v8qi)__m1, (__v8qi)__m2);
+}
+ 
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_subs_pu16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_psubusw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_madd_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_pmaddwd((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_mulhi_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_pmulhw((__v4hi)__m1, (__v4hi)__m2);
+}
+ 
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_mullo_pi16(__m64 __m1, __m64 __m2) 
+{
+    return (__m64)__builtin_ia32_pmullw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_sll_pi16(__m64 __m, __m64 __count)
+{
+    return (__m64)__builtin_ia32_psllw((__v4hi)__m, __count);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_slli_pi16(__m64 __m, int __count)
+{
+    return (__m64)__builtin_ia32_psllwi((__v4hi)__m, __count);    
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_sll_pi32(__m64 __m, __m64 __count)
+{
+    return (__m64)__builtin_ia32_pslld((__v2si)__m, __count);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_slli_pi32(__m64 __m, int __count)
+{
+    return (__m64)__builtin_ia32_pslldi((__v2si)__m, __count);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_sll_si64(__m64 __m, __m64 __count)
+{
+    return (__m64)__builtin_ia32_psllq(__m, __count);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_slli_si64(__m64 __m, int __count)
+{
+    return (__m64)__builtin_ia32_psllqi(__m, __count);    
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_sra_pi16(__m64 __m, __m64 __count)
+{
+    return (__m64)__builtin_ia32_psraw((__v4hi)__m, __count);    
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_srai_pi16(__m64 __m, int __count)
+{
+    return (__m64)__builtin_ia32_psrawi((__v4hi)__m, __count);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_sra_pi32(__m64 __m, __m64 __count)
+{
+    return (__m64)__builtin_ia32_psrad((__v2si)__m, __count);    
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_srai_pi32(__m64 __m, int __count)
+{
+    return (__m64)__builtin_ia32_psradi((__v2si)__m, __count);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_srl_pi16(__m64 __m, __m64 __count)
+{
+    return (__m64)__builtin_ia32_psrlw((__v4hi)__m, __count);    
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_srli_pi16(__m64 __m, int __count)
+{
+    return (__m64)__builtin_ia32_psrlwi((__v4hi)__m, __count);    
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_srl_pi32(__m64 __m, __m64 __count)
+{
+    return (__m64)__builtin_ia32_psrld((__v2si)__m, __count);       
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_srli_pi32(__m64 __m, int __count)
+{
+    return (__m64)__builtin_ia32_psrldi((__v2si)__m, __count);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_srl_si64(__m64 __m, __m64 __count)
+{
+    return (__m64)__builtin_ia32_psrlq(__m, __count);    
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_srli_si64(__m64 __m, int __count)
+{
+    return (__m64)__builtin_ia32_psrlqi(__m, __count);    
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_and_si64(__m64 __m1, __m64 __m2)
+{
+    return __builtin_ia32_pand(__m1, __m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_andnot_si64(__m64 __m1, __m64 __m2)
+{
+    return __builtin_ia32_pandn(__m1, __m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_or_si64(__m64 __m1, __m64 __m2)
+{
+    return __builtin_ia32_por(__m1, __m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_xor_si64(__m64 __m1, __m64 __m2)
+{
+    return __builtin_ia32_pxor(__m1, __m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_pcmpeqb((__v8qi)__m1, (__v8qi)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_pcmpeqw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_pcmpeqd((__v2si)__m1, (__v2si)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_pcmpgtb((__v8qi)__m1, (__v8qi)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_pcmpgtw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_pcmpgtd((__v2si)__m1, (__v2si)__m2);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_setzero_si64(void)
+{
+    return (__m64){ 0LL };
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_set_pi32(int __i1, int __i0)
+{
+    return (__m64)__builtin_ia32_vec_init_v2si(__i0, __i1);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_set_pi16(short __s3, short __s2, short __s1, short __s0)
+{
+    return (__m64)__builtin_ia32_vec_init_v4hi(__s0, __s1, __s2, __s3);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2,
+            char __b1, char __b0)
+{
+    return (__m64)__builtin_ia32_vec_init_v8qi(__b0, __b1, __b2, __b3,
+                                               __b4, __b5, __b6, __b7);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_set1_pi32(int __i)
+{
+    return _mm_set_pi32(__i, __i);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_set1_pi16(short __w)
+{
+    return _mm_set_pi16(__w, __w, __w, __w);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_set1_pi8(char __b)
+{
+    return _mm_set_pi8(__b, __b, __b, __b, __b, __b, __b, __b);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_setr_pi32(int __i0, int __i1)
+{
+    return _mm_set_pi32(__i1, __i0);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_setr_pi16(short __w0, short __w1, short __w2, short __w3)
+{
+    return _mm_set_pi16(__w3, __w2, __w1, __w0);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5,
+             char __b6, char __b7)
+{
+    return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
+}
+
+
+/* Aliases for compatibility. */
+#define _m_empty _mm_empty
+#define _m_from_int _mm_cvtsi32_si64
+#define _m_to_int _mm_cvtsi64_si32
+#define _m_packsswb _mm_packs_pi16
+#define _m_packssdw _mm_packs_pi32
+#define _m_packuswb _mm_packs_pu16
+#define _m_punpckhbw _mm_unpackhi_pi8
+#define _m_punpckhwd _mm_unpackhi_pi16
+#define _m_punpckhdq _mm_unpackhi_pi32
+#define _m_punpcklbw _mm_unpacklo_pi8
+#define _m_punpcklwd _mm_unpacklo_pi16
+#define _m_punpckldq _mm_unpacklo_pi32
+#define _m_paddb _mm_add_pi8
+#define _m_paddw _mm_add_pi16
+#define _m_paddd _mm_add_pi32
+#define _m_paddsb _mm_adds_pi8
+#define _m_paddsw _mm_adds_pi16
+#define _m_paddusb _mm_adds_pu8
+#define _m_paddusw _mm_adds_pu16
+#define _m_psubb _mm_sub_pi8
+#define _m_psubw _mm_sub_pi16
+#define _m_psubd _mm_sub_pi32
+#define _m_psubsb _mm_subs_pi8
+#define _m_psubsw _mm_subs_pi16
+#define _m_psubusb _mm_subs_pu8
+#define _m_psubusw _mm_subs_pu16
+#define _m_pmaddwd _mm_madd_pi16
+#define _m_pmulhw _mm_mulhi_pi16
+#define _m_pmullw _mm_mullo_pi16
+#define _m_psllw _mm_sll_pi16
+#define _m_psllwi _mm_slli_pi16
+#define _m_pslld _mm_sll_pi32
+#define _m_pslldi _mm_slli_pi32
+#define _m_psllq _mm_sll_si64
+#define _m_psllqi _mm_slli_si64
+#define _m_psraw _mm_sra_pi16
+#define _m_psrawi _mm_srai_pi16
+#define _m_psrad _mm_sra_pi32
+#define _m_psradi _mm_srai_pi32
+#define _m_psrlw _mm_srl_pi16
+#define _m_psrlwi _mm_srli_pi16
+#define _m_psrld _mm_srl_pi32
+#define _m_psrldi _mm_srli_pi32
+#define _m_psrlq _mm_srl_si64
+#define _m_psrlqi _mm_srli_si64
+#define _m_pand _mm_and_si64
+#define _m_pandn _mm_andnot_si64
+#define _m_por _mm_or_si64
+#define _m_pxor _mm_xor_si64
+#define _m_pcmpeqb _mm_cmpeq_pi8
+#define _m_pcmpeqw _mm_cmpeq_pi16
+#define _m_pcmpeqd _mm_cmpeq_pi32
+#define _m_pcmpgtb _mm_cmpgt_pi8
+#define _m_pcmpgtw _mm_cmpgt_pi16
+#define _m_pcmpgtd _mm_cmpgt_pi32
+
+#endif /* __MMX__ */
+
+#endif /* __MMINTRIN_H */
+
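/* Usage sketch (illustrative), assuming an x86 build with MMX available (default on
 * x86-64, -mmmx on 32-bit): packed 16-bit adds on __m64, then _mm_empty() to leave MMX
 * state before any x87 floating-point use. */
#include <mmintrin.h>

static long long add4_pi16(void) {
  __m64 a = _mm_set_pi16(1, 2, 3, 4);          /* low-to-high lanes: 4, 3, 2, 1 */
  __m64 b = _mm_set_pi16(10, 20, 30, 40);      /* low-to-high lanes: 40, 30, 20, 10 */
  __m64 sum = _mm_add_pi16(a, b);              /* low-to-high lanes: 44, 33, 22, 11 */
  long long out = _mm_cvtm64_si64(sum);
  _mm_empty();
  return out;
}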
diff --git a/23.0.3/clang-include/module.modulemap b/23.0.3/clang-include/module.modulemap
new file mode 100644
index 0000000..bb2ca95
--- /dev/null
+++ b/23.0.3/clang-include/module.modulemap
@@ -0,0 +1,189 @@
+module _Builtin_intrinsics [system] [extern_c] {
+  explicit module altivec {
+    requires altivec
+    header "altivec.h"
+  }
+
+  explicit module arm {
+    requires arm
+
+    explicit module acle {
+      header "arm_acle.h"
+      export *
+    }
+
+    explicit module neon {
+      requires neon
+      header "arm_neon.h"
+      export *
+    }
+  }
+
+  explicit module intel {
+    requires x86
+    export *
+
+    header "immintrin.h"
+    header "x86intrin.h"
+
+    explicit module mm_malloc {
+      header "mm_malloc.h"
+      export * // note: for <stdlib.h> dependency
+    }
+
+    explicit module cpuid {
+      requires x86
+      header "cpuid.h"
+    }
+
+    explicit module mmx {
+      requires mmx
+      header "mmintrin.h"
+    }
+
+    explicit module f16c {
+      requires f16c
+      header "f16cintrin.h"
+    }
+
+    explicit module sse {
+      requires sse
+      export mmx
+      export * // note: for hackish <emmintrin.h> dependency
+      header "xmmintrin.h"
+    }
+
+    explicit module sse2 {
+      requires sse2
+      export sse
+      header "emmintrin.h"
+    }
+
+    explicit module sse3 {
+      requires sse3
+      export sse2
+      header "pmmintrin.h"
+    }
+
+    explicit module ssse3 {
+      requires ssse3
+      export sse3
+      header "tmmintrin.h"
+    }
+
+    explicit module sse4_1 {
+      requires sse41
+      export ssse3
+      header "smmintrin.h"
+    }
+
+    explicit module sse4_2 {
+      requires sse42
+      export sse4_1
+      header "nmmintrin.h"
+    }
+
+    explicit module sse4a {
+      requires sse4a
+      export sse3
+      header "ammintrin.h"
+    }
+
+    explicit module avx {
+      requires avx
+      export sse4_2
+      header "avxintrin.h"
+    }
+
+    explicit module avx2 {
+      requires avx2
+      export avx
+      header "avx2intrin.h"
+    }
+
+    explicit module avx512f {
+      requires avx512f
+      export avx2
+      header "avx512fintrin.h"
+    }
+
+    explicit module avx512er {
+      requires avx512er
+      header "avx512erintrin.h"
+    }
+
+    explicit module bmi {
+      requires bmi
+      header "bmiintrin.h"
+    }
+
+    explicit module bmi2 {
+      requires bmi2
+      header "bmi2intrin.h"
+    }
+
+    explicit module fma {
+      requires fma
+      header "fmaintrin.h"
+    }
+
+    explicit module fma4 {
+      requires fma4
+      export sse3
+      header "fma4intrin.h"
+    }
+
+    explicit module lzcnt {
+      requires lzcnt
+      header "lzcntintrin.h"
+    }
+
+    explicit module popcnt {
+      requires popcnt
+      header "popcntintrin.h"
+    }
+
+    explicit module mm3dnow {
+      requires mm3dnow
+      header "mm3dnow.h"
+    }
+
+    explicit module xop {
+      requires xop
+      export fma4
+      header "xopintrin.h"
+    }
+
+    explicit module aes_pclmul {
+      requires aes, pclmul
+      header "wmmintrin.h"
+    }
+
+    explicit module aes {
+      requires aes
+      header "__wmmintrin_aes.h"
+    }
+
+    explicit module pclmul {
+      requires pclmul
+      header "__wmmintrin_pclmul.h"
+    }
+  }
+
+  explicit module systemz {
+    requires systemz
+    export *
+
+    header "s390intrin.h"
+
+    explicit module htm {
+      requires htm
+      header "htmintrin.h"
+      header "htmxlintrin.h"
+    }
+  }
+}
+
+module _Builtin_stddef_max_align_t [system] [extern_c] {
+  header "__stddef_max_align_t.h"
+}
diff --git a/23.0.3/clang-include/nmmintrin.h b/23.0.3/clang-include/nmmintrin.h
new file mode 100644
index 0000000..f12622d
--- /dev/null
+++ b/23.0.3/clang-include/nmmintrin.h
@@ -0,0 +1,35 @@
+/*===---- nmmintrin.h - SSE4 intrinsics ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef _NMMINTRIN_H
+#define _NMMINTRIN_H
+
+#ifndef __SSE4_2__
+#error "SSE4.2 instruction set not enabled"
+#else
+
+/* To match expectations of gcc we put the sse4.2 definitions into smmintrin.h,
+   just include it now then.  */
+#include <smmintrin.h>
+#endif /* __SSE4_2__ */
+#endif /* _NMMINTRIN_H */
diff --git a/23.0.3/clang-include/pmmintrin.h b/23.0.3/clang-include/pmmintrin.h
new file mode 100644
index 0000000..6f1fc32
--- /dev/null
+++ b/23.0.3/clang-include/pmmintrin.h
@@ -0,0 +1,117 @@
+/*===---- pmmintrin.h - SSE3 intrinsics ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+ 
+#ifndef __PMMINTRIN_H
+#define __PMMINTRIN_H
+
+#ifndef __SSE3__
+#error "SSE3 instruction set not enabled"
+#else
+
+#include <emmintrin.h>
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_lddqu_si128(__m128i const *__p)
+{
+  return (__m128i)__builtin_ia32_lddqu((char const *)__p);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_addsub_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_addsubps(__a, __b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_hadd_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_haddps(__a, __b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_hsub_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_hsubps(__a, __b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_movehdup_ps(__m128 __a)
+{
+  return __builtin_shufflevector(__a, __a, 1, 1, 3, 3);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_moveldup_ps(__m128 __a)
+{
+  return __builtin_shufflevector(__a, __a, 0, 0, 2, 2);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_addsub_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_addsubpd(__a, __b);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_hadd_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_haddpd(__a, __b);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_hsub_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_hsubpd(__a, __b);
+}
+
+#define        _mm_loaddup_pd(dp)        _mm_load1_pd(dp)
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_movedup_pd(__m128d __a)
+{
+  return __builtin_shufflevector(__a, __a, 0, 0);
+}
+
+#define _MM_DENORMALS_ZERO_ON   (0x0040)
+#define _MM_DENORMALS_ZERO_OFF  (0x0000)
+
+#define _MM_DENORMALS_ZERO_MASK (0x0040)
+
+#define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)
+#define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_monitor(void const *__p, unsigned __extensions, unsigned __hints)
+{
+  __builtin_ia32_monitor((void *)__p, __extensions, __hints);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_mwait(unsigned __extensions, unsigned __hints)
+{
+  __builtin_ia32_mwait(__extensions, __hints);
+}
+
+#endif /* __SSE3__ */
+
+#endif /* __PMMINTRIN_H */
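/* Usage sketch (illustrative), assuming -msse3: a horizontal sum of a float vector built
 * from the _mm_hadd_ps defined above (xmmintrin.h, pulled in via emmintrin.h, supplies
 * _mm_cvtss_f32). */
#include <pmmintrin.h>

static float sum4(__m128 v) {
  __m128 t = _mm_hadd_ps(v, v);                /* pairwise sums */
  t = _mm_hadd_ps(t, t);                       /* total replicated in every lane */
  return _mm_cvtss_f32(t);
}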
diff --git a/23.0.3/clang-include/popcntintrin.h b/23.0.3/clang-include/popcntintrin.h
new file mode 100644
index 0000000..d439daa
--- /dev/null
+++ b/23.0.3/clang-include/popcntintrin.h
@@ -0,0 +1,45 @@
+/*===---- popcntintrin.h - POPCNT intrinsics -------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __POPCNT__
+#error "POPCNT instruction set not enabled"
+#endif
+
+#ifndef _POPCNTINTRIN_H
+#define _POPCNTINTRIN_H
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_popcnt_u32(unsigned int __A)
+{
+  return __builtin_popcount(__A);
+}
+
+#ifdef __x86_64__
+static __inline__ long long __attribute__((__always_inline__, __nodebug__))
+_mm_popcnt_u64(unsigned long long __A)
+{
+  return __builtin_popcountll(__A);
+}
+#endif /* __x86_64__ */
+
+#endif /* _POPCNTINTRIN_H */
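/* Usage sketch (illustrative), assuming -mpopcnt so __POPCNT__ is defined (x86intrin.h
 * then pulls this header in): bit parity via the population count wrapper above. */
#include <x86intrin.h>

static int parity32(unsigned int v) {
  return _mm_popcnt_u32(v) & 1;                /* 1 when an odd number of bits are set */
}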
diff --git a/23.0.3/clang-include/prfchwintrin.h b/23.0.3/clang-include/prfchwintrin.h
new file mode 100644
index 0000000..9825bd8
--- /dev/null
+++ b/23.0.3/clang-include/prfchwintrin.h
@@ -0,0 +1,39 @@
+/*===---- prfchwintrin.h - PREFETCHW intrinsic -----------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined(__X86INTRIN_H) && !defined(_MM3DNOW_H_INCLUDED)
+#error "Never use <prfchwintrin.h> directly; include <x86intrin.h> or <mm3dnow.h> instead."
+#endif
+
+#ifndef __PRFCHWINTRIN_H
+#define __PRFCHWINTRIN_H
+
+#if defined(__PRFCHW__) || defined(__3dNOW__)
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_m_prefetchw(void *__P)
+{
+  __builtin_prefetch (__P, 1, 3 /* _MM_HINT_T0 */);
+}
+#endif
+
+#endif /* __PRFCHWINTRIN_H */
diff --git a/23.0.3/clang-include/rdseedintrin.h b/23.0.3/clang-include/rdseedintrin.h
new file mode 100644
index 0000000..0fef1fa
--- /dev/null
+++ b/23.0.3/clang-include/rdseedintrin.h
@@ -0,0 +1,52 @@
+/*===---- rdseedintrin.h - RDSEED intrinsics -------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __X86INTRIN_H
+#error "Never use <rdseedintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __RDSEEDINTRIN_H
+#define __RDSEEDINTRIN_H
+
+#ifdef __RDSEED__
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_rdseed16_step(unsigned short *__p)
+{
+  return __builtin_ia32_rdseed16_step(__p);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_rdseed32_step(unsigned int *__p)
+{
+  return __builtin_ia32_rdseed32_step(__p);
+}
+
+#ifdef __x86_64__
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_rdseed64_step(unsigned long long *__p)
+{
+  return __builtin_ia32_rdseed64_step(__p);
+}
+#endif
+#endif /* __RDSEED__ */
+#endif /* __RDSEEDINTRIN_H */
diff --git a/23.0.3/clang-include/rtmintrin.h b/23.0.3/clang-include/rtmintrin.h
new file mode 100644
index 0000000..26149ca
--- /dev/null
+++ b/23.0.3/clang-include/rtmintrin.h
@@ -0,0 +1,54 @@
+/*===---- rtmintrin.h - RTM intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <rtmintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __RTMINTRIN_H
+#define __RTMINTRIN_H
+
+#define _XBEGIN_STARTED   (~0u)
+#define _XABORT_EXPLICIT  (1 << 0)
+#define _XABORT_RETRY     (1 << 1)
+#define _XABORT_CONFLICT  (1 << 2)
+#define _XABORT_CAPACITY  (1 << 3)
+#define _XABORT_DEBUG     (1 << 4)
+#define _XABORT_NESTED    (1 << 5)
+#define _XABORT_CODE(x)   (((x) >> 24) & 0xFF)
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+_xbegin(void)
+{
+  return __builtin_ia32_xbegin();
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_xend(void)
+{
+  __builtin_ia32_xend();
+}
+
+#define _xabort(imm) __builtin_ia32_xabort((imm))
+
+#endif /* __RTMINTRIN_H */
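/* Usage sketch (illustrative), assuming -mrtm so __RTM__ is defined (immintrin.h then
 * includes this header): the customary _xbegin()/_xend() pattern with an unconditional
 * fallback path, since a transaction may abort at any time. */
#include <immintrin.h>

static int counter;

static void increment(void) {
  unsigned int status = _xbegin();
  if (status == _XBEGIN_STARTED) {
    ++counter;                                 /* transactional path */
    _xend();
  } else {
    __sync_fetch_and_add(&counter, 1);         /* aborted: fall back to an atomic RMW */
  }
}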
diff --git a/23.0.3/clang-include/s390intrin.h b/23.0.3/clang-include/s390intrin.h
new file mode 100644
index 0000000..b209895
--- /dev/null
+++ b/23.0.3/clang-include/s390intrin.h
@@ -0,0 +1,35 @@
+/*===---- s390intrin.h - SystemZ intrinsics --------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __S390INTRIN_H
+#define __S390INTRIN_H
+
+#ifndef __s390__
+#error "<s390intrin.h> is for s390 only"
+#endif
+
+#ifdef __HTM__
+#include <htmintrin.h>
+#endif
+
+#endif /* __S390INTRIN_H*/
diff --git a/23.0.3/clang-include/shaintrin.h b/23.0.3/clang-include/shaintrin.h
new file mode 100644
index 0000000..391a4bb
--- /dev/null
+++ b/23.0.3/clang-include/shaintrin.h
@@ -0,0 +1,74 @@
+/*===---- shaintrin.h - SHA intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <shaintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __SHAINTRIN_H
+#define __SHAINTRIN_H
+
+#if !defined (__SHA__)
+#  error "SHA instructions not enabled"
+#endif
+
+#define _mm_sha1rnds4_epu32(V1, V2, M) __extension__ ({ \
+  __builtin_ia32_sha1rnds4((V1), (V2), (M)); })
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sha1nexte_epu32(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_sha1nexte((__v4si)__X, (__v4si)__Y);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sha1msg1_epu32(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_sha1msg1((__v4si)__X, (__v4si)__Y);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sha1msg2_epu32(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_sha1msg2((__v4si)__X, (__v4si)__Y);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sha256rnds2_epu32(__m128i __X, __m128i __Y, __m128i __Z)
+{
+  return (__m128i)__builtin_ia32_sha256rnds2((__v4si)__X, (__v4si)__Y, (__v4si)__Z);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sha256msg1_epu32(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_sha256msg1((__v4si)__X, (__v4si)__Y);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sha256msg2_epu32(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_sha256msg2((__v4si)__X, (__v4si)__Y);
+}
+
+#endif /* __SHAINTRIN_H */
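
shaintrin.h above wraps the SHA-NI instructions behind the usual immintrin.h umbrella. A small, hedged sketch of calling one of the SHA-256 message-schedule helpers (SHA support enabled, e.g. -msha; a real implementation chains these steps across the full message schedule):

#include <immintrin.h>

/* One intermediate step of the SHA-256 message schedule: combines two
 * vectors of earlier message words via the sha256msg1 instruction. */
__m128i sha256_schedule_step(__m128i w0, __m128i w1)
{
  return _mm_sha256msg1_epu32(w0, w1);
}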
diff --git a/23.0.3/clang-include/smmintrin.h b/23.0.3/clang-include/smmintrin.h
new file mode 100644
index 0000000..6e35734
--- /dev/null
+++ b/23.0.3/clang-include/smmintrin.h
@@ -0,0 +1,482 @@
+/*===---- smmintrin.h - SSE4 intrinsics ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef _SMMINTRIN_H
+#define _SMMINTRIN_H
+
+#ifndef __SSE4_1__
+#error "SSE4.1 instruction set not enabled"
+#else
+
+#include <tmmintrin.h>
+
+/* SSE4 Rounding macros. */
+#define _MM_FROUND_TO_NEAREST_INT    0x00
+#define _MM_FROUND_TO_NEG_INF        0x01
+#define _MM_FROUND_TO_POS_INF        0x02
+#define _MM_FROUND_TO_ZERO           0x03
+#define _MM_FROUND_CUR_DIRECTION     0x04
+
+#define _MM_FROUND_RAISE_EXC         0x00
+#define _MM_FROUND_NO_EXC            0x08
+
+#define _MM_FROUND_NINT      (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT)
+#define _MM_FROUND_FLOOR     (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF)
+#define _MM_FROUND_CEIL      (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF)
+#define _MM_FROUND_TRUNC     (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO)
+#define _MM_FROUND_RINT      (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION)
+#define _MM_FROUND_NEARBYINT (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_ceil_ps(X)       _mm_round_ps((X), _MM_FROUND_CEIL)
+#define _mm_ceil_pd(X)       _mm_round_pd((X), _MM_FROUND_CEIL)
+#define _mm_ceil_ss(X, Y)    _mm_round_ss((X), (Y), _MM_FROUND_CEIL)
+#define _mm_ceil_sd(X, Y)    _mm_round_sd((X), (Y), _MM_FROUND_CEIL)
+
+#define _mm_floor_ps(X)      _mm_round_ps((X), _MM_FROUND_FLOOR)
+#define _mm_floor_pd(X)      _mm_round_pd((X), _MM_FROUND_FLOOR)
+#define _mm_floor_ss(X, Y)   _mm_round_ss((X), (Y), _MM_FROUND_FLOOR)
+#define _mm_floor_sd(X, Y)   _mm_round_sd((X), (Y), _MM_FROUND_FLOOR)
+
+#define _mm_round_ps(X, M) __extension__ ({ \
+  __m128 __X = (X); \
+  (__m128) __builtin_ia32_roundps((__v4sf)__X, (M)); })
+
+#define _mm_round_ss(X, Y, M) __extension__ ({ \
+  __m128 __X = (X); \
+  __m128 __Y = (Y); \
+  (__m128) __builtin_ia32_roundss((__v4sf)__X, (__v4sf)__Y, (M)); })
+
+#define _mm_round_pd(X, M) __extension__ ({ \
+  __m128d __X = (X); \
+  (__m128d) __builtin_ia32_roundpd((__v2df)__X, (M)); })
+
+#define _mm_round_sd(X, Y, M) __extension__ ({ \
+  __m128d __X = (X); \
+  __m128d __Y = (Y); \
+  (__m128d) __builtin_ia32_roundsd((__v2df)__X, (__v2df)__Y, (M)); })
+
+/* SSE4 Packed Blending Intrinsics.  */
+#define _mm_blend_pd(V1, V2, M) __extension__ ({ \
+  __m128d __V1 = (V1); \
+  __m128d __V2 = (V2); \
+  (__m128d)__builtin_shufflevector((__v2df)__V1, (__v2df)__V2, \
+                                   (((M) & 0x01) ? 2 : 0), \
+                                   (((M) & 0x02) ? 3 : 1)); })
+
+#define _mm_blend_ps(V1, V2, M) __extension__ ({ \
+  __m128 __V1 = (V1); \
+  __m128 __V2 = (V2); \
+  (__m128)__builtin_shufflevector((__v4sf)__V1, (__v4sf)__V2, \
+                                  (((M) & 0x01) ? 4 : 0), \
+                                  (((M) & 0x02) ? 5 : 1), \
+                                  (((M) & 0x04) ? 6 : 2), \
+                                  (((M) & 0x08) ? 7 : 3)); })
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_blendv_pd (__m128d __V1, __m128d __V2, __m128d __M)
+{
+  return (__m128d) __builtin_ia32_blendvpd ((__v2df)__V1, (__v2df)__V2,
+                                            (__v2df)__M);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_blendv_ps (__m128 __V1, __m128 __V2, __m128 __M)
+{
+  return (__m128) __builtin_ia32_blendvps ((__v4sf)__V1, (__v4sf)__V2,
+                                           (__v4sf)__M);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M)
+{
+  return (__m128i) __builtin_ia32_pblendvb128 ((__v16qi)__V1, (__v16qi)__V2,
+                                               (__v16qi)__M);
+}
+
+#define _mm_blend_epi16(V1, V2, M) __extension__ ({ \
+  __m128i __V1 = (V1); \
+  __m128i __V2 = (V2); \
+  (__m128i)__builtin_shufflevector((__v8hi)__V1, (__v8hi)__V2, \
+                                   (((M) & 0x01) ?  8 : 0), \
+                                   (((M) & 0x02) ?  9 : 1), \
+                                   (((M) & 0x04) ? 10 : 2), \
+                                   (((M) & 0x08) ? 11 : 3), \
+                                   (((M) & 0x10) ? 12 : 4), \
+                                   (((M) & 0x20) ? 13 : 5), \
+                                   (((M) & 0x40) ? 14 : 6), \
+                                   (((M) & 0x80) ? 15 : 7)); })
+
+/* SSE4 Dword Multiply Instructions.  */
+static __inline__  __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_mullo_epi32 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) ((__v4si)__V1 * (__v4si)__V2);
+}
+
+static __inline__  __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_mul_epi32 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_pmuldq128 ((__v4si)__V1, (__v4si)__V2);
+}
+
+/* SSE4 Floating Point Dot Product Instructions.  */
+#define _mm_dp_ps(X, Y, M) __extension__ ({ \
+  __m128 __X = (X); \
+  __m128 __Y = (Y); \
+  (__m128) __builtin_ia32_dpps((__v4sf)__X, (__v4sf)__Y, (M)); })
+
+#define _mm_dp_pd(X, Y, M) __extension__ ({\
+  __m128d __X = (X); \
+  __m128d __Y = (Y); \
+  (__m128d) __builtin_ia32_dppd((__v2df)__X, (__v2df)__Y, (M)); })
+
+/* SSE4 Streaming Load Hint Instruction.  */
+static __inline__  __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_stream_load_si128 (__m128i *__V)
+{
+  return (__m128i) __builtin_ia32_movntdqa ((__v2di *) __V);
+}
+
+/* SSE4 Packed Integer Min/Max Instructions.  */
+static __inline__  __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_min_epi8 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_pminsb128 ((__v16qi) __V1, (__v16qi) __V2);
+}
+
+static __inline__  __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_max_epi8 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_pmaxsb128 ((__v16qi) __V1, (__v16qi) __V2);
+}
+
+static __inline__  __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_min_epu16 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_pminuw128 ((__v8hi) __V1, (__v8hi) __V2);
+}
+
+static __inline__  __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_max_epu16 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_pmaxuw128 ((__v8hi) __V1, (__v8hi) __V2);
+}
+
+static __inline__  __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_min_epi32 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_pminsd128 ((__v4si) __V1, (__v4si) __V2);
+}
+
+static __inline__  __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_max_epi32 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_pmaxsd128 ((__v4si) __V1, (__v4si) __V2);
+}
+
+static __inline__  __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_min_epu32 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_pminud128((__v4si) __V1, (__v4si) __V2);
+}
+
+static __inline__  __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_max_epu32 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_pmaxud128((__v4si) __V1, (__v4si) __V2);
+}
+
+/* SSE4 Insertion and Extraction from XMM Register Instructions.  */
+#define _mm_insert_ps(X, Y, N) __builtin_ia32_insertps128((X), (Y), (N))
+#define _mm_extract_ps(X, N) (__extension__                      \
+                              ({ union { int __i; float __f; } __t;  \
+                                 __v4sf __a = (__v4sf)(X);       \
+                                 __t.__f = __a[(N) & 3];                 \
+                                 __t.__i;}))
+
+/* Miscellaneous insert and extract macros.  */
+/* Extract a single-precision float from X at index N into D.  */
+#define _MM_EXTRACT_FLOAT(D, X, N) (__extension__ ({ __v4sf __a = (__v4sf)(X); \
+                                                    (D) = __a[N]; }))
+                                                    
+/* Or together 2 sets of indexes (X and Y) with the zeroing bits (Z) to create
+   an index suitable for _mm_insert_ps.  */
+#define _MM_MK_INSERTPS_NDX(X, Y, Z) (((X) << 6) | ((Y) << 4) | (Z))
+                                           
+/* Extract a float from X at index N into the first index of the return.  */
+#define _MM_PICK_OUT_PS(X, N) _mm_insert_ps (_mm_setzero_ps(), (X),   \
+                                             _MM_MK_INSERTPS_NDX((N), 0, 0x0e))
+                                             
+/* Insert int into packed integer array at index.  */
+#define _mm_insert_epi8(X, I, N) (__extension__ ({ __v16qi __a = (__v16qi)(X); \
+                                                   __a[(N) & 15] = (I);             \
+                                                   __a;}))
+#define _mm_insert_epi32(X, I, N) (__extension__ ({ __v4si __a = (__v4si)(X); \
+                                                    __a[(N) & 3] = (I);           \
+                                                    __a;}))
+#ifdef __x86_64__
+#define _mm_insert_epi64(X, I, N) (__extension__ ({ __v2di __a = (__v2di)(X); \
+                                                    __a[(N) & 1] = (I);           \
+                                                    __a;}))
+#endif /* __x86_64__ */
+
+/* Extract int from packed integer array at index.  This returns the element
+ * as a zero extended value, so it is unsigned.
+ */
+#define _mm_extract_epi8(X, N) (__extension__ ({ __v16qi __a = (__v16qi)(X); \
+                                                 (int)(unsigned char) \
+                                                     __a[(N) & 15];}))
+#define _mm_extract_epi32(X, N) (__extension__ ({ __v4si __a = (__v4si)(X); \
+                                                  __a[(N) & 3];}))
+#ifdef __x86_64__
+#define _mm_extract_epi64(X, N) (__extension__ ({ __v2di __a = (__v2di)(X); \
+                                                  __a[(N) & 1];}))
+#endif /* __x86_64 */
+
+/* SSE4 128-bit Packed Integer Comparisons.  */
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_testz_si128(__m128i __M, __m128i __V)
+{
+  return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_testc_si128(__m128i __M, __m128i __V)
+{
+  return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_testnzc_si128(__m128i __M, __m128i __V)
+{
+  return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V);
+}
+
+#define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_cmpeq_epi32((V), (V)))
+#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V))
+#define _mm_test_all_zeros(M, V) _mm_testz_si128 ((M), (V))
+
+/* SSE4 64-bit Packed Integer Comparisons.  */
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cmpeq_epi64(__m128i __V1, __m128i __V2)
+{
+  return (__m128i)((__v2di)__V1 == (__v2di)__V2);
+}
+
+/* SSE4 Packed Integer Sign-Extension.  */
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cvtepi8_epi16(__m128i __V)
+{
+  return (__m128i) __builtin_ia32_pmovsxbw128((__v16qi) __V);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cvtepi8_epi32(__m128i __V)
+{
+  return (__m128i) __builtin_ia32_pmovsxbd128((__v16qi) __V);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cvtepi8_epi64(__m128i __V)
+{
+  return (__m128i) __builtin_ia32_pmovsxbq128((__v16qi) __V);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cvtepi16_epi32(__m128i __V)
+{
+  return (__m128i) __builtin_ia32_pmovsxwd128((__v8hi) __V); 
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cvtepi16_epi64(__m128i __V)
+{
+  return (__m128i) __builtin_ia32_pmovsxwq128((__v8hi)__V);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cvtepi32_epi64(__m128i __V)
+{
+  return (__m128i) __builtin_ia32_pmovsxdq128((__v4si)__V);
+}
+
+/* SSE4 Packed Integer Zero-Extension.  */
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cvtepu8_epi16(__m128i __V)
+{
+  return (__m128i) __builtin_ia32_pmovzxbw128((__v16qi) __V);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cvtepu8_epi32(__m128i __V)
+{
+  return (__m128i) __builtin_ia32_pmovzxbd128((__v16qi)__V);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cvtepu8_epi64(__m128i __V)
+{
+  return (__m128i) __builtin_ia32_pmovzxbq128((__v16qi)__V);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cvtepu16_epi32(__m128i __V)
+{
+  return (__m128i) __builtin_ia32_pmovzxwd128((__v8hi)__V);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cvtepu16_epi64(__m128i __V)
+{
+  return (__m128i) __builtin_ia32_pmovzxwq128((__v8hi)__V);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cvtepu32_epi64(__m128i __V)
+{
+  return (__m128i) __builtin_ia32_pmovzxdq128((__v4si)__V);
+}
+
+/* SSE4 Pack with Unsigned Saturation.  */
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_packus_epi32(__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_packusdw128((__v4si)__V1, (__v4si)__V2);
+}
+
+/* SSE4 Multiple Packed Sums of Absolute Difference.  */
+#define _mm_mpsadbw_epu8(X, Y, M) __extension__ ({ \
+  __m128i __X = (X); \
+  __m128i __Y = (Y); \
+  (__m128i) __builtin_ia32_mpsadbw128((__v16qi)__X, (__v16qi)__Y, (M)); })
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_minpos_epu16(__m128i __V)
+{
+  return (__m128i) __builtin_ia32_phminposuw128((__v8hi)__V);
+}
+
+/* These definitions are normally in nmmintrin.h, but gcc puts them in here
+   so we'll do the same.  */
+#ifdef __SSE4_2__
+
+/* These specify the type of data that we're comparing.  */
+#define _SIDD_UBYTE_OPS                 0x00
+#define _SIDD_UWORD_OPS                 0x01
+#define _SIDD_SBYTE_OPS                 0x02
+#define _SIDD_SWORD_OPS                 0x03
+
+/* These specify the type of comparison operation.  */
+#define _SIDD_CMP_EQUAL_ANY             0x00
+#define _SIDD_CMP_RANGES                0x04
+#define _SIDD_CMP_EQUAL_EACH            0x08
+#define _SIDD_CMP_EQUAL_ORDERED         0x0c
+
+/* These macros specify the polarity of the operation.  */
+#define _SIDD_POSITIVE_POLARITY         0x00
+#define _SIDD_NEGATIVE_POLARITY         0x10
+#define _SIDD_MASKED_POSITIVE_POLARITY  0x20
+#define _SIDD_MASKED_NEGATIVE_POLARITY  0x30
+
+/* These macros are used in _mm_cmpXstri() to specify the return.  */
+#define _SIDD_LEAST_SIGNIFICANT         0x00
+#define _SIDD_MOST_SIGNIFICANT          0x40
+
+/* These macros are used in _mm_cmpXstrm() to specify the return.  */
+#define _SIDD_BIT_MASK                  0x00
+#define _SIDD_UNIT_MASK                 0x40
+
+/* SSE4.2 Packed Comparison Intrinsics.  */
+#define _mm_cmpistrm(A, B, M) __builtin_ia32_pcmpistrm128((A), (B), (M))
+#define _mm_cmpistri(A, B, M) __builtin_ia32_pcmpistri128((A), (B), (M))
+
+#define _mm_cmpestrm(A, LA, B, LB, M) \
+     __builtin_ia32_pcmpestrm128((A), (LA), (B), (LB), (M))
+#define _mm_cmpestri(A, LA, B, LB, M) \
+     __builtin_ia32_pcmpestri128((A), (LA), (B), (LB), (M))
+     
+/* SSE4.2 Packed Comparison Intrinsics and EFlag Reading.  */
+#define _mm_cmpistra(A, B, M) \
+     __builtin_ia32_pcmpistria128((A), (B), (M))
+#define _mm_cmpistrc(A, B, M) \
+     __builtin_ia32_pcmpistric128((A), (B), (M))
+#define _mm_cmpistro(A, B, M) \
+     __builtin_ia32_pcmpistrio128((A), (B), (M))
+#define _mm_cmpistrs(A, B, M) \
+     __builtin_ia32_pcmpistris128((A), (B), (M))
+#define _mm_cmpistrz(A, B, M) \
+     __builtin_ia32_pcmpistriz128((A), (B), (M))
+
+#define _mm_cmpestra(A, LA, B, LB, M) \
+     __builtin_ia32_pcmpestria128((A), (LA), (B), (LB), (M))
+#define _mm_cmpestrc(A, LA, B, LB, M) \
+     __builtin_ia32_pcmpestric128((A), (LA), (B), (LB), (M))
+#define _mm_cmpestro(A, LA, B, LB, M) \
+     __builtin_ia32_pcmpestrio128((A), (LA), (B), (LB), (M))
+#define _mm_cmpestrs(A, LA, B, LB, M) \
+     __builtin_ia32_pcmpestris128((A), (LA), (B), (LB), (M))
+#define _mm_cmpestrz(A, LA, B, LB, M) \
+     __builtin_ia32_pcmpestriz128((A), (LA), (B), (LB), (M))
+
+/* SSE4.2 Compare Packed Data -- Greater Than.  */
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cmpgt_epi64(__m128i __V1, __m128i __V2)
+{
+  return (__m128i)((__v2di)__V1 > (__v2di)__V2);
+}
+
+/* SSE4.2 Accumulate CRC32.  */
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+_mm_crc32_u8(unsigned int __C, unsigned char __D)
+{
+  return __builtin_ia32_crc32qi(__C, __D);
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+_mm_crc32_u16(unsigned int __C, unsigned short __D)
+{
+  return __builtin_ia32_crc32hi(__C, __D);
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+_mm_crc32_u32(unsigned int __C, unsigned int __D)
+{
+  return __builtin_ia32_crc32si(__C, __D);
+}
+
+#ifdef __x86_64__
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+_mm_crc32_u64(unsigned long long __C, unsigned long long __D)
+{
+  return __builtin_ia32_crc32di(__C, __D);
+}
+#endif /* __x86_64__ */
+
+#ifdef __POPCNT__
+#include <popcntintrin.h>
+#endif
+
+#endif /* __SSE4_2__ */
+#endif /* __SSE4_1__ */
+
+#endif /* _SMMINTRIN_H */
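
smmintrin.h above supplies the SSE4.1/SSE4.2 intrinsics. A brief sketch combining the rounding macro it defines with the SSE min/max helpers pulled in through its include chain (compile with -msse4.1; the clamp range is arbitrary):

#include <smmintrin.h>

/* Clamp four packed floats to [0, 1] and round them to the nearest
 * integer value using _mm_round_ps from the header above. */
__m128 clamp_and_round(__m128 v)
{
  v = _mm_max_ps(v, _mm_set1_ps(0.0f));
  v = _mm_min_ps(v, _mm_set1_ps(1.0f));
  return _mm_round_ps(v, _MM_FROUND_NINT);
}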
diff --git a/23.0.3/clang-include/stdalign.h b/23.0.3/clang-include/stdalign.h
new file mode 100644
index 0000000..3738d12
--- /dev/null
+++ b/23.0.3/clang-include/stdalign.h
@@ -0,0 +1,35 @@
+/*===---- stdalign.h - Standard header for alignment ------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __STDALIGN_H
+#define __STDALIGN_H
+
+#ifndef __cplusplus
+#define alignas _Alignas
+#define alignof _Alignof
+#endif
+
+#define __alignas_is_defined 1
+#define __alignof_is_defined 1
+
+#endif /* __STDALIGN_H */
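
stdalign.h above simply maps the C11 alignas/alignof keywords onto the _Alignas/_Alignof builtins. A short sketch of what that enables, assuming C11 mode:

#include <stdalign.h>

/* Request 64-byte alignment for a static buffer. */
alignas(64) static char dma_buffer[256];

/* alignof expands to _Alignof and is usable in constant expressions. */
_Static_assert(64 % alignof(char) == 0, "64 is a multiple of char alignment");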
diff --git a/23.0.3/clang-include/stdarg.h b/23.0.3/clang-include/stdarg.h
new file mode 100644
index 0000000..a57e183
--- /dev/null
+++ b/23.0.3/clang-include/stdarg.h
@@ -0,0 +1,52 @@
+/*===---- stdarg.h - Variable argument handling ----------------------------===
+ *
+ * Copyright (c) 2008 Eli Friedman
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __STDARG_H
+#define __STDARG_H
+
+#ifndef _VA_LIST
+typedef __builtin_va_list va_list;
+#define _VA_LIST
+#endif
+#define va_start(ap, param) __builtin_va_start(ap, param)
+#define va_end(ap)          __builtin_va_end(ap)
+#define va_arg(ap, type)    __builtin_va_arg(ap, type)
+
+/* GCC always defines __va_copy, but does not define va_copy unless in C99 mode
+ * or -ansi is not specified, since it was not part of C90.
+ */
+#define __va_copy(d,s) __builtin_va_copy(d,s)
+
+#if __STDC_VERSION__ >= 199901L || __cplusplus >= 201103L || !defined(__STRICT_ANSI__)
+#define va_copy(dest, src)  __builtin_va_copy(dest, src)
+#endif
+
+/* Hack required to make standard headers work, at least on Ubuntu */
+#ifndef __GNUC_VA_LIST
+#define __GNUC_VA_LIST 1
+#endif
+typedef __builtin_va_list __gnuc_va_list;
+
+#endif /* __STDARG_H */
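
stdarg.h above forwards the va_* macros to the compiler builtins. A minimal variadic function written against it, as a sketch:

#include <stdarg.h>

/* Sum `count` ints passed as variadic arguments. */
int sum_ints(int count, ...)
{
  va_list ap;
  int total = 0;

  va_start(ap, count);
  while (count-- > 0)
    total += va_arg(ap, int);
  va_end(ap);

  return total;
}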
diff --git a/23.0.3/clang-include/stdatomic.h b/23.0.3/clang-include/stdatomic.h
new file mode 100644
index 0000000..e037987
--- /dev/null
+++ b/23.0.3/clang-include/stdatomic.h
@@ -0,0 +1,190 @@
+/*===---- stdatomic.h - Standard header for atomic types and operations -----===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CLANG_STDATOMIC_H
+#define __CLANG_STDATOMIC_H
+
+/* If we're hosted, fall back to the system's stdatomic.h. FreeBSD, for
+ * example, already has a Clang-compatible stdatomic.h header.
+ */
+#if __STDC_HOSTED__ && __has_include_next(<stdatomic.h>)
+# include_next <stdatomic.h>
+#else
+
+#include <stddef.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* 7.17.1 Introduction */
+
+#define ATOMIC_BOOL_LOCK_FREE       __GCC_ATOMIC_BOOL_LOCK_FREE
+#define ATOMIC_CHAR_LOCK_FREE       __GCC_ATOMIC_CHAR_LOCK_FREE
+#define ATOMIC_CHAR16_T_LOCK_FREE   __GCC_ATOMIC_CHAR16_T_LOCK_FREE
+#define ATOMIC_CHAR32_T_LOCK_FREE   __GCC_ATOMIC_CHAR32_T_LOCK_FREE
+#define ATOMIC_WCHAR_T_LOCK_FREE    __GCC_ATOMIC_WCHAR_T_LOCK_FREE
+#define ATOMIC_SHORT_T_LOCK_FREE    __GCC_ATOMIC_SHORT_T_LOCK_FREE
+#define ATOMIC_INT_T_LOCK_FREE      __GCC_ATOMIC_INT_T_LOCK_FREE
+#define ATOMIC_LONG_T_LOCK_FREE     __GCC_ATOMIC_LONG_T_LOCK_FREE
+#define ATOMIC_LLONG_T_LOCK_FREE    __GCC_ATOMIC_LLONG_T_LOCK_FREE
+#define ATOMIC_POINTER_T_LOCK_FREE  __GCC_ATOMIC_POINTER_T_LOCK_FREE
+
+/* 7.17.2 Initialization */
+
+#define ATOMIC_VAR_INIT(value) (value)
+#define atomic_init __c11_atomic_init
+
+/* 7.17.3 Order and consistency */
+
+typedef enum memory_order {
+  memory_order_relaxed = __ATOMIC_RELAXED,
+  memory_order_consume = __ATOMIC_CONSUME,
+  memory_order_acquire = __ATOMIC_ACQUIRE,
+  memory_order_release = __ATOMIC_RELEASE,
+  memory_order_acq_rel = __ATOMIC_ACQ_REL,
+  memory_order_seq_cst = __ATOMIC_SEQ_CST
+} memory_order;
+
+#define kill_dependency(y) (y)
+
+/* 7.17.4 Fences */
+
+/* These should be provided by the libc implementation. */
+void atomic_thread_fence(memory_order);
+void atomic_signal_fence(memory_order);
+
+#define atomic_thread_fence(order) __c11_atomic_thread_fence(order)
+#define atomic_signal_fence(order) __c11_atomic_signal_fence(order)
+
+/* 7.17.5 Lock-free property */
+
+#define atomic_is_lock_free(obj) __c11_atomic_is_lock_free(sizeof(*(obj)))
+
+/* 7.17.6 Atomic integer types */
+
+#ifdef __cplusplus
+typedef _Atomic(bool)               atomic_bool;
+#else
+typedef _Atomic(_Bool)              atomic_bool;
+#endif
+typedef _Atomic(char)               atomic_char;
+typedef _Atomic(signed char)        atomic_schar;
+typedef _Atomic(unsigned char)      atomic_uchar;
+typedef _Atomic(short)              atomic_short;
+typedef _Atomic(unsigned short)     atomic_ushort;
+typedef _Atomic(int)                atomic_int;
+typedef _Atomic(unsigned int)       atomic_uint;
+typedef _Atomic(long)               atomic_long;
+typedef _Atomic(unsigned long)      atomic_ulong;
+typedef _Atomic(long long)          atomic_llong;
+typedef _Atomic(unsigned long long) atomic_ullong;
+typedef _Atomic(uint_least16_t)     atomic_char16_t;
+typedef _Atomic(uint_least32_t)     atomic_char32_t;
+typedef _Atomic(wchar_t)            atomic_wchar_t;
+typedef _Atomic(int_least8_t)       atomic_int_least8_t;
+typedef _Atomic(uint_least8_t)      atomic_uint_least8_t;
+typedef _Atomic(int_least16_t)      atomic_int_least16_t;
+typedef _Atomic(uint_least16_t)     atomic_uint_least16_t;
+typedef _Atomic(int_least32_t)      atomic_int_least32_t;
+typedef _Atomic(uint_least32_t)     atomic_uint_least32_t;
+typedef _Atomic(int_least64_t)      atomic_int_least64_t;
+typedef _Atomic(uint_least64_t)     atomic_uint_least64_t;
+typedef _Atomic(int_fast8_t)        atomic_int_fast8_t;
+typedef _Atomic(uint_fast8_t)       atomic_uint_fast8_t;
+typedef _Atomic(int_fast16_t)       atomic_int_fast16_t;
+typedef _Atomic(uint_fast16_t)      atomic_uint_fast16_t;
+typedef _Atomic(int_fast32_t)       atomic_int_fast32_t;
+typedef _Atomic(uint_fast32_t)      atomic_uint_fast32_t;
+typedef _Atomic(int_fast64_t)       atomic_int_fast64_t;
+typedef _Atomic(uint_fast64_t)      atomic_uint_fast64_t;
+typedef _Atomic(intptr_t)           atomic_intptr_t;
+typedef _Atomic(uintptr_t)          atomic_uintptr_t;
+typedef _Atomic(size_t)             atomic_size_t;
+typedef _Atomic(ptrdiff_t)          atomic_ptrdiff_t;
+typedef _Atomic(intmax_t)           atomic_intmax_t;
+typedef _Atomic(uintmax_t)          atomic_uintmax_t;
+
+/* 7.17.7 Operations on atomic types */
+
+#define atomic_store(object, desired) __c11_atomic_store(object, desired, __ATOMIC_SEQ_CST)
+#define atomic_store_explicit __c11_atomic_store
+
+#define atomic_load(object) __c11_atomic_load(object, __ATOMIC_SEQ_CST)
+#define atomic_load_explicit __c11_atomic_load
+
+#define atomic_exchange(object, desired) __c11_atomic_exchange(object, desired, __ATOMIC_SEQ_CST)
+#define atomic_exchange_explicit __c11_atomic_exchange
+
+#define atomic_compare_exchange_strong(object, expected, desired) __c11_atomic_compare_exchange_strong(object, expected, desired, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
+#define atomic_compare_exchange_strong_explicit __c11_atomic_compare_exchange_strong
+
+#define atomic_compare_exchange_weak(object, expected, desired) __c11_atomic_compare_exchange_weak(object, expected, desired, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
+#define atomic_compare_exchange_weak_explicit __c11_atomic_compare_exchange_weak
+
+#define atomic_fetch_add(object, operand) __c11_atomic_fetch_add(object, operand, __ATOMIC_SEQ_CST)
+#define atomic_fetch_add_explicit __c11_atomic_fetch_add
+
+#define atomic_fetch_sub(object, operand) __c11_atomic_fetch_sub(object, operand, __ATOMIC_SEQ_CST)
+#define atomic_fetch_sub_explicit __c11_atomic_fetch_sub
+
+#define atomic_fetch_or(object, operand) __c11_atomic_fetch_or(object, operand, __ATOMIC_SEQ_CST)
+#define atomic_fetch_or_explicit __c11_atomic_fetch_or
+
+#define atomic_fetch_xor(object, operand) __c11_atomic_fetch_xor(object, operand, __ATOMIC_SEQ_CST)
+#define atomic_fetch_xor_explicit __c11_atomic_fetch_xor
+
+#define atomic_fetch_and(object, operand) __c11_atomic_fetch_and(object, operand, __ATOMIC_SEQ_CST)
+#define atomic_fetch_and_explicit __c11_atomic_fetch_and
+
+/* 7.17.8 Atomic flag type and operations */
+
+typedef struct atomic_flag { atomic_bool _Value; } atomic_flag;
+
+#define ATOMIC_FLAG_INIT { 0 }
+
+/* These should be provided by the libc implementation. */
+#ifdef __cplusplus
+bool atomic_flag_test_and_set(volatile atomic_flag *);
+bool atomic_flag_test_and_set_explicit(volatile atomic_flag *, memory_order);
+#else
+_Bool atomic_flag_test_and_set(volatile atomic_flag *);
+_Bool atomic_flag_test_and_set_explicit(volatile atomic_flag *, memory_order);
+#endif
+void atomic_flag_clear(volatile atomic_flag *);
+void atomic_flag_clear_explicit(volatile atomic_flag *, memory_order);
+
+#define atomic_flag_test_and_set(object) __c11_atomic_exchange(&(object)->_Value, 1, __ATOMIC_SEQ_CST)
+#define atomic_flag_test_and_set_explicit(object, order) __c11_atomic_exchange(&(object)->_Value, 1, order)
+
+#define atomic_flag_clear(object) __c11_atomic_store(&(object)->_Value, 0, __ATOMIC_SEQ_CST)
+#define atomic_flag_clear_explicit(object, order) __c11_atomic_store(&(object)->_Value, 0, order)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __STDC_HOSTED__ */
+#endif /* __CLANG_STDATOMIC_H */
+
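
stdatomic.h above maps the C11 atomic operations onto clang's __c11_atomic_* builtins. A small sketch of a counter built on it (sequentially consistent by default, relaxed where noted):

#include <stdatomic.h>

static atomic_int hits = ATOMIC_VAR_INIT(0);

/* A relaxed increment is sufficient for a statistics counter. */
void record_hit(void)
{
  atomic_fetch_add_explicit(&hits, 1, memory_order_relaxed);
}

/* Sequentially consistent load via the plain atomic_load macro. */
int read_hits(void)
{
  return atomic_load(&hits);
}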
diff --git a/23.0.3/clang-include/stdbool.h b/23.0.3/clang-include/stdbool.h
new file mode 100644
index 0000000..0467893
--- /dev/null
+++ b/23.0.3/clang-include/stdbool.h
@@ -0,0 +1,44 @@
+/*===---- stdbool.h - Standard header for booleans -------------------------===
+ *
+ * Copyright (c) 2008 Eli Friedman
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __STDBOOL_H
+#define __STDBOOL_H
+
+/* Don't define bool, true, and false in C++, except as a GNU extension. */
+#ifndef __cplusplus
+#define bool _Bool
+#define true 1
+#define false 0
+#elif defined(__GNUC__) && !defined(__STRICT_ANSI__)
+/* Define _Bool, bool, false, true as a GNU extension. */
+#define _Bool bool
+#define bool  bool
+#define false false
+#define true  true
+#endif
+
+#define __bool_true_false_are_defined 1
+
+#endif /* __STDBOOL_H */
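
stdbool.h above is just the C99 bool/true/false mapping. A trivial sketch of a predicate using it:

#include <stdbool.h>

/* True when x is a nonzero power of two. */
bool is_power_of_two(unsigned x)
{
  return x != 0 && (x & (x - 1)) == 0;
}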
diff --git a/23.0.3/clang-include/stddef.h b/23.0.3/clang-include/stddef.h
new file mode 100644
index 0000000..7354996
--- /dev/null
+++ b/23.0.3/clang-include/stddef.h
@@ -0,0 +1,137 @@
+/*===---- stddef.h - Basic type definitions --------------------------------===
+ *
+ * Copyright (c) 2008 Eli Friedman
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined(__STDDEF_H) || defined(__need_ptrdiff_t) ||                       \
+    defined(__need_size_t) || defined(__need_wchar_t) ||                       \
+    defined(__need_NULL) || defined(__need_wint_t)
+
+#if !defined(__need_ptrdiff_t) && !defined(__need_size_t) &&                   \
+    !defined(__need_wchar_t) && !defined(__need_NULL) &&                       \
+    !defined(__need_wint_t)
+/* Always define miscellaneous pieces when modules are available. */
+#if !__has_feature(modules)
+#define __STDDEF_H
+#endif
+#define __need_ptrdiff_t
+#define __need_size_t
+#define __need_wchar_t
+#define __need_NULL
+#define __need_STDDEF_H_misc
+/* __need_wint_t is intentionally not defined here. */
+#endif
+
+#if defined(__need_ptrdiff_t)
+#if !defined(_PTRDIFF_T) || __has_feature(modules)
+/* Always define ptrdiff_t when modules are available. */
+#if !__has_feature(modules)
+#define _PTRDIFF_T
+#endif
+typedef __PTRDIFF_TYPE__ ptrdiff_t;
+#endif
+#undef __need_ptrdiff_t
+#endif /* defined(__need_ptrdiff_t) */
+
+#if defined(__need_size_t)
+#if !defined(_SIZE_T) || __has_feature(modules)
+/* Always define size_t when modules are available. */
+#if !__has_feature(modules)
+#define _SIZE_T
+#endif
+typedef __SIZE_TYPE__ size_t;
+#endif
+#undef __need_size_t
+#endif /*defined(__need_size_t) */
+
+#if defined(__need_STDDEF_H_misc)
+/* ISO9899:2011 7.20 (C11 Annex K): Define rsize_t if __STDC_WANT_LIB_EXT1__ is
+ * enabled. */
+#if (defined(__STDC_WANT_LIB_EXT1__) && __STDC_WANT_LIB_EXT1__ >= 1 && \
+     !defined(_RSIZE_T)) || __has_feature(modules)
+/* Always define rsize_t when modules are available. */
+#if !__has_feature(modules)
+#define _RSIZE_T
+#endif
+typedef __SIZE_TYPE__ rsize_t;
+#endif
+#endif /* defined(__need_STDDEF_H_misc) */
+
+#if defined(__need_wchar_t)
+#ifndef __cplusplus
+/* Always define wchar_t when modules are available. */
+#if !defined(_WCHAR_T) || __has_feature(modules)
+#if !__has_feature(modules)
+#define _WCHAR_T
+#if defined(_MSC_EXTENSIONS)
+#define _WCHAR_T_DEFINED
+#endif
+#endif
+typedef __WCHAR_TYPE__ wchar_t;
+#endif
+#endif
+#undef __need_wchar_t
+#endif /* defined(__need_wchar_t) */
+
+#if defined(__need_NULL)
+#undef NULL
+#ifdef __cplusplus
+#  if !defined(__MINGW32__) && !defined(_MSC_VER)
+#    define NULL __null
+#  else
+#    define NULL 0
+#  endif
+#else
+#  define NULL ((void*)0)
+#endif
+#ifdef __cplusplus
+#if defined(_MSC_EXTENSIONS) && defined(_NATIVE_NULLPTR_SUPPORTED)
+namespace std { typedef decltype(nullptr) nullptr_t; }
+using ::std::nullptr_t;
+#endif
+#endif
+#undef __need_NULL
+#endif /* defined(__need_NULL) */
+
+#if defined(__need_STDDEF_H_misc)
+#if __STDC_VERSION__ >= 201112L || __cplusplus >= 201103L
+#include "__stddef_max_align_t.h"
+#endif
+#define offsetof(t, d) __builtin_offsetof(t, d)
+#undef __need_STDDEF_H_misc
+#endif  /* defined(__need_STDDEF_H_misc) */
+
+/* Some C libraries expect to see a wint_t here. Others (notably MinGW) will use
+__WINT_TYPE__ directly; accommodate both by requiring __need_wint_t */
+#if defined(__need_wint_t)
+/* Always define wint_t when modules are available. */
+#if !defined(_WINT_T) || __has_feature(modules)
+#if !__has_feature(modules)
+#define _WINT_T
+#endif
+typedef __WINT_TYPE__ wint_t;
+#endif
+#undef __need_wint_t
+#endif /* __need_wint_t */
+
+#endif
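
stddef.h above is organized around the __need_* request macros so libc headers can pull in single definitions; ordinary consumers just include the whole header. A sketch using size_t and the offsetof macro defined above:

#include <stddef.h>

struct packet {
  char   tag;
  size_t length;
  void  *payload;
};

/* Byte offset of the length field, computed with offsetof. */
static const size_t length_offset = offsetof(struct packet, length);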
diff --git a/23.0.3/clang-include/stdint.h b/23.0.3/clang-include/stdint.h
new file mode 100644
index 0000000..0303db9
--- /dev/null
+++ b/23.0.3/clang-include/stdint.h
@@ -0,0 +1,707 @@
+/*===---- stdint.h - Standard header for sized integer types --------------===*\
+ *
+ * Copyright (c) 2009 Chris Lattner
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+\*===----------------------------------------------------------------------===*/
+
+#ifndef __CLANG_STDINT_H
+#define __CLANG_STDINT_H
+
+/* If we're hosted, fall back to the system's stdint.h, which might have
+ * additional definitions.
+ */
+#if __STDC_HOSTED__ && __has_include_next(<stdint.h>)
+
+// C99 7.18.3 Limits of other integer types
+//
+//  Footnote 219, 220: C++ implementations should define these macros only when
+//  __STDC_LIMIT_MACROS is defined before <stdint.h> is included.
+//
+//  Footnote 222: C++ implementations should define these macros only when
+//  __STDC_CONSTANT_MACROS is defined before <stdint.h> is included.
+//
+// C++11 [cstdint.syn]p2:
+//
+//  The macros defined by <cstdint> are provided unconditionally. In particular,
+//  the symbols __STDC_LIMIT_MACROS and __STDC_CONSTANT_MACROS (mentioned in
+//  footnotes 219, 220, and 222 in the C standard) play no role in C++.
+//
+// C11 removed the problematic footnotes.
+//
+// Work around this inconsistency by always defining those macros in C++ mode,
+// so that a C library implementation which follows the C99 standard can be
+// used in C++.
+# ifdef __cplusplus
+#  if !defined(__STDC_LIMIT_MACROS)
+#   define __STDC_LIMIT_MACROS
+#   define __STDC_LIMIT_MACROS_DEFINED_BY_CLANG
+#  endif
+#  if !defined(__STDC_CONSTANT_MACROS)
+#   define __STDC_CONSTANT_MACROS
+#   define __STDC_CONSTANT_MACROS_DEFINED_BY_CLANG
+#  endif
+# endif
+
+# include_next <stdint.h>
+
+# ifdef __STDC_LIMIT_MACROS_DEFINED_BY_CLANG
+#  undef __STDC_LIMIT_MACROS
+#  undef __STDC_LIMIT_MACROS_DEFINED_BY_CLANG
+# endif
+# ifdef __STDC_CONSTANT_MACROS_DEFINED_BY_CLANG
+#  undef __STDC_CONSTANT_MACROS
+#  undef __STDC_CONSTANT_MACROS_DEFINED_BY_CLANG
+# endif
+
+#else
+
+/* C99 7.18.1.1 Exact-width integer types.
+ * C99 7.18.1.2 Minimum-width integer types.
+ * C99 7.18.1.3 Fastest minimum-width integer types.
+ *
+ * The standard requires that exact-width types be defined for 8-, 16-, 32-, and
+ * 64-bit types if they are implemented. Other exact-width types are optional.
+ * This implementation defines an exact-width type for every integer width
+ * that is represented in the standard integer types.
+ *
+ * The standard also requires minimum-width types be defined for 8-, 16-, 32-,
+ * and 64-bit widths regardless of whether there are corresponding exact-width
+ * types. 
+ *
+ * To accommodate targets that are missing types that are exactly 8, 16, 32, or
+ * 64 bits wide, this implementation takes an approach of cascading
+ * redefinitions, redefining __int_leastN_t to successively smaller exact-width
+ * types. It is therefore important that the types are defined in order of
+ * descending widths.
+ *
+ * We currently assume that the minimum-width types and the fastest
+ * minimum-width types are the same. This is allowed by the standard, but is
+ * suboptimal.
+ *
+ * In violation of the standard, some targets do not implement a type that is
+ * wide enough to represent all of the required widths (8-, 16-, 32-, 64-bit).  
+ * To accommodate these targets, a required minimum-width type is only
+ * defined if there exists an exact-width type of equal or greater width.
+ */
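+/* Illustrative note (editorial, not from the upstream header): on a target
+ * that only provides __INT32_TYPE__ and __INT64_TYPE__, the cascade described
+ * above leaves __int_least16_t and __int_least8_t defined as int32_t, so the
+ * int_least8_t and int_fast8_t typedefs below end up 32 bits wide, the
+ * narrowest exact-width type able to represent them. */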
+
+#ifdef __INT64_TYPE__
+# ifndef __int8_t_defined /* glibc sys/types.h also defines int64_t*/
+typedef __INT64_TYPE__ int64_t;
+# endif /* __int8_t_defined */
+typedef __UINT64_TYPE__ uint64_t;
+# define __int_least64_t int64_t
+# define __uint_least64_t uint64_t
+# define __int_least32_t int64_t
+# define __uint_least32_t uint64_t
+# define __int_least16_t int64_t
+# define __uint_least16_t uint64_t
+# define __int_least8_t int64_t
+# define __uint_least8_t uint64_t
+#endif /* __INT64_TYPE__ */
+
+#ifdef __int_least64_t
+typedef __int_least64_t int_least64_t;
+typedef __uint_least64_t uint_least64_t;
+typedef __int_least64_t int_fast64_t;
+typedef __uint_least64_t uint_fast64_t;
+#endif /* __int_least64_t */
+
+#ifdef __INT56_TYPE__
+typedef __INT56_TYPE__ int56_t;
+typedef __UINT56_TYPE__ uint56_t;
+typedef int56_t int_least56_t;
+typedef uint56_t uint_least56_t;
+typedef int56_t int_fast56_t;
+typedef uint56_t uint_fast56_t;
+# define __int_least32_t int56_t
+# define __uint_least32_t uint56_t
+# define __int_least16_t int56_t
+# define __uint_least16_t uint56_t
+# define __int_least8_t int56_t
+# define __uint_least8_t uint56_t
+#endif /* __INT56_TYPE__ */
+
+
+#ifdef __INT48_TYPE__
+typedef __INT48_TYPE__ int48_t;
+typedef __UINT48_TYPE__ uint48_t;
+typedef int48_t int_least48_t;
+typedef uint48_t uint_least48_t;
+typedef int48_t int_fast48_t;
+typedef uint48_t uint_fast48_t;
+# define __int_least32_t int48_t
+# define __uint_least32_t uint48_t
+# define __int_least16_t int48_t
+# define __uint_least16_t uint48_t
+# define __int_least8_t int48_t
+# define __uint_least8_t uint48_t
+#endif /* __INT48_TYPE__ */
+
+
+#ifdef __INT40_TYPE__
+typedef __INT40_TYPE__ int40_t;
+typedef __UINT40_TYPE__ uint40_t;
+typedef int40_t int_least40_t;
+typedef uint40_t uint_least40_t;
+typedef int40_t int_fast40_t;
+typedef uint40_t uint_fast40_t;
+# define __int_least32_t int40_t
+# define __uint_least32_t uint40_t
+# define __int_least16_t int40_t
+# define __uint_least16_t uint40_t
+# define __int_least8_t int40_t
+# define __uint_least8_t uint40_t
+#endif /* __INT40_TYPE__ */
+
+
+#ifdef __INT32_TYPE__
+
+# ifndef __int8_t_defined /* glibc sys/types.h also defines int32_t*/
+typedef __INT32_TYPE__ int32_t;
+# endif /* __int8_t_defined */
+
+# ifndef __uint32_t_defined  /* more glibc compatibility */
+# define __uint32_t_defined
+typedef __UINT32_TYPE__ uint32_t;
+# endif /* __uint32_t_defined */
+
+# define __int_least32_t int32_t
+# define __uint_least32_t uint32_t
+# define __int_least16_t int32_t
+# define __uint_least16_t uint32_t
+# define __int_least8_t int32_t
+# define __uint_least8_t uint32_t
+#endif /* __INT32_TYPE__ */
+
+#ifdef __int_least32_t
+typedef __int_least32_t int_least32_t;
+typedef __uint_least32_t uint_least32_t;
+typedef __int_least32_t int_fast32_t;
+typedef __uint_least32_t uint_fast32_t;
+#endif /* __int_least32_t */
+
+#ifdef __INT24_TYPE__
+typedef __INT24_TYPE__ int24_t;
+typedef __UINT24_TYPE__ uint24_t;
+typedef int24_t int_least24_t;
+typedef uint24_t uint_least24_t;
+typedef int24_t int_fast24_t;
+typedef uint24_t uint_fast24_t;
+# define __int_least16_t int24_t
+# define __uint_least16_t uint24_t
+# define __int_least8_t int24_t
+# define __uint_least8_t uint24_t
+#endif /* __INT24_TYPE__ */
+
+#ifdef __INT16_TYPE__
+#ifndef __int8_t_defined /* glibc sys/types.h also defines int16_t*/
+typedef __INT16_TYPE__ int16_t;
+#endif /* __int8_t_defined */
+typedef __UINT16_TYPE__ uint16_t;
+# define __int_least16_t int16_t
+# define __uint_least16_t uint16_t
+# define __int_least8_t int16_t
+# define __uint_least8_t uint16_t
+#endif /* __INT16_TYPE__ */
+
+#ifdef __int_least16_t
+typedef __int_least16_t int_least16_t;
+typedef __uint_least16_t uint_least16_t;
+typedef __int_least16_t int_fast16_t;
+typedef __uint_least16_t uint_fast16_t;
+#endif /* __int_least16_t */
+
+
+#ifdef __INT8_TYPE__
+#ifndef __int8_t_defined  /* glibc sys/types.h also defines int8_t*/
+typedef __INT8_TYPE__ int8_t;
+#endif /* __int8_t_defined */
+typedef __UINT8_TYPE__ uint8_t;
+# define __int_least8_t int8_t
+# define __uint_least8_t uint8_t
+#endif /* __INT8_TYPE__ */
+
+#ifdef __int_least8_t
+typedef __int_least8_t int_least8_t;
+typedef __uint_least8_t uint_least8_t;
+typedef __int_least8_t int_fast8_t;
+typedef __uint_least8_t uint_fast8_t;
+#endif /* __int_least8_t */
+
+/* prevent glibc sys/types.h from defining conflicting types */
+#ifndef __int8_t_defined  
+# define __int8_t_defined
+#endif /* __int8_t_defined */
+
+/* C99 7.18.1.4 Integer types capable of holding object pointers.
+ */
+#define __stdint_join3(a,b,c) a ## b ## c
+
+#define  __intn_t(n) __stdint_join3( int, n, _t)
+#define __uintn_t(n) __stdint_join3(uint, n, _t)
+
+#ifndef _INTPTR_T
+#ifndef __intptr_t_defined
+typedef  __intn_t(__INTPTR_WIDTH__)  intptr_t;
+#define __intptr_t_defined
+#define _INTPTR_T
+#endif
+#endif
+
+#ifndef _UINTPTR_T
+typedef __uintn_t(__INTPTR_WIDTH__) uintptr_t;
+#define _UINTPTR_T
+#endif
+
+/* C99 7.18.1.5 Greatest-width integer types.
+ */
+typedef __INTMAX_TYPE__  intmax_t;
+typedef __UINTMAX_TYPE__ uintmax_t;
+
+/* C99 7.18.4 Macros for minimum-width integer constants.
+ *
+ * The standard requires that integer constant macros be defined for all the
+ * minimum-width types defined above. As 8-, 16-, 32-, and 64-bit minimum-width
+ * types are required, the corresponding integer constant macros are defined 
+ * here. This implementation also defines minimum-width types for every other
+ * integer width that the target implements, so corresponding macros are 
+ * defined below, too.
+ *
+ * These macros are defined using the same successive-shrinking approach as
+ * the type definitions above. It is likewise important that macros are defined
+ * in order of descending width.
+ *
+ * Note that C++ should not check __STDC_CONSTANT_MACROS here, contrary to the
+ * claims of the C standard (see C++ 18.3.1p2, [cstdint.syn]).
+ */
+
+#define __int_c_join(a, b) a ## b
+#define __int_c(v, suffix) __int_c_join(v, suffix)
+#define __uint_c(v, suffix) __int_c_join(v##U, suffix)
+
+
+#ifdef __INT64_TYPE__
+# ifdef __INT64_C_SUFFIX__
+#  define __int64_c_suffix __INT64_C_SUFFIX__
+#  define __int32_c_suffix __INT64_C_SUFFIX__
+#  define __int16_c_suffix __INT64_C_SUFFIX__
+#  define  __int8_c_suffix __INT64_C_SUFFIX__
+# else
+#  undef __int64_c_suffix
+#  undef __int32_c_suffix
+#  undef __int16_c_suffix
+#  undef  __int8_c_suffix
+# endif /* __INT64_C_SUFFIX__ */
+#endif /* __INT64_TYPE__ */
+
+#ifdef __int_least64_t
+# ifdef __int64_c_suffix
+#  define INT64_C(v) __int_c(v, __int64_c_suffix)
+#  define UINT64_C(v) __uint_c(v, __int64_c_suffix)
+# else
+#  define INT64_C(v) v
+#  define UINT64_C(v) v ## U
+# endif /* __int64_c_suffix */
+#endif /* __int_least64_t */
+
+
+#ifdef __INT56_TYPE__
+# ifdef __INT56_C_SUFFIX__
+#  define INT56_C(v) __int_c(v, __INT56_C_SUFFIX__)
+#  define UINT56_C(v) __uint_c(v, __INT56_C_SUFFIX__)
+#  define __int32_c_suffix __INT56_C_SUFFIX__
+#  define __int16_c_suffix __INT56_C_SUFFIX__
+#  define __int8_c_suffix  __INT56_C_SUFFIX__
+# else
+#  define INT56_C(v) v
+#  define UINT56_C(v) v ## U
+#  undef __int32_c_suffix
+#  undef __int16_c_suffix
+#  undef  __int8_c_suffix
+# endif /* __INT56_C_SUFFIX__ */
+#endif /* __INT56_TYPE__ */
+
+
+#ifdef __INT48_TYPE__
+# ifdef __INT48_C_SUFFIX__
+#  define INT48_C(v) __int_c(v, __INT48_C_SUFFIX__)
+#  define UINT48_C(v) __uint_c(v, __INT48_C_SUFFIX__)
+#  define __int32_c_suffix __INT48_C_SUFFIX__
+#  define __int16_c_suffix __INT48_C_SUFFIX__
+#  define __int8_c_suffix  __INT48_C_SUFFIX__
+# else
+#  define INT48_C(v) v
+#  define UINT48_C(v) v ## U
+#  undef __int32_c_suffix
+#  undef __int16_c_suffix
+#  undef  __int8_c_suffix
+# endif /* __INT48_C_SUFFIX__ */
+#endif /* __INT48_TYPE__ */
+
+
+#ifdef __INT40_TYPE__
+# ifdef __INT40_C_SUFFIX__
+#  define INT40_C(v) __int_c(v, __INT40_C_SUFFIX__)
+#  define UINT40_C(v) __uint_c(v, __INT40_C_SUFFIX__)
+#  define __int32_c_suffix __INT40_C_SUFFIX__
+#  define __int16_c_suffix __INT40_C_SUFFIX__
+#  define __int8_c_suffix  __INT40_C_SUFFIX__
+# else
+#  define INT40_C(v) v
+#  define UINT40_C(v) v ## U
+#  undef __int32_c_suffix
+#  undef __int16_c_suffix
+#  undef  __int8_c_suffix
+# endif /* __INT40_C_SUFFIX__ */
+#endif /* __INT40_TYPE__ */
+
+
+#ifdef __INT32_TYPE__
+# ifdef __INT32_C_SUFFIX__
+#  define __int32_c_suffix __INT32_C_SUFFIX__
+#  define __int16_c_suffix __INT32_C_SUFFIX__
+#  define __int8_c_suffix  __INT32_C_SUFFIX__
+#else
+#  undef __int32_c_suffix
+#  undef __int16_c_suffix
+#  undef  __int8_c_suffix
+# endif /* __INT32_C_SUFFIX__ */
+#endif /* __INT32_TYPE__ */
+
+#ifdef __int_least32_t
+# ifdef __int32_c_suffix
+#  define INT32_C(v) __int_c(v, __int32_c_suffix)
+#  define UINT32_C(v) __uint_c(v, __int32_c_suffix)
+# else
+#  define INT32_C(v) v
+#  define UINT32_C(v) v ## U
+# endif /* __int32_c_suffix */
+#endif /* __int_least32_t */
+
+
+#ifdef __INT24_TYPE__
+# ifdef __INT24_C_SUFFIX__
+#  define INT24_C(v) __int_c(v, __INT24_C_SUFFIX__)
+#  define UINT24_C(v) __uint_c(v, __INT24_C_SUFFIX__)
+#  define __int16_c_suffix __INT24_C_SUFFIX__
+#  define __int8_c_suffix  __INT24_C_SUFFIX__
+# else
+#  define INT24_C(v) v
+#  define UINT24_C(v) v ## U
+#  undef __int16_c_suffix
+#  undef  __int8_c_suffix
+# endif /* __INT24_C_SUFFIX__ */
+#endif /* __INT24_TYPE__ */
+
+
+#ifdef __INT16_TYPE__
+# ifdef __INT16_C_SUFFIX__
+#  define __int16_c_suffix __INT16_C_SUFFIX__
+#  define __int8_c_suffix  __INT16_C_SUFFIX__
+#else
+#  undef __int16_c_suffix
+#  undef  __int8_c_suffix
+# endif /* __INT16_C_SUFFIX__ */
+#endif /* __INT16_TYPE__ */
+
+#ifdef __int_least16_t
+# ifdef __int16_c_suffix
+#  define INT16_C(v) __int_c(v, __int16_c_suffix)
+#  define UINT16_C(v) __uint_c(v, __int16_c_suffix)
+# else
+#  define INT16_C(v) v
+#  define UINT16_C(v) v ## U
+# endif /* __int16_c_suffix */
+#endif /* __int_least16_t */
+
+
+#ifdef __INT8_TYPE__
+# ifdef __INT8_C_SUFFIX__
+#  define __int8_c_suffix __INT8_C_SUFFIX__
+#else
+#  undef  __int8_c_suffix
+# endif /* __INT8_C_SUFFIX__ */
+#endif /* __INT8_TYPE__ */
+
+#ifdef __int_least8_t
+# ifdef __int8_c_suffix
+#  define INT8_C(v) __int_c(v, __int8_c_suffix)
+#  define UINT8_C(v) __uint_c(v, __int8_c_suffix)
+# else
+#  define INT8_C(v) v
+#  define UINT8_C(v) v ## U
+# endif /* __int8_c_suffix */
+#endif /* __int_least8_t */
+
+
+/* C99 7.18.2.1 Limits of exact-width integer types. 
+ * C99 7.18.2.2 Limits of minimum-width integer types.
+ * C99 7.18.2.3 Limits of fastest minimum-width integer types.
+ *
+ * The presence of limit macros is completely optional in C99.  This
+ * implementation defines limits for all of the types (exact- and
+ * minimum-width) that it defines above, using the limits of the minimum-width
+ * type for any types that do not have exact-width representations.
+ *
+ * As in the type definitions, this section takes an approach of
+ * successive-shrinking to determine which limits to use for the standard (8,
+ * 16, 32, 64) bit widths when they don't have exact representations. It is
+ * therefore important that the definitions be kept in order of descending
+ * widths.
+ *
+ * Note that C++ should not check __STDC_LIMIT_MACROS here, contrary to the
+ * claims of the C standard (see C++ 18.3.1p2, [cstdint.syn]).
+ */
+
+#ifdef __INT64_TYPE__
+# define INT64_MAX           INT64_C( 9223372036854775807)
+# define INT64_MIN         (-INT64_C( 9223372036854775807)-1)
+# define UINT64_MAX         UINT64_C(18446744073709551615)
+# define __INT_LEAST64_MIN   INT64_MIN
+# define __INT_LEAST64_MAX   INT64_MAX
+# define __UINT_LEAST64_MAX UINT64_MAX
+# define __INT_LEAST32_MIN   INT64_MIN
+# define __INT_LEAST32_MAX   INT64_MAX
+# define __UINT_LEAST32_MAX UINT64_MAX
+# define __INT_LEAST16_MIN   INT64_MIN
+# define __INT_LEAST16_MAX   INT64_MAX
+# define __UINT_LEAST16_MAX UINT64_MAX
+# define __INT_LEAST8_MIN    INT64_MIN
+# define __INT_LEAST8_MAX    INT64_MAX
+# define __UINT_LEAST8_MAX  UINT64_MAX
+#endif /* __INT64_TYPE__ */
+
+#ifdef __INT_LEAST64_MIN
+# define INT_LEAST64_MIN   __INT_LEAST64_MIN
+# define INT_LEAST64_MAX   __INT_LEAST64_MAX
+# define UINT_LEAST64_MAX __UINT_LEAST64_MAX
+# define INT_FAST64_MIN    __INT_LEAST64_MIN
+# define INT_FAST64_MAX    __INT_LEAST64_MAX
+# define UINT_FAST64_MAX  __UINT_LEAST64_MAX
+#endif /* __INT_LEAST64_MIN */
+
+
+#ifdef __INT56_TYPE__
+# define INT56_MAX           INT56_C(36028797018963967)
+# define INT56_MIN         (-INT56_C(36028797018963967)-1)
+# define UINT56_MAX         UINT56_C(72057594037927935)
+# define INT_LEAST56_MIN     INT56_MIN
+# define INT_LEAST56_MAX     INT56_MAX
+# define UINT_LEAST56_MAX   UINT56_MAX
+# define INT_FAST56_MIN      INT56_MIN
+# define INT_FAST56_MAX      INT56_MAX
+# define UINT_FAST56_MAX    UINT56_MAX
+# define __INT_LEAST32_MIN   INT56_MIN
+# define __INT_LEAST32_MAX   INT56_MAX
+# define __UINT_LEAST32_MAX UINT56_MAX
+# define __INT_LEAST16_MIN   INT56_MIN
+# define __INT_LEAST16_MAX   INT56_MAX
+# define __UINT_LEAST16_MAX UINT56_MAX
+# define __INT_LEAST8_MIN    INT56_MIN
+# define __INT_LEAST8_MAX    INT56_MAX
+# define __UINT_LEAST8_MAX  UINT56_MAX
+#endif /* __INT56_TYPE__ */
+
+
+#ifdef __INT48_TYPE__
+# define INT48_MAX           INT48_C(140737488355327)
+# define INT48_MIN         (-INT48_C(140737488355327)-1)
+# define UINT48_MAX         UINT48_C(281474976710655)
+# define INT_LEAST48_MIN     INT48_MIN
+# define INT_LEAST48_MAX     INT48_MAX
+# define UINT_LEAST48_MAX   UINT48_MAX
+# define INT_FAST48_MIN      INT48_MIN
+# define INT_FAST48_MAX      INT48_MAX
+# define UINT_FAST48_MAX    UINT48_MAX
+# define __INT_LEAST32_MIN   INT48_MIN
+# define __INT_LEAST32_MAX   INT48_MAX
+# define __UINT_LEAST32_MAX UINT48_MAX
+# define __INT_LEAST16_MIN   INT48_MIN
+# define __INT_LEAST16_MAX   INT48_MAX
+# define __UINT_LEAST16_MAX UINT48_MAX
+# define __INT_LEAST8_MIN    INT48_MIN
+# define __INT_LEAST8_MAX    INT48_MAX
+# define __UINT_LEAST8_MAX  UINT48_MAX
+#endif /* __INT48_TYPE__ */
+
+
+#ifdef __INT40_TYPE__
+# define INT40_MAX           INT40_C(549755813887)
+# define INT40_MIN         (-INT40_C(549755813887)-1)
+# define UINT40_MAX         UINT40_C(1099511627775)
+# define INT_LEAST40_MIN     INT40_MIN
+# define INT_LEAST40_MAX     INT40_MAX
+# define UINT_LEAST40_MAX   UINT40_MAX
+# define INT_FAST40_MIN      INT40_MIN
+# define INT_FAST40_MAX      INT40_MAX
+# define UINT_FAST40_MAX    UINT40_MAX
+# define __INT_LEAST32_MIN   INT40_MIN
+# define __INT_LEAST32_MAX   INT40_MAX
+# define __UINT_LEAST32_MAX UINT40_MAX
+# define __INT_LEAST16_MIN   INT40_MIN
+# define __INT_LEAST16_MAX   INT40_MAX
+# define __UINT_LEAST16_MAX UINT40_MAX
+# define __INT_LEAST8_MIN    INT40_MIN
+# define __INT_LEAST8_MAX    INT40_MAX
+# define __UINT_LEAST8_MAX  UINT40_MAX
+#endif /* __INT40_TYPE__ */
+
+
+#ifdef __INT32_TYPE__
+# define INT32_MAX           INT32_C(2147483647)
+# define INT32_MIN         (-INT32_C(2147483647)-1)
+# define UINT32_MAX         UINT32_C(4294967295)
+# define __INT_LEAST32_MIN   INT32_MIN
+# define __INT_LEAST32_MAX   INT32_MAX
+# define __UINT_LEAST32_MAX UINT32_MAX
+# define __INT_LEAST16_MIN   INT32_MIN
+# define __INT_LEAST16_MAX   INT32_MAX
+# define __UINT_LEAST16_MAX UINT32_MAX
+# define __INT_LEAST8_MIN    INT32_MIN
+# define __INT_LEAST8_MAX    INT32_MAX
+# define __UINT_LEAST8_MAX  UINT32_MAX
+#endif /* __INT32_TYPE__ */
+
+#ifdef __INT_LEAST32_MIN
+# define INT_LEAST32_MIN   __INT_LEAST32_MIN
+# define INT_LEAST32_MAX   __INT_LEAST32_MAX
+# define UINT_LEAST32_MAX __UINT_LEAST32_MAX
+# define INT_FAST32_MIN    __INT_LEAST32_MIN
+# define INT_FAST32_MAX    __INT_LEAST32_MAX
+# define UINT_FAST32_MAX  __UINT_LEAST32_MAX
+#endif /* __INT_LEAST32_MIN */
+
+
+#ifdef __INT24_TYPE__
+# define INT24_MAX           INT24_C(8388607)
+# define INT24_MIN         (-INT24_C(8388607)-1)
+# define UINT24_MAX         UINT24_C(16777215)
+# define INT_LEAST24_MIN     INT24_MIN
+# define INT_LEAST24_MAX     INT24_MAX
+# define UINT_LEAST24_MAX   UINT24_MAX
+# define INT_FAST24_MIN      INT24_MIN
+# define INT_FAST24_MAX      INT24_MAX
+# define UINT_FAST24_MAX    UINT24_MAX
+# define __INT_LEAST16_MIN   INT24_MIN
+# define __INT_LEAST16_MAX   INT24_MAX
+# define __UINT_LEAST16_MAX UINT24_MAX
+# define __INT_LEAST8_MIN    INT24_MIN
+# define __INT_LEAST8_MAX    INT24_MAX
+# define __UINT_LEAST8_MAX  UINT24_MAX
+#endif /* __INT24_TYPE__ */
+
+
+#ifdef __INT16_TYPE__
+# define INT16_MAX           INT16_C(32767)
+# define INT16_MIN         (-INT16_C(32767)-1)
+# define UINT16_MAX         UINT16_C(65535)
+# define __INT_LEAST16_MIN   INT16_MIN
+# define __INT_LEAST16_MAX   INT16_MAX
+# define __UINT_LEAST16_MAX UINT16_MAX
+# define __INT_LEAST8_MIN    INT16_MIN
+# define __INT_LEAST8_MAX    INT16_MAX
+# define __UINT_LEAST8_MAX  UINT16_MAX
+#endif /* __INT16_TYPE__ */
+
+#ifdef __INT_LEAST16_MIN
+# define INT_LEAST16_MIN   __INT_LEAST16_MIN
+# define INT_LEAST16_MAX   __INT_LEAST16_MAX
+# define UINT_LEAST16_MAX __UINT_LEAST16_MAX
+# define INT_FAST16_MIN    __INT_LEAST16_MIN
+# define INT_FAST16_MAX    __INT_LEAST16_MAX
+# define UINT_FAST16_MAX  __UINT_LEAST16_MAX
+#endif /* __INT_LEAST16_MIN */
+
+
+#ifdef __INT8_TYPE__
+# define INT8_MAX            INT8_C(127)
+# define INT8_MIN          (-INT8_C(127)-1)
+# define UINT8_MAX          UINT8_C(255)
+# define __INT_LEAST8_MIN    INT8_MIN
+# define __INT_LEAST8_MAX    INT8_MAX
+# define __UINT_LEAST8_MAX  UINT8_MAX
+#endif /* __INT8_TYPE__ */
+
+#ifdef __INT_LEAST8_MIN
+# define INT_LEAST8_MIN   __INT_LEAST8_MIN
+# define INT_LEAST8_MAX   __INT_LEAST8_MAX
+# define UINT_LEAST8_MAX __UINT_LEAST8_MAX
+# define INT_FAST8_MIN    __INT_LEAST8_MIN
+# define INT_FAST8_MAX    __INT_LEAST8_MAX
+# define UINT_FAST8_MAX  __UINT_LEAST8_MAX
+#endif /* __INT_LEAST8_MIN */
+
+/* Some utility macros */
+#define  __INTN_MIN(n)  __stdint_join3( INT, n, _MIN)
+#define  __INTN_MAX(n)  __stdint_join3( INT, n, _MAX)
+#define __UINTN_MAX(n)  __stdint_join3(UINT, n, _MAX)
+#define  __INTN_C(n, v) __stdint_join3( INT, n, _C(v))
+#define __UINTN_C(n, v) __stdint_join3(UINT, n, _C(v))
+
+/* C99 7.18.2.4 Limits of integer types capable of holding object pointers. */
+/* C99 7.18.3 Limits of other integer types. */
+
+#define  INTPTR_MIN  __INTN_MIN(__INTPTR_WIDTH__)
+#define  INTPTR_MAX  __INTN_MAX(__INTPTR_WIDTH__)
+#define UINTPTR_MAX __UINTN_MAX(__INTPTR_WIDTH__)
+#define PTRDIFF_MIN  __INTN_MIN(__PTRDIFF_WIDTH__)
+#define PTRDIFF_MAX  __INTN_MAX(__PTRDIFF_WIDTH__)
+#define    SIZE_MAX __UINTN_MAX(__SIZE_WIDTH__)
+
+/* ISO9899:2011 7.20 (C11 Annex K): Define RSIZE_MAX if __STDC_WANT_LIB_EXT1__
+ * is enabled. */
+#if defined(__STDC_WANT_LIB_EXT1__) && __STDC_WANT_LIB_EXT1__ >= 1
+#define   RSIZE_MAX            (SIZE_MAX >> 1)
+#endif
+
+/* C99 7.18.2.5 Limits of greatest-width integer types. */
+#define INTMAX_MIN   __INTN_MIN(__INTMAX_WIDTH__)
+#define INTMAX_MAX   __INTN_MAX(__INTMAX_WIDTH__)
+#define UINTMAX_MAX __UINTN_MAX(__INTMAX_WIDTH__)
+
+/* C99 7.18.3 Limits of other integer types. */
+#define SIG_ATOMIC_MIN __INTN_MIN(__SIG_ATOMIC_WIDTH__)
+#define SIG_ATOMIC_MAX __INTN_MAX(__SIG_ATOMIC_WIDTH__)
+#ifdef __WINT_UNSIGNED__
+# define WINT_MIN       __UINTN_C(__WINT_WIDTH__, 0)
+# define WINT_MAX       __UINTN_MAX(__WINT_WIDTH__)
+#else
+# define WINT_MIN       __INTN_MIN(__WINT_WIDTH__)
+# define WINT_MAX       __INTN_MAX(__WINT_WIDTH__)
+#endif
+
+#ifndef WCHAR_MAX
+# define WCHAR_MAX __WCHAR_MAX__
+#endif
+#ifndef WCHAR_MIN
+# if __WCHAR_MAX__ == __INTN_MAX(__WCHAR_WIDTH__)
+#  define WCHAR_MIN __INTN_MIN(__WCHAR_WIDTH__)
+# else
+#  define WCHAR_MIN __UINTN_C(__WCHAR_WIDTH__, 0)
+# endif
+#endif
+
+/* 7.18.4.2 Macros for greatest-width integer constants. */
+#define INTMAX_C(v)   __INTN_C(__INTMAX_WIDTH__, v)
+#define UINTMAX_C(v) __UINTN_C(__INTMAX_WIDTH__, v)
+
+#endif /* __STDC_HOSTED__ */
+#endif /* __CLANG_STDINT_H */
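A minimal usage sketch for the stdint.h above, assuming a hosted, two's-complement target where an exact int32_t exists (so the least/fast limits collapse to the exact-width ones); all names are standard <stdint.h> spellings:

    /* Hedged sketch: assumes int32_t exists exactly on the target. */
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        /* INT32_C/UINT16_C paste the suffix chosen by the successive-shrinking
         * logic above, giving the constants (u)int_least32_t/16_t types. */
        int_least32_t  big  = INT32_C(2147483647);
        uint_least16_t tiny = UINT16_C(65535);

        printf("%d %d\n", INT_LEAST32_MAX == INT32_MAX, INT_FAST16_MAX >= INT16_MAX);
        printf("%jd %ju\n", (intmax_t)big, (uintmax_t)tiny);
        return 0;
    }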
diff --git a/23.0.3/clang-include/stdnoreturn.h b/23.0.3/clang-include/stdnoreturn.h
new file mode 100644
index 0000000..a7a301d
--- /dev/null
+++ b/23.0.3/clang-include/stdnoreturn.h
@@ -0,0 +1,30 @@
+/*===---- stdnoreturn.h - Standard header for noreturn macro ---------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __STDNORETURN_H
+#define __STDNORETURN_H
+
+#define noreturn _Noreturn
+#define __noreturn_is_defined 1
+
+#endif /* __STDNORETURN_H */
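A small sketch of what the noreturn convenience macro is for; the function name fatal is illustrative only, not something the header defines:

    #include <stdio.h>
    #include <stdlib.h>
    #include <stdnoreturn.h>   /* maps `noreturn` to the C11 _Noreturn keyword */

    /* Marking the function noreturn lets the compiler assume control never
     * comes back, improving diagnostics and dead-code elimination. */
    static noreturn void fatal(const char *msg)
    {
        fprintf(stderr, "fatal: %s\n", msg);
        exit(EXIT_FAILURE);
    }

    int main(void)
    {
        fatal("example");
    }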
diff --git a/23.0.3/clang-include/tbmintrin.h b/23.0.3/clang-include/tbmintrin.h
new file mode 100644
index 0000000..f95e34f
--- /dev/null
+++ b/23.0.3/clang-include/tbmintrin.h
@@ -0,0 +1,158 @@
+/*===---- tbmintrin.h - TBM intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __TBM__
+#error "TBM instruction set is not enabled"
+#endif
+
+#ifndef __X86INTRIN_H
+#error "Never use <tbmintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __TBMINTRIN_H
+#define __TBMINTRIN_H
+
+#define __bextri_u32(a, b) (__builtin_ia32_bextri_u32((a), (b)))
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+__blcfill_u32(unsigned int a)
+{
+  return a & (a + 1);
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+__blci_u32(unsigned int a)
+{
+  return a | ~(a + 1);
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+__blcic_u32(unsigned int a)
+{
+  return ~a & (a + 1);
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+__blcmsk_u32(unsigned int a)
+{
+  return a ^ (a + 1);
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+__blcs_u32(unsigned int a)
+{
+  return a | (a + 1);
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+__blsfill_u32(unsigned int a)
+{
+  return a | (a - 1);
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+__blsic_u32(unsigned int a)
+{
+  return ~a | (a - 1);
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+__t1mskc_u32(unsigned int a)
+{
+  return ~a | (a + 1);
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+__tzmsk_u32(unsigned int a)
+{
+  return ~a & (a - 1);
+}
+
+#ifdef __x86_64__
+#define __bextri_u64(a, b) (__builtin_ia32_bextri_u64((a), (int)(b)))
+
+static __inline__ unsigned long long __attribute__((__always_inline__,
+                                                    __nodebug__))
+__blcfill_u64(unsigned long long a)
+{
+  return a & (a + 1);
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__,
+                                                    __nodebug__))
+__blci_u64(unsigned long long a)
+{
+  return a | ~(a + 1);
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__,
+                                                    __nodebug__))
+__blcic_u64(unsigned long long a)
+{
+  return ~a & (a + 1);
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__,
+                                                    __nodebug__))
+__blcmsk_u64(unsigned long long a)
+{
+  return a ^ (a + 1);
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__,
+                                                    __nodebug__))
+__blcs_u64(unsigned long long a)
+{
+  return a | (a + 1);
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__,
+                                                    __nodebug__))
+__blsfill_u64(unsigned long long a)
+{
+  return a | (a - 1);
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__,
+                                                    __nodebug__))
+__blsic_u64(unsigned long long a)
+{
+  return ~a | (a - 1);
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__,
+                                                    __nodebug__))
+__t1mskc_u64(unsigned long long a)
+{
+  return ~a | (a + 1);
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__,
+                                                    __nodebug__))
+__tzmsk_u64(unsigned long long a)
+{
+  return ~a & (a - 1);
+}
+#endif
+
+#endif /* __TBMINTRIN_H */
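Most of the TBM helpers above are one-line bit-manipulation idioms; the plain-C sketch below (no TBM hardware or -mtbm flag needed) just demonstrates what two of them compute, with an arbitrary test value:

    #include <stdio.h>

    int main(void)
    {
        unsigned int a = 0x58u;               /* binary 0101 1000 */

        /* __blsfill_u32: a | (a - 1) sets every bit below the lowest set bit. */
        unsigned int blsfill = a | (a - 1u);  /* 0x5f */

        /* __tzmsk_u32: ~a & (a - 1) keeps only the trailing zero bits. */
        unsigned int tzmsk = ~a & (a - 1u);   /* 0x07 */

        printf("%#x %#x\n", blsfill, tzmsk);
        return 0;
    }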
diff --git a/23.0.3/clang-include/tgmath.h b/23.0.3/clang-include/tgmath.h
new file mode 100644
index 0000000..a48e267
--- /dev/null
+++ b/23.0.3/clang-include/tgmath.h
@@ -0,0 +1,1374 @@
+/*===---- tgmath.h - Standard header for type generic math ----------------===*\
+ *
+ * Copyright (c) 2009 Howard Hinnant
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+\*===----------------------------------------------------------------------===*/
+
+#ifndef __TGMATH_H
+#define __TGMATH_H
+
+/* C99 7.22 Type-generic math <tgmath.h>. */
+#include <math.h>
+
+/* C++ handles type genericity with overloading in math.h. */
+#ifndef __cplusplus
+#include <complex.h>
+
+#define _TG_ATTRSp __attribute__((__overloadable__))
+#define _TG_ATTRS __attribute__((__overloadable__, __always_inline__))
+
+// promotion
+
+typedef void _Argument_type_is_not_arithmetic;
+static _Argument_type_is_not_arithmetic __tg_promote(...)
+  __attribute__((__unavailable__,__overloadable__));
+static double               _TG_ATTRSp __tg_promote(int);
+static double               _TG_ATTRSp __tg_promote(unsigned int);
+static double               _TG_ATTRSp __tg_promote(long);
+static double               _TG_ATTRSp __tg_promote(unsigned long);
+static double               _TG_ATTRSp __tg_promote(long long);
+static double               _TG_ATTRSp __tg_promote(unsigned long long);
+static float                _TG_ATTRSp __tg_promote(float);
+static double               _TG_ATTRSp __tg_promote(double);
+static long double          _TG_ATTRSp __tg_promote(long double);
+static float _Complex       _TG_ATTRSp __tg_promote(float _Complex);
+static double _Complex      _TG_ATTRSp __tg_promote(double _Complex);
+static long double _Complex _TG_ATTRSp __tg_promote(long double _Complex);
+
+#define __tg_promote1(__x)           (__typeof__(__tg_promote(__x)))
+#define __tg_promote2(__x, __y)      (__typeof__(__tg_promote(__x) + \
+                                                 __tg_promote(__y)))
+#define __tg_promote3(__x, __y, __z) (__typeof__(__tg_promote(__x) + \
+                                                 __tg_promote(__y) + \
+                                                 __tg_promote(__z)))
+
+// acos
+
+static float
+    _TG_ATTRS
+    __tg_acos(float __x) {return acosf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_acos(double __x) {return acos(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_acos(long double __x) {return acosl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_acos(float _Complex __x) {return cacosf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_acos(double _Complex __x) {return cacos(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_acos(long double _Complex __x) {return cacosl(__x);}
+
+#undef acos
+#define acos(__x) __tg_acos(__tg_promote1((__x))(__x))
+
+// asin
+
+static float
+    _TG_ATTRS
+    __tg_asin(float __x) {return asinf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_asin(double __x) {return asin(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_asin(long double __x) {return asinl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_asin(float _Complex __x) {return casinf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_asin(double _Complex __x) {return casin(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_asin(long double _Complex __x) {return casinl(__x);}
+
+#undef asin
+#define asin(__x) __tg_asin(__tg_promote1((__x))(__x))
+
+// atan
+
+static float
+    _TG_ATTRS
+    __tg_atan(float __x) {return atanf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_atan(double __x) {return atan(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_atan(long double __x) {return atanl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_atan(float _Complex __x) {return catanf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_atan(double _Complex __x) {return catan(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_atan(long double _Complex __x) {return catanl(__x);}
+
+#undef atan
+#define atan(__x) __tg_atan(__tg_promote1((__x))(__x))
+
+// acosh
+
+static float
+    _TG_ATTRS
+    __tg_acosh(float __x) {return acoshf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_acosh(double __x) {return acosh(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_acosh(long double __x) {return acoshl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_acosh(float _Complex __x) {return cacoshf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_acosh(double _Complex __x) {return cacosh(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_acosh(long double _Complex __x) {return cacoshl(__x);}
+
+#undef acosh
+#define acosh(__x) __tg_acosh(__tg_promote1((__x))(__x))
+
+// asinh
+
+static float
+    _TG_ATTRS
+    __tg_asinh(float __x) {return asinhf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_asinh(double __x) {return asinh(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_asinh(long double __x) {return asinhl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_asinh(float _Complex __x) {return casinhf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_asinh(double _Complex __x) {return casinh(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_asinh(long double _Complex __x) {return casinhl(__x);}
+
+#undef asinh
+#define asinh(__x) __tg_asinh(__tg_promote1((__x))(__x))
+
+// atanh
+
+static float
+    _TG_ATTRS
+    __tg_atanh(float __x) {return atanhf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_atanh(double __x) {return atanh(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_atanh(long double __x) {return atanhl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_atanh(float _Complex __x) {return catanhf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_atanh(double _Complex __x) {return catanh(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_atanh(long double _Complex __x) {return catanhl(__x);}
+
+#undef atanh
+#define atanh(__x) __tg_atanh(__tg_promote1((__x))(__x))
+
+// cos
+
+static float
+    _TG_ATTRS
+    __tg_cos(float __x) {return cosf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_cos(double __x) {return cos(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_cos(long double __x) {return cosl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_cos(float _Complex __x) {return ccosf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_cos(double _Complex __x) {return ccos(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_cos(long double _Complex __x) {return ccosl(__x);}
+
+#undef cos
+#define cos(__x) __tg_cos(__tg_promote1((__x))(__x))
+
+// sin
+
+static float
+    _TG_ATTRS
+    __tg_sin(float __x) {return sinf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_sin(double __x) {return sin(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_sin(long double __x) {return sinl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_sin(float _Complex __x) {return csinf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_sin(double _Complex __x) {return csin(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_sin(long double _Complex __x) {return csinl(__x);}
+
+#undef sin
+#define sin(__x) __tg_sin(__tg_promote1((__x))(__x))
+
+// tan
+
+static float
+    _TG_ATTRS
+    __tg_tan(float __x) {return tanf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_tan(double __x) {return tan(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_tan(long double __x) {return tanl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_tan(float _Complex __x) {return ctanf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_tan(double _Complex __x) {return ctan(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_tan(long double _Complex __x) {return ctanl(__x);}
+
+#undef tan
+#define tan(__x) __tg_tan(__tg_promote1((__x))(__x))
+
+// cosh
+
+static float
+    _TG_ATTRS
+    __tg_cosh(float __x) {return coshf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_cosh(double __x) {return cosh(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_cosh(long double __x) {return coshl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_cosh(float _Complex __x) {return ccoshf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_cosh(double _Complex __x) {return ccosh(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_cosh(long double _Complex __x) {return ccoshl(__x);}
+
+#undef cosh
+#define cosh(__x) __tg_cosh(__tg_promote1((__x))(__x))
+
+// sinh
+
+static float
+    _TG_ATTRS
+    __tg_sinh(float __x) {return sinhf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_sinh(double __x) {return sinh(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_sinh(long double __x) {return sinhl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_sinh(float _Complex __x) {return csinhf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_sinh(double _Complex __x) {return csinh(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_sinh(long double _Complex __x) {return csinhl(__x);}
+
+#undef sinh
+#define sinh(__x) __tg_sinh(__tg_promote1((__x))(__x))
+
+// tanh
+
+static float
+    _TG_ATTRS
+    __tg_tanh(float __x) {return tanhf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_tanh(double __x) {return tanh(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_tanh(long double __x) {return tanhl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_tanh(float _Complex __x) {return ctanhf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_tanh(double _Complex __x) {return ctanh(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_tanh(long double _Complex __x) {return ctanhl(__x);}
+
+#undef tanh
+#define tanh(__x) __tg_tanh(__tg_promote1((__x))(__x))
+
+// exp
+
+static float
+    _TG_ATTRS
+    __tg_exp(float __x) {return expf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_exp(double __x) {return exp(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_exp(long double __x) {return expl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_exp(float _Complex __x) {return cexpf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_exp(double _Complex __x) {return cexp(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_exp(long double _Complex __x) {return cexpl(__x);}
+
+#undef exp
+#define exp(__x) __tg_exp(__tg_promote1((__x))(__x))
+
+// log
+
+static float
+    _TG_ATTRS
+    __tg_log(float __x) {return logf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_log(double __x) {return log(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_log(long double __x) {return logl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_log(float _Complex __x) {return clogf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_log(double _Complex __x) {return clog(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_log(long double _Complex __x) {return clogl(__x);}
+
+#undef log
+#define log(__x) __tg_log(__tg_promote1((__x))(__x))
+
+// pow
+
+static float
+    _TG_ATTRS
+    __tg_pow(float __x, float __y) {return powf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_pow(double __x, double __y) {return pow(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_pow(long double __x, long double __y) {return powl(__x, __y);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_pow(float _Complex __x, float _Complex __y) {return cpowf(__x, __y);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_pow(double _Complex __x, double _Complex __y) {return cpow(__x, __y);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_pow(long double _Complex __x, long double _Complex __y) 
+    {return cpowl(__x, __y);}
+
+#undef pow
+#define pow(__x, __y) __tg_pow(__tg_promote2((__x), (__y))(__x), \
+                               __tg_promote2((__x), (__y))(__y))
+
+// sqrt
+
+static float
+    _TG_ATTRS
+    __tg_sqrt(float __x) {return sqrtf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_sqrt(double __x) {return sqrt(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_sqrt(long double __x) {return sqrtl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_sqrt(float _Complex __x) {return csqrtf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_sqrt(double _Complex __x) {return csqrt(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_sqrt(long double _Complex __x) {return csqrtl(__x);}
+
+#undef sqrt
+#define sqrt(__x) __tg_sqrt(__tg_promote1((__x))(__x))
+
+// fabs
+
+static float
+    _TG_ATTRS
+    __tg_fabs(float __x) {return fabsf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_fabs(double __x) {return fabs(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_fabs(long double __x) {return fabsl(__x);}
+
+static float
+    _TG_ATTRS
+    __tg_fabs(float _Complex __x) {return cabsf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_fabs(double _Complex __x) {return cabs(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_fabs(long double _Complex __x) {return cabsl(__x);}
+
+#undef fabs
+#define fabs(__x) __tg_fabs(__tg_promote1((__x))(__x))
+
+// atan2
+
+static float
+    _TG_ATTRS
+    __tg_atan2(float __x, float __y) {return atan2f(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_atan2(double __x, double __y) {return atan2(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_atan2(long double __x, long double __y) {return atan2l(__x, __y);}
+
+#undef atan2
+#define atan2(__x, __y) __tg_atan2(__tg_promote2((__x), (__y))(__x), \
+                                   __tg_promote2((__x), (__y))(__y))
+
+// cbrt
+
+static float
+    _TG_ATTRS
+    __tg_cbrt(float __x) {return cbrtf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_cbrt(double __x) {return cbrt(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_cbrt(long double __x) {return cbrtl(__x);}
+
+#undef cbrt
+#define cbrt(__x) __tg_cbrt(__tg_promote1((__x))(__x))
+
+// ceil
+
+static float
+    _TG_ATTRS
+    __tg_ceil(float __x) {return ceilf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_ceil(double __x) {return ceil(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_ceil(long double __x) {return ceill(__x);}
+
+#undef ceil
+#define ceil(__x) __tg_ceil(__tg_promote1((__x))(__x))
+
+// copysign
+
+static float
+    _TG_ATTRS
+    __tg_copysign(float __x, float __y) {return copysignf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_copysign(double __x, double __y) {return copysign(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_copysign(long double __x, long double __y) {return copysignl(__x, __y);}
+
+#undef copysign
+#define copysign(__x, __y) __tg_copysign(__tg_promote2((__x), (__y))(__x), \
+                                         __tg_promote2((__x), (__y))(__y))
+
+// erf
+
+static float
+    _TG_ATTRS
+    __tg_erf(float __x) {return erff(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_erf(double __x) {return erf(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_erf(long double __x) {return erfl(__x);}
+
+#undef erf
+#define erf(__x) __tg_erf(__tg_promote1((__x))(__x))
+
+// erfc
+
+static float
+    _TG_ATTRS
+    __tg_erfc(float __x) {return erfcf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_erfc(double __x) {return erfc(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_erfc(long double __x) {return erfcl(__x);}
+
+#undef erfc
+#define erfc(__x) __tg_erfc(__tg_promote1((__x))(__x))
+
+// exp2
+
+static float
+    _TG_ATTRS
+    __tg_exp2(float __x) {return exp2f(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_exp2(double __x) {return exp2(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_exp2(long double __x) {return exp2l(__x);}
+
+#undef exp2
+#define exp2(__x) __tg_exp2(__tg_promote1((__x))(__x))
+
+// expm1
+
+static float
+    _TG_ATTRS
+    __tg_expm1(float __x) {return expm1f(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_expm1(double __x) {return expm1(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_expm1(long double __x) {return expm1l(__x);}
+
+#undef expm1
+#define expm1(__x) __tg_expm1(__tg_promote1((__x))(__x))
+
+// fdim
+
+static float
+    _TG_ATTRS
+    __tg_fdim(float __x, float __y) {return fdimf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_fdim(double __x, double __y) {return fdim(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_fdim(long double __x, long double __y) {return fdiml(__x, __y);}
+
+#undef fdim
+#define fdim(__x, __y) __tg_fdim(__tg_promote2((__x), (__y))(__x), \
+                                 __tg_promote2((__x), (__y))(__y))
+
+// floor
+
+static float
+    _TG_ATTRS
+    __tg_floor(float __x) {return floorf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_floor(double __x) {return floor(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_floor(long double __x) {return floorl(__x);}
+
+#undef floor
+#define floor(__x) __tg_floor(__tg_promote1((__x))(__x))
+
+// fma
+
+static float
+    _TG_ATTRS
+    __tg_fma(float __x, float __y, float __z)
+    {return fmaf(__x, __y, __z);}
+
+static double
+    _TG_ATTRS
+    __tg_fma(double __x, double __y, double __z)
+    {return fma(__x, __y, __z);}
+
+static long double
+    _TG_ATTRS
+    __tg_fma(long double __x, long double __y, long double __z)
+    {return fmal(__x, __y, __z);}
+
+#undef fma
+#define fma(__x, __y, __z)                                \
+        __tg_fma(__tg_promote3((__x), (__y), (__z))(__x), \
+                 __tg_promote3((__x), (__y), (__z))(__y), \
+                 __tg_promote3((__x), (__y), (__z))(__z))
+
+// fmax
+
+static float
+    _TG_ATTRS
+    __tg_fmax(float __x, float __y) {return fmaxf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_fmax(double __x, double __y) {return fmax(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_fmax(long double __x, long double __y) {return fmaxl(__x, __y);}
+
+#undef fmax
+#define fmax(__x, __y) __tg_fmax(__tg_promote2((__x), (__y))(__x), \
+                                 __tg_promote2((__x), (__y))(__y))
+
+// fmin
+
+static float
+    _TG_ATTRS
+    __tg_fmin(float __x, float __y) {return fminf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_fmin(double __x, double __y) {return fmin(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_fmin(long double __x, long double __y) {return fminl(__x, __y);}
+
+#undef fmin
+#define fmin(__x, __y) __tg_fmin(__tg_promote2((__x), (__y))(__x), \
+                                 __tg_promote2((__x), (__y))(__y))
+
+// fmod
+
+static float
+    _TG_ATTRS
+    __tg_fmod(float __x, float __y) {return fmodf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_fmod(double __x, double __y) {return fmod(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_fmod(long double __x, long double __y) {return fmodl(__x, __y);}
+
+#undef fmod
+#define fmod(__x, __y) __tg_fmod(__tg_promote2((__x), (__y))(__x), \
+                                 __tg_promote2((__x), (__y))(__y))
+
+// frexp
+
+static float
+    _TG_ATTRS
+    __tg_frexp(float __x, int* __y) {return frexpf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_frexp(double __x, int* __y) {return frexp(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_frexp(long double __x, int* __y) {return frexpl(__x, __y);}
+
+#undef frexp
+#define frexp(__x, __y) __tg_frexp(__tg_promote1((__x))(__x), __y)
+
+// hypot
+
+static float
+    _TG_ATTRS
+    __tg_hypot(float __x, float __y) {return hypotf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_hypot(double __x, double __y) {return hypot(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_hypot(long double __x, long double __y) {return hypotl(__x, __y);}
+
+#undef hypot
+#define hypot(__x, __y) __tg_hypot(__tg_promote2((__x), (__y))(__x), \
+                                   __tg_promote2((__x), (__y))(__y))
+
+// ilogb
+
+static int
+    _TG_ATTRS
+    __tg_ilogb(float __x) {return ilogbf(__x);}
+
+static int
+    _TG_ATTRS
+    __tg_ilogb(double __x) {return ilogb(__x);}
+
+static int
+    _TG_ATTRS
+    __tg_ilogb(long double __x) {return ilogbl(__x);}
+
+#undef ilogb
+#define ilogb(__x) __tg_ilogb(__tg_promote1((__x))(__x))
+
+// ldexp
+
+static float
+    _TG_ATTRS
+    __tg_ldexp(float __x, int __y) {return ldexpf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_ldexp(double __x, int __y) {return ldexp(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_ldexp(long double __x, int __y) {return ldexpl(__x, __y);}
+
+#undef ldexp
+#define ldexp(__x, __y) __tg_ldexp(__tg_promote1((__x))(__x), __y)
+
+// lgamma
+
+static float
+    _TG_ATTRS
+    __tg_lgamma(float __x) {return lgammaf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_lgamma(double __x) {return lgamma(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_lgamma(long double __x) {return lgammal(__x);}
+
+#undef lgamma
+#define lgamma(__x) __tg_lgamma(__tg_promote1((__x))(__x))
+
+// llrint
+
+static long long
+    _TG_ATTRS
+    __tg_llrint(float __x) {return llrintf(__x);}
+
+static long long
+    _TG_ATTRS
+    __tg_llrint(double __x) {return llrint(__x);}
+
+static long long
+    _TG_ATTRS
+    __tg_llrint(long double __x) {return llrintl(__x);}
+
+#undef llrint
+#define llrint(__x) __tg_llrint(__tg_promote1((__x))(__x))
+
+// llround
+
+static long long
+    _TG_ATTRS
+    __tg_llround(float __x) {return llroundf(__x);}
+
+static long long
+    _TG_ATTRS
+    __tg_llround(double __x) {return llround(__x);}
+
+static long long
+    _TG_ATTRS
+    __tg_llround(long double __x) {return llroundl(__x);}
+
+#undef llround
+#define llround(__x) __tg_llround(__tg_promote1((__x))(__x))
+
+// log10
+
+static float
+    _TG_ATTRS
+    __tg_log10(float __x) {return log10f(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_log10(double __x) {return log10(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_log10(long double __x) {return log10l(__x);}
+
+#undef log10
+#define log10(__x) __tg_log10(__tg_promote1((__x))(__x))
+
+// log1p
+
+static float
+    _TG_ATTRS
+    __tg_log1p(float __x) {return log1pf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_log1p(double __x) {return log1p(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_log1p(long double __x) {return log1pl(__x);}
+
+#undef log1p
+#define log1p(__x) __tg_log1p(__tg_promote1((__x))(__x))
+
+// log2
+
+static float
+    _TG_ATTRS
+    __tg_log2(float __x) {return log2f(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_log2(double __x) {return log2(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_log2(long double __x) {return log2l(__x);}
+
+#undef log2
+#define log2(__x) __tg_log2(__tg_promote1((__x))(__x))
+
+// logb
+
+static float
+    _TG_ATTRS
+    __tg_logb(float __x) {return logbf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_logb(double __x) {return logb(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_logb(long double __x) {return logbl(__x);}
+
+#undef logb
+#define logb(__x) __tg_logb(__tg_promote1((__x))(__x))
+
+// lrint
+
+static long
+    _TG_ATTRS
+    __tg_lrint(float __x) {return lrintf(__x);}
+
+static long
+    _TG_ATTRS
+    __tg_lrint(double __x) {return lrint(__x);}
+
+static long
+    _TG_ATTRS
+    __tg_lrint(long double __x) {return lrintl(__x);}
+
+#undef lrint
+#define lrint(__x) __tg_lrint(__tg_promote1((__x))(__x))
+
+// lround
+
+static long
+    _TG_ATTRS
+    __tg_lround(float __x) {return lroundf(__x);}
+
+static long
+    _TG_ATTRS
+    __tg_lround(double __x) {return lround(__x);}
+
+static long
+    _TG_ATTRS
+    __tg_lround(long double __x) {return lroundl(__x);}
+
+#undef lround
+#define lround(__x) __tg_lround(__tg_promote1((__x))(__x))
+
+// nearbyint
+
+static float
+    _TG_ATTRS
+    __tg_nearbyint(float __x) {return nearbyintf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_nearbyint(double __x) {return nearbyint(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_nearbyint(long double __x) {return nearbyintl(__x);}
+
+#undef nearbyint
+#define nearbyint(__x) __tg_nearbyint(__tg_promote1((__x))(__x))
+
+// nextafter
+
+static float
+    _TG_ATTRS
+    __tg_nextafter(float __x, float __y) {return nextafterf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_nextafter(double __x, double __y) {return nextafter(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_nextafter(long double __x, long double __y) {return nextafterl(__x, __y);}
+
+#undef nextafter
+#define nextafter(__x, __y) __tg_nextafter(__tg_promote2((__x), (__y))(__x), \
+                                           __tg_promote2((__x), (__y))(__y))
+
+// nexttoward
+
+static float
+    _TG_ATTRS
+    __tg_nexttoward(float __x, long double __y) {return nexttowardf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_nexttoward(double __x, long double __y) {return nexttoward(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_nexttoward(long double __x, long double __y) {return nexttowardl(__x, __y);}
+
+#undef nexttoward
+#define nexttoward(__x, __y) __tg_nexttoward(__tg_promote1((__x))(__x), (__y))
+
+// remainder
+
+static float
+    _TG_ATTRS
+    __tg_remainder(float __x, float __y) {return remainderf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_remainder(double __x, double __y) {return remainder(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_remainder(long double __x, long double __y) {return remainderl(__x, __y);}
+
+#undef remainder
+#define remainder(__x, __y) __tg_remainder(__tg_promote2((__x), (__y))(__x), \
+                                           __tg_promote2((__x), (__y))(__y))
+
+// remquo
+
+static float
+    _TG_ATTRS
+    __tg_remquo(float __x, float __y, int* __z)
+    {return remquof(__x, __y, __z);}
+
+static double
+    _TG_ATTRS
+    __tg_remquo(double __x, double __y, int* __z)
+    {return remquo(__x, __y, __z);}
+
+static long double
+    _TG_ATTRS
+    __tg_remquo(long double __x, long double __y, int* __z)
+    {return remquol(__x, __y, __z);}
+
+#undef remquo
+#define remquo(__x, __y, __z)                         \
+        __tg_remquo(__tg_promote2((__x), (__y))(__x), \
+                    __tg_promote2((__x), (__y))(__y), \
+                    (__z))
+
+// rint
+
+static float
+    _TG_ATTRS
+    __tg_rint(float __x) {return rintf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_rint(double __x) {return rint(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_rint(long double __x) {return rintl(__x);}
+
+#undef rint
+#define rint(__x) __tg_rint(__tg_promote1((__x))(__x))
+
+// round
+
+static float
+    _TG_ATTRS
+    __tg_round(float __x) {return roundf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_round(double __x) {return round(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_round(long double __x) {return roundl(__x);}
+
+#undef round
+#define round(__x) __tg_round(__tg_promote1((__x))(__x))
+
+// scalbn
+
+static float
+    _TG_ATTRS
+    __tg_scalbn(float __x, int __y) {return scalbnf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_scalbn(double __x, int __y) {return scalbn(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_scalbn(long double __x, int __y) {return scalbnl(__x, __y);}
+
+#undef scalbn
+#define scalbn(__x, __y) __tg_scalbn(__tg_promote1((__x))(__x), __y)
+
+// scalbln
+
+static float
+    _TG_ATTRS
+    __tg_scalbln(float __x, long __y) {return scalblnf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_scalbln(double __x, long __y) {return scalbln(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_scalbln(long double __x, long __y) {return scalblnl(__x, __y);}
+
+#undef scalbln
+#define scalbln(__x, __y) __tg_scalbln(__tg_promote1((__x))(__x), __y)
+
+// tgamma
+
+static float
+    _TG_ATTRS
+    __tg_tgamma(float __x) {return tgammaf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_tgamma(double __x) {return tgamma(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_tgamma(long double __x) {return tgammal(__x);}
+
+#undef tgamma
+#define tgamma(__x) __tg_tgamma(__tg_promote1((__x))(__x))
+
+// trunc
+
+static float
+    _TG_ATTRS
+    __tg_trunc(float __x) {return truncf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_trunc(double __x) {return trunc(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_trunc(long double __x) {return truncl(__x);}
+
+#undef trunc
+#define trunc(__x) __tg_trunc(__tg_promote1((__x))(__x))
+
+// carg
+
+static float
+    _TG_ATTRS
+    __tg_carg(float __x) {return atan2f(0.F, __x);}
+
+static double
+    _TG_ATTRS
+    __tg_carg(double __x) {return atan2(0., __x);}
+
+static long double
+    _TG_ATTRS
+    __tg_carg(long double __x) {return atan2l(0.L, __x);}
+
+static float
+    _TG_ATTRS
+    __tg_carg(float _Complex __x) {return cargf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_carg(double _Complex __x) {return carg(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_carg(long double _Complex __x) {return cargl(__x);}
+
+#undef carg
+#define carg(__x) __tg_carg(__tg_promote1((__x))(__x))
+
+// cimag
+
+static float
+    _TG_ATTRS
+    __tg_cimag(float __x) {return 0;}
+
+static double
+    _TG_ATTRS
+    __tg_cimag(double __x) {return 0;}
+
+static long double
+    _TG_ATTRS
+    __tg_cimag(long double __x) {return 0;}
+
+static float
+    _TG_ATTRS
+    __tg_cimag(float _Complex __x) {return cimagf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_cimag(double _Complex __x) {return cimag(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_cimag(long double _Complex __x) {return cimagl(__x);}
+
+#undef cimag
+#define cimag(__x) __tg_cimag(__tg_promote1((__x))(__x))
+
+// conj
+
+static float _Complex
+    _TG_ATTRS
+    __tg_conj(float __x) {return __x;}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_conj(double __x) {return __x;}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_conj(long double __x) {return __x;}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_conj(float _Complex __x) {return conjf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_conj(double _Complex __x) {return conj(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_conj(long double _Complex __x) {return conjl(__x);}
+
+#undef conj
+#define conj(__x) __tg_conj(__tg_promote1((__x))(__x))
+
+// cproj
+
+static float _Complex
+    _TG_ATTRS
+    __tg_cproj(float __x) {return cprojf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_cproj(double __x) {return cproj(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_cproj(long double __x) {return cprojl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_cproj(float _Complex __x) {return cprojf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_cproj(double _Complex __x) {return cproj(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_cproj(long double _Complex __x) {return cprojl(__x);}
+
+#undef cproj
+#define cproj(__x) __tg_cproj(__tg_promote1((__x))(__x))
+
+// creal
+
+static float
+    _TG_ATTRS
+    __tg_creal(float __x) {return __x;}
+
+static double
+    _TG_ATTRS
+    __tg_creal(double __x) {return __x;}
+
+static long double
+    _TG_ATTRS
+    __tg_creal(long double __x) {return __x;}
+
+static float
+    _TG_ATTRS
+    __tg_creal(float _Complex __x) {return crealf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_creal(double _Complex __x) {return creal(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_creal(long double _Complex __x) {return creall(__x);}
+
+#undef creal
+#define creal(__x) __tg_creal(__tg_promote1((__x))(__x))
+
+#undef _TG_ATTRSp
+#undef _TG_ATTRS
+
+#endif /* __cplusplus */
+#endif /* __TGMATH_H */
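A short illustration of the dispatch the overloaded __tg_* shims implement: the single sqrt spelling resolves to sqrtf, sqrt, or csqrt according to the promoted argument type (standard C99 <tgmath.h> behaviour; the values are arbitrary):

    #include <stdio.h>
    #include <tgmath.h>

    int main(void)
    {
        float f = 2.0f;
        double _Complex z = -1.0;

        /* __tg_promote keeps float as float and promotes integers to double,
         * so each call below picks the matching real or complex routine. */
        float           rf = sqrt(f);   /* dispatches to sqrtf   */
        double          rd = sqrt(2);   /* int -> double -> sqrt */
        double _Complex rz = sqrt(z);   /* complex -> csqrt      */

        printf("%f %f %f%+fi\n", rf, rd, creal(rz), cimag(rz));
        return 0;
    }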
diff --git a/23.0.3/clang-include/tmmintrin.h b/23.0.3/clang-include/tmmintrin.h
new file mode 100644
index 0000000..4238f5b
--- /dev/null
+++ b/23.0.3/clang-include/tmmintrin.h
@@ -0,0 +1,225 @@
+/*===---- tmmintrin.h - SSSE3 intrinsics -----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+ 
+#ifndef __TMMINTRIN_H
+#define __TMMINTRIN_H
+
+#ifndef __SSSE3__
+#error "SSSE3 instruction set not enabled"
+#else
+
+#include <pmmintrin.h>
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_abs_pi8(__m64 __a)
+{
+    return (__m64)__builtin_ia32_pabsb((__v8qi)__a);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_abs_epi8(__m128i __a)
+{
+    return (__m128i)__builtin_ia32_pabsb128((__v16qi)__a);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_abs_pi16(__m64 __a)
+{
+    return (__m64)__builtin_ia32_pabsw((__v4hi)__a);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_abs_epi16(__m128i __a)
+{
+    return (__m128i)__builtin_ia32_pabsw128((__v8hi)__a);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_abs_pi32(__m64 __a)
+{
+    return (__m64)__builtin_ia32_pabsd((__v2si)__a);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_abs_epi32(__m128i __a)
+{
+    return (__m128i)__builtin_ia32_pabsd128((__v4si)__a);
+}
+
+#define _mm_alignr_epi8(a, b, n) __extension__ ({ \
+  __m128i __a = (a); \
+  __m128i __b = (b); \
+  (__m128i)__builtin_ia32_palignr128((__v16qi)__a, (__v16qi)__b, (n)); })
+
+#define _mm_alignr_pi8(a, b, n) __extension__ ({ \
+  __m64 __a = (a); \
+  __m64 __b = (b); \
+  (__m64)__builtin_ia32_palignr((__v8qi)__a, (__v8qi)__b, (n)); })
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_hadd_epi16(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_phaddw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_hadd_epi32(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_phaddd128((__v4si)__a, (__v4si)__b);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_hadd_pi16(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_phaddw((__v4hi)__a, (__v4hi)__b);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_hadd_pi32(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_phaddd((__v2si)__a, (__v2si)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_hadds_epi16(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_phaddsw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_hadds_pi16(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_phaddsw((__v4hi)__a, (__v4hi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_hsub_epi16(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_phsubw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_hsub_epi32(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_phsubd128((__v4si)__a, (__v4si)__b);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_hsub_pi16(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_phsubw((__v4hi)__a, (__v4hi)__b);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_hsub_pi32(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_phsubd((__v2si)__a, (__v2si)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_hsubs_epi16(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_phsubsw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_hsubs_pi16(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_phsubsw((__v4hi)__a, (__v4hi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_maddubs_epi16(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_maddubs_pi16(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_pmaddubsw((__v8qi)__a, (__v8qi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_mulhrs_epi16(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_mulhrs_pi16(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_pmulhrsw((__v4hi)__a, (__v4hi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_shuffle_epi8(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_pshufb128((__v16qi)__a, (__v16qi)__b);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_shuffle_pi8(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_pshufb((__v8qi)__a, (__v8qi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sign_epi8(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_psignb128((__v16qi)__a, (__v16qi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sign_epi16(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_psignw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sign_epi32(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_psignd128((__v4si)__a, (__v4si)__b);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_sign_pi8(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_psignb((__v8qi)__a, (__v8qi)__b);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_sign_pi16(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_psignw((__v4hi)__a, (__v4hi)__b);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_sign_pi32(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_psignd((__v2si)__a, (__v2si)__b);
+}
+
+#endif /* __SSSE3__ */
+
+#endif /* __TMMINTRIN_H */
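A usage sketch for the SSSE3 wrappers above, assuming an x86 target built with -mssse3 and arbitrary test data: _mm_abs_epi8 takes per-byte absolute values and _mm_shuffle_epi8 reverses the byte order of a 128-bit register.

    #include <stdio.h>
    #include <tmmintrin.h>

    int main(void)
    {
        /* 16 signed bytes of test data. */
        __m128i v = _mm_setr_epi8(-1, 2, -3, 4, -5, 6, -7, 8,
                                  -9, 10, -11, 12, -13, 14, -15, 16);

        __m128i vabs = _mm_abs_epi8(v);                 /* pabsb  */

        /* Control mask that selects byte 15 first and byte 0 last. */
        __m128i ctrl = _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8,
                                     7, 6, 5, 4, 3, 2, 1, 0);
        __m128i vrev = _mm_shuffle_epi8(v, ctrl);       /* pshufb */

        unsigned char out[16];
        _mm_storeu_si128((__m128i *)out, vabs);
        printf("abs[0..3] = %u %u %u %u\n", out[0], out[1], out[2], out[3]);
        _mm_storeu_si128((__m128i *)out, vrev);
        printf("rev[0]    = %u\n", out[0]);             /* byte 15 of v: 16 */
        return 0;
    }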
diff --git a/23.0.3/clang-include/unwind.h b/23.0.3/clang-include/unwind.h
new file mode 100644
index 0000000..303d792
--- /dev/null
+++ b/23.0.3/clang-include/unwind.h
@@ -0,0 +1,282 @@
+/*===---- unwind.h - Stack unwinding ----------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/* See "Data Definitions for libgcc_s" in the Linux Standard Base.*/
+
+#ifndef __CLANG_UNWIND_H
+#define __CLANG_UNWIND_H
+
+#if defined(__APPLE__) && __has_include_next(<unwind.h>)
+/* Darwin (from 11.x on) provides an unwind.h. If that's available,
+ * use it. libunwind wraps some of its definitions in #ifdef _GNU_SOURCE,
+ * so define that around the include.*/
+# ifndef _GNU_SOURCE
+#  define _SHOULD_UNDEFINE_GNU_SOURCE
+#  define _GNU_SOURCE
+# endif
+// libunwind's unwind.h reflects the current visibility.  However, Mozilla
+// builds with -fvisibility=hidden and relies on gcc's unwind.h to reset the
+// visibility to default and export its contents.  gcc also allows users to
+// override its override by #defining HIDE_EXPORTS (but note, this only obeys
+// the user's -fvisibility setting; it doesn't hide any exports on its own).  We
+// imitate gcc's header here:
+# ifdef HIDE_EXPORTS
+#  include_next <unwind.h>
+# else
+#  pragma GCC visibility push(default)
+#  include_next <unwind.h>
+#  pragma GCC visibility pop
+# endif
+# ifdef _SHOULD_UNDEFINE_GNU_SOURCE
+#  undef _GNU_SOURCE
+#  undef _SHOULD_UNDEFINE_GNU_SOURCE
+# endif
+#else
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* It is a bit strange for a header to play with the visibility of the
+   symbols it declares, but this matches gcc's behavior and some programs
+   depend on it */
+#ifndef HIDE_EXPORTS
+#pragma GCC visibility push(default)
+#endif
+
+typedef uintptr_t _Unwind_Word;
+typedef intptr_t _Unwind_Sword;
+typedef uintptr_t _Unwind_Ptr;
+typedef uintptr_t _Unwind_Internal_Ptr;
+typedef uint64_t _Unwind_Exception_Class;
+
+typedef intptr_t _sleb128_t;
+typedef uintptr_t _uleb128_t;
+
+struct _Unwind_Context;
+struct _Unwind_Exception;
+typedef enum {
+  _URC_NO_REASON = 0,
+  _URC_FOREIGN_EXCEPTION_CAUGHT = 1,
+
+  _URC_FATAL_PHASE2_ERROR = 2,
+  _URC_FATAL_PHASE1_ERROR = 3,
+  _URC_NORMAL_STOP = 4,
+
+  _URC_END_OF_STACK = 5,
+  _URC_HANDLER_FOUND = 6,
+  _URC_INSTALL_CONTEXT = 7,
+  _URC_CONTINUE_UNWIND = 8
+} _Unwind_Reason_Code;
+
+typedef enum {
+  _UA_SEARCH_PHASE = 1,
+  _UA_CLEANUP_PHASE = 2,
+
+  _UA_HANDLER_FRAME = 4,
+  _UA_FORCE_UNWIND = 8,
+  _UA_END_OF_STACK = 16 /* gcc extension to C++ ABI */
+} _Unwind_Action;
+
+typedef void (*_Unwind_Exception_Cleanup_Fn)(_Unwind_Reason_Code,
+                                             struct _Unwind_Exception *);
+
+struct _Unwind_Exception {
+  _Unwind_Exception_Class exception_class;
+  _Unwind_Exception_Cleanup_Fn exception_cleanup;
+  _Unwind_Word private_1;
+  _Unwind_Word private_2;
+  /* The Itanium ABI requires that _Unwind_Exception objects are "double-word
+   * aligned".  GCC has interpreted this to mean "use the maximum useful
+   * alignment for the target"; so do we. */
+} __attribute__((__aligned__));
+
+typedef _Unwind_Reason_Code (*_Unwind_Stop_Fn)(int, _Unwind_Action,
+                                               _Unwind_Exception_Class,
+                                               struct _Unwind_Exception *,
+                                               struct _Unwind_Context *,
+                                               void *);
+
+typedef _Unwind_Reason_Code (*_Unwind_Personality_Fn)(
+    int, _Unwind_Action, _Unwind_Exception_Class, struct _Unwind_Exception *,
+    struct _Unwind_Context *);
+typedef _Unwind_Personality_Fn __personality_routine;
+
+typedef _Unwind_Reason_Code (*_Unwind_Trace_Fn)(struct _Unwind_Context *,
+                                                void *);
+
+#if defined(__arm__) && !defined(__APPLE__)
+
+typedef enum {
+  _UVRSC_CORE = 0,        /* integer register */
+  _UVRSC_VFP = 1,         /* vfp */
+  _UVRSC_WMMXD = 3,       /* Intel WMMX data register */
+  _UVRSC_WMMXC = 4        /* Intel WMMX control register */
+} _Unwind_VRS_RegClass;
+
+typedef enum {
+  _UVRSD_UINT32 = 0,
+  _UVRSD_VFPX = 1,
+  _UVRSD_UINT64 = 3,
+  _UVRSD_FLOAT = 4,
+  _UVRSD_DOUBLE = 5
+} _Unwind_VRS_DataRepresentation;
+
+typedef enum {
+  _UVRSR_OK = 0,
+  _UVRSR_NOT_IMPLEMENTED = 1,
+  _UVRSR_FAILED = 2
+} _Unwind_VRS_Result;
+
+_Unwind_VRS_Result _Unwind_VRS_Get(struct _Unwind_Context *__context,
+  _Unwind_VRS_RegClass __regclass,
+  uint32_t __regno,
+  _Unwind_VRS_DataRepresentation __representation,
+  void *__valuep);
+
+_Unwind_VRS_Result _Unwind_VRS_Set(struct _Unwind_Context *__context,
+  _Unwind_VRS_RegClass __regclass,
+  uint32_t __regno,
+  _Unwind_VRS_DataRepresentation __representation,
+  void *__valuep);
+
+static __inline__
+_Unwind_Word _Unwind_GetGR(struct _Unwind_Context *__context, int __index) {
+  _Unwind_Word __value;
+  _Unwind_VRS_Get(__context, _UVRSC_CORE, __index, _UVRSD_UINT32, &__value);
+  return __value;
+}
+
+static __inline__
+void _Unwind_SetGR(struct _Unwind_Context *__context, int __index,
+                   _Unwind_Word __value) {
+  _Unwind_VRS_Set(__context, _UVRSC_CORE, __index, _UVRSD_UINT32, &__value);
+}
+
+static __inline__
+_Unwind_Word _Unwind_GetIP(struct _Unwind_Context *__context) {
+  _Unwind_Word __ip = _Unwind_GetGR(__context, 15);
+  return __ip & ~(_Unwind_Word)(0x1); /* Remove thumb mode bit. */
+}
+
+static __inline__
+void _Unwind_SetIP(struct _Unwind_Context *__context, _Unwind_Word __value) {
+  _Unwind_Word __thumb_mode_bit = _Unwind_GetGR(__context, 15) & 0x1;
+  _Unwind_SetGR(__context, 15, __value | __thumb_mode_bit);
+}
+#else
+_Unwind_Word _Unwind_GetGR(struct _Unwind_Context *, int);
+void _Unwind_SetGR(struct _Unwind_Context *, int, _Unwind_Word);
+
+_Unwind_Word _Unwind_GetIP(struct _Unwind_Context *);
+void _Unwind_SetIP(struct _Unwind_Context *, _Unwind_Word);
+#endif
+
+
+_Unwind_Word _Unwind_GetIPInfo(struct _Unwind_Context *, int *);
+
+_Unwind_Word _Unwind_GetCFA(struct _Unwind_Context *);
+
+_Unwind_Word _Unwind_GetBSP(struct _Unwind_Context *);
+
+void *_Unwind_GetLanguageSpecificData(struct _Unwind_Context *);
+
+_Unwind_Ptr _Unwind_GetRegionStart(struct _Unwind_Context *);
+
+/* DWARF EH functions; currently not available on Darwin/ARM */
+#if !defined(__APPLE__) || !defined(__arm__)
+
+_Unwind_Reason_Code _Unwind_RaiseException(struct _Unwind_Exception *);
+_Unwind_Reason_Code _Unwind_ForcedUnwind(struct _Unwind_Exception *,
+                                         _Unwind_Stop_Fn, void *);
+void _Unwind_DeleteException(struct _Unwind_Exception *);
+void _Unwind_Resume(struct _Unwind_Exception *);
+_Unwind_Reason_Code _Unwind_Resume_or_Rethrow(struct _Unwind_Exception *);
+
+#endif
+
+_Unwind_Reason_Code _Unwind_Backtrace(_Unwind_Trace_Fn, void *);
+
+/* setjmp(3)/longjmp(3) stuff */
+typedef struct SjLj_Function_Context *_Unwind_FunctionContext_t;
+
+void _Unwind_SjLj_Register(_Unwind_FunctionContext_t);
+void _Unwind_SjLj_Unregister(_Unwind_FunctionContext_t);
+_Unwind_Reason_Code _Unwind_SjLj_RaiseException(struct _Unwind_Exception *);
+_Unwind_Reason_Code _Unwind_SjLj_ForcedUnwind(struct _Unwind_Exception *,
+                                              _Unwind_Stop_Fn, void *);
+void _Unwind_SjLj_Resume(struct _Unwind_Exception *);
+_Unwind_Reason_Code _Unwind_SjLj_Resume_or_Rethrow(struct _Unwind_Exception *);
+
+void *_Unwind_FindEnclosingFunction(void *);
+
+#ifdef __APPLE__
+
+_Unwind_Ptr _Unwind_GetDataRelBase(struct _Unwind_Context *)
+    __attribute__((__unavailable__));
+_Unwind_Ptr _Unwind_GetTextRelBase(struct _Unwind_Context *)
+    __attribute__((__unavailable__));
+
+/* Darwin-specific functions */
+void __register_frame(const void *);
+void __deregister_frame(const void *);
+
+struct dwarf_eh_bases {
+  uintptr_t tbase;
+  uintptr_t dbase;
+  uintptr_t func;
+};
+void *_Unwind_Find_FDE(const void *, struct dwarf_eh_bases *);
+
+void __register_frame_info_bases(const void *, void *, void *, void *)
+  __attribute__((__unavailable__));
+void __register_frame_info(const void *, void *) __attribute__((__unavailable__));
+void __register_frame_info_table_bases(const void *, void*, void *, void *)
+  __attribute__((__unavailable__));
+void __register_frame_info_table(const void *, void *)
+  __attribute__((__unavailable__));
+void __register_frame_table(const void *) __attribute__((__unavailable__));
+void __deregister_frame_info(const void *) __attribute__((__unavailable__));
+void __deregister_frame_info_bases(const void *)__attribute__((__unavailable__));
+
+#else
+
+_Unwind_Ptr _Unwind_GetDataRelBase(struct _Unwind_Context *);
+_Unwind_Ptr _Unwind_GetTextRelBase(struct _Unwind_Context *);
+
+#endif
+
+
+#ifndef HIDE_EXPORTS
+#pragma GCC visibility pop
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
+#endif /* __CLANG_UNWIND_H */
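
unwind.h declares the base-ABI unwinder entry points used by C++ exception handling and by backtrace utilities. As a minimal sketch (illustrative only; assumes the toolchain links an unwinder such as libgcc_s or libunwind, and the file name is made up), a program can walk its own stack with _Unwind_Backtrace and _Unwind_GetIP:

    /* backtrace_demo.c -- hypothetical sketch; the default C toolchain usually
     * links the unwinder (libgcc_s or libunwind) automatically. */
    #include <unwind.h>
    #include <stdio.h>

    static _Unwind_Reason_Code frame_cb(struct _Unwind_Context *ctx, void *arg)
    {
        int *count = (int *)arg;
        /* _Unwind_GetIP returns the instruction pointer of the current frame. */
        printf("frame %d: ip = %p\n", (*count)++, (void *)_Unwind_GetIP(ctx));
        return _URC_NO_REASON;          /* keep walking */
    }

    static void leaf(void)
    {
        int count = 0;
        _Unwind_Backtrace(frame_cb, &count);
    }

    int main(void)
    {
        leaf();
        return 0;
    }
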
diff --git a/23.0.3/clang-include/vadefs.h b/23.0.3/clang-include/vadefs.h
new file mode 100644
index 0000000..7fe9a74
--- /dev/null
+++ b/23.0.3/clang-include/vadefs.h
@@ -0,0 +1,65 @@
+/* ===-------- vadefs.h ---------------------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/* Only include this if we are aiming for MSVC compatibility. */
+#ifndef _MSC_VER
+#include_next <vadefs.h>
+#else
+
+#ifndef __clang_vadefs_h
+#define __clang_vadefs_h
+
+#include_next <vadefs.h>
+
+/* Override macros from vadefs.h with definitions that work with Clang. */
+#ifdef _crt_va_start
+#undef _crt_va_start
+#define _crt_va_start(ap, param) __builtin_va_start(ap, param)
+#endif
+#ifdef _crt_va_end
+#undef _crt_va_end
+#define _crt_va_end(ap)          __builtin_va_end(ap)
+#endif
+#ifdef _crt_va_arg
+#undef _crt_va_arg
+#define _crt_va_arg(ap, type)    __builtin_va_arg(ap, type)
+#endif
+
+/* VS 2015 switched to double underscore names, which is an improvement, but now
+ * we have to intercept those names too.
+ */
+#ifdef __crt_va_start
+#undef __crt_va_start
+#define __crt_va_start(ap, param) __builtin_va_start(ap, param)
+#endif
+#ifdef __crt_va_end
+#undef __crt_va_end
+#define __crt_va_end(ap)          __builtin_va_end(ap)
+#endif
+#ifdef __crt_va_arg
+#undef __crt_va_arg
+#define __crt_va_arg(ap, type)    __builtin_va_arg(ap, type)
+#endif
+
+#endif
+#endif
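
The overrides above only take effect when _MSC_VER is defined; elsewhere the header simply forwards to the system vadefs.h. A small sketch (hypothetical; written with the standard <stdarg.h> macros, which under MSVC-compatibility mode expand through the same __builtin_va_* forms the _crt_va_* redefinitions use):

    /* crtva_demo.c -- hypothetical sketch; portable, builds anywhere. */
    #include <stdarg.h>
    #include <stdio.h>

    /* va_start/va_arg/va_end correspond to _crt_va_start/_crt_va_arg/_crt_va_end
     * in MSVC-compatibility mode; both spellings expand to __builtin_va_*. */
    static int sum_ints(int n, ...)
    {
        va_list ap;
        int total = 0;
        va_start(ap, n);
        for (int i = 0; i < n; ++i)
            total += va_arg(ap, int);
        va_end(ap);
        return total;
    }

    int main(void)
    {
        printf("%d\n", sum_ints(3, 1, 2, 3));   /* prints 6 */
        return 0;
    }
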
diff --git a/23.0.3/clang-include/varargs.h b/23.0.3/clang-include/varargs.h
new file mode 100644
index 0000000..b5477d0
--- /dev/null
+++ b/23.0.3/clang-include/varargs.h
@@ -0,0 +1,26 @@
+/*===---- varargs.h - Variable argument handling -------------------------------------===
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+* THE SOFTWARE.
+*
+*===-----------------------------------------------------------------------===
+*/
+#ifndef __VARARGS_H
+#define __VARARGS_H
+  #error "Please use <stdarg.h> instead of <varargs.h>"
+#endif
diff --git a/23.0.3/clang-include/wmmintrin.h b/23.0.3/clang-include/wmmintrin.h
new file mode 100644
index 0000000..369e3c2
--- /dev/null
+++ b/23.0.3/clang-include/wmmintrin.h
@@ -0,0 +1,42 @@
+/*===---- wmmintrin.h - AES intrinsics ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef _WMMINTRIN_H
+#define _WMMINTRIN_H
+
+#include <emmintrin.h>
+
+#if !defined (__AES__) && !defined (__PCLMUL__)
+# error "AES/PCLMUL instructions not enabled"
+#else
+
+#ifdef __AES__
+#include <__wmmintrin_aes.h>
+#endif /* __AES__ */
+
+#ifdef __PCLMUL__
+#include <__wmmintrin_pclmul.h>
+#endif /* __PCLMUL__ */
+
+#endif /* __AES__ || __PCLMUL__ */
+#endif /* _WMMINTRIN_H */
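
wmmintrin.h is only a dispatcher: with -maes it pulls in __wmmintrin_aes.h and with -mpclmul it pulls in __wmmintrin_pclmul.h. A hedged sketch of one AES round using the widely available _mm_aesenc_si128 intrinsic (assumes a build with -msse2 -maes; file and function names are made up):

    /* aes_demo.c -- hypothetical sketch; build with: cc -msse2 -maes aes_demo.c */
    #include <wmmintrin.h>

    /* One AES encryption round (ShiftRows, SubBytes, MixColumns, XOR with the
     * round key). Not a complete AES implementation. */
    __m128i aes_one_round(__m128i state, __m128i round_key)
    {
        return _mm_aesenc_si128(state, round_key);
    }
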
diff --git a/23.0.3/clang-include/x86intrin.h b/23.0.3/clang-include/x86intrin.h
new file mode 100644
index 0000000..21a43da
--- /dev/null
+++ b/23.0.3/clang-include/x86intrin.h
@@ -0,0 +1,81 @@
+/*===---- x86intrin.h - X86 intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __X86INTRIN_H
+#define __X86INTRIN_H
+
+#include <ia32intrin.h>
+
+#include <immintrin.h>
+
+#ifdef __3dNOW__
+#include <mm3dnow.h>
+#endif
+
+#ifdef __BMI__
+#include <bmiintrin.h>
+#endif
+
+#ifdef __BMI2__
+#include <bmi2intrin.h>
+#endif
+
+#ifdef __LZCNT__
+#include <lzcntintrin.h>
+#endif
+
+#ifdef __POPCNT__
+#include <popcntintrin.h>
+#endif
+
+#ifdef __RDSEED__
+#include <rdseedintrin.h>
+#endif
+
+#ifdef __PRFCHW__
+#include <prfchwintrin.h>
+#endif
+
+#ifdef __SSE4A__
+#include <ammintrin.h>
+#endif
+
+#ifdef __FMA4__
+#include <fma4intrin.h>
+#endif
+
+#ifdef __XOP__
+#include <xopintrin.h>
+#endif
+
+#ifdef __TBM__
+#include <tbmintrin.h>
+#endif
+
+#ifdef __F16C__
+#include <f16cintrin.h>
+#endif
+
+/* FIXME: LWP */
+
+#endif /* __X86INTRIN_H */
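
x86intrin.h is an umbrella header: each sub-header is included only when the matching feature macro (set by the corresponding -m<feature> flag) is defined. For instance, popcntintrin.h is only pulled in under __POPCNT__; a minimal sketch (hypothetical file name, -mpopcnt assumed):

    /* popcnt_demo.c -- hypothetical sketch; build with: cc -mpopcnt popcnt_demo.c */
    #include <x86intrin.h>
    #include <stdio.h>

    int main(void)
    {
    #ifdef __POPCNT__
        /* _mm_popcnt_u32 comes from popcntintrin.h via x86intrin.h. */
        printf("popcount(0xF0F0) = %d\n", _mm_popcnt_u32(0xF0F0u));
    #else
        printf("POPCNT not enabled at compile time\n");
    #endif
        return 0;
    }
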
diff --git a/23.0.3/clang-include/xmmintrin.h b/23.0.3/clang-include/xmmintrin.h
new file mode 100644
index 0000000..d1afe81
--- /dev/null
+++ b/23.0.3/clang-include/xmmintrin.h
@@ -0,0 +1,1003 @@
+/*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+ 
+#ifndef __XMMINTRIN_H
+#define __XMMINTRIN_H
+ 
+#ifndef __SSE__
+#error "SSE instruction set not enabled"
+#else
+
+#include <mmintrin.h>
+
+typedef int __v4si __attribute__((__vector_size__(16)));
+typedef float __v4sf __attribute__((__vector_size__(16)));
+typedef float __m128 __attribute__((__vector_size__(16)));
+
+/* This header should only be included in a hosted environment as it depends on
+ * a standard library to provide allocation routines. */
+#if __STDC_HOSTED__
+#include <mm_malloc.h>
+#endif
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_add_ss(__m128 __a, __m128 __b)
+{
+  __a[0] += __b[0];
+  return __a;
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_add_ps(__m128 __a, __m128 __b)
+{
+  return __a + __b;
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_sub_ss(__m128 __a, __m128 __b)
+{
+  __a[0] -= __b[0];
+  return __a;
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_sub_ps(__m128 __a, __m128 __b)
+{
+  return __a - __b;
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_mul_ss(__m128 __a, __m128 __b)
+{
+  __a[0] *= __b[0];
+  return __a;
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_mul_ps(__m128 __a, __m128 __b)
+{
+  return __a * __b;
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_div_ss(__m128 __a, __m128 __b)
+{
+  __a[0] /= __b[0];
+  return __a;
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_div_ps(__m128 __a, __m128 __b)
+{
+  return __a / __b;
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_sqrt_ss(__m128 __a)
+{
+  __m128 __c = __builtin_ia32_sqrtss(__a);
+  return (__m128) { __c[0], __a[1], __a[2], __a[3] };
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_sqrt_ps(__m128 __a)
+{
+  return __builtin_ia32_sqrtps(__a);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_rcp_ss(__m128 __a)
+{
+  __m128 __c = __builtin_ia32_rcpss(__a);
+  return (__m128) { __c[0], __a[1], __a[2], __a[3] };
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_rcp_ps(__m128 __a)
+{
+  return __builtin_ia32_rcpps(__a);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_rsqrt_ss(__m128 __a)
+{
+  __m128 __c = __builtin_ia32_rsqrtss(__a);
+  return (__m128) { __c[0], __a[1], __a[2], __a[3] };
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_rsqrt_ps(__m128 __a)
+{
+  return __builtin_ia32_rsqrtps(__a);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_min_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_minss(__a, __b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_min_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_minps(__a, __b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_max_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_maxss(__a, __b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_max_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_maxps(__a, __b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_and_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)((__v4si)__a & (__v4si)__b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_andnot_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)(~(__v4si)__a & (__v4si)__b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_or_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)((__v4si)__a | (__v4si)__b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_xor_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)((__v4si)__a ^ (__v4si)__b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpeq_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpeqss(__a, __b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpeq_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpeqps(__a, __b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmplt_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpltss(__a, __b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmplt_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpltps(__a, __b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmple_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpless(__a, __b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmple_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpleps(__a, __b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpgt_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_shufflevector(__a,
+                                         __builtin_ia32_cmpltss(__b, __a),
+                                         4, 1, 2, 3);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpgt_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpltps(__b, __a);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpge_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_shufflevector(__a,
+                                         __builtin_ia32_cmpless(__b, __a),
+                                         4, 1, 2, 3);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpge_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpleps(__b, __a);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpneq_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpneqss(__a, __b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpneq_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpneqps(__a, __b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpnlt_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpnltss(__a, __b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpnlt_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpnltps(__a, __b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpnle_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpnless(__a, __b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpnle_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpnleps(__a, __b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpngt_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_shufflevector(__a,
+                                         __builtin_ia32_cmpnltss(__b, __a),
+                                         4, 1, 2, 3);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpngt_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpnltps(__b, __a);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpnge_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_shufflevector(__a,
+                                         __builtin_ia32_cmpnless(__b, __a),
+                                         4, 1, 2, 3);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpnge_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpnleps(__b, __a);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpord_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpordss(__a, __b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpord_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpordps(__a, __b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpunord_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpunordss(__a, __b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpunord_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpunordps(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_comieq_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_comieq(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_comilt_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_comilt(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_comile_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_comile(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_comigt_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_comigt(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_comige_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_comige(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_comineq_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_comineq(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_ucomieq_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_ucomieq(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_ucomilt_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_ucomilt(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_ucomile_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_ucomile(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_ucomigt_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_ucomigt(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_ucomige_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_ucomige(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_ucomineq_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_ucomineq(__a, __b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_cvtss_si32(__m128 __a)
+{
+  return __builtin_ia32_cvtss2si(__a);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_cvt_ss2si(__m128 __a)
+{
+  return _mm_cvtss_si32(__a);
+}
+
+#ifdef __x86_64__
+
+static __inline__ long long __attribute__((__always_inline__, __nodebug__))
+_mm_cvtss_si64(__m128 __a)
+{
+  return __builtin_ia32_cvtss2si64(__a);
+}
+
+#endif
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtps_pi32(__m128 __a)
+{
+  return (__m64)__builtin_ia32_cvtps2pi(__a);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_cvt_ps2pi(__m128 __a)
+{
+  return _mm_cvtps_pi32(__a);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_cvttss_si32(__m128 __a)
+{
+  return __a[0];
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_cvtt_ss2si(__m128 __a)
+{
+  return _mm_cvttss_si32(__a);
+}
+
+static __inline__ long long __attribute__((__always_inline__, __nodebug__))
+_mm_cvttss_si64(__m128 __a)
+{
+  return __a[0];
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_cvttps_pi32(__m128 __a)
+{
+  return (__m64)__builtin_ia32_cvttps2pi(__a);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtt_ps2pi(__m128 __a)
+{
+  return _mm_cvttps_pi32(__a);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtsi32_ss(__m128 __a, int __b)
+{
+  __a[0] = __b;
+  return __a;
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cvt_si2ss(__m128 __a, int __b)
+{
+  return _mm_cvtsi32_ss(__a, __b);
+}
+
+#ifdef __x86_64__
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtsi64_ss(__m128 __a, long long __b)
+{
+  __a[0] = __b;
+  return __a;
+}
+
+#endif
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtpi32_ps(__m128 __a, __m64 __b)
+{
+  return __builtin_ia32_cvtpi2ps(__a, (__v2si)__b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cvt_pi2ps(__m128 __a, __m64 __b)
+{
+  return _mm_cvtpi32_ps(__a, __b);
+}
+
+static __inline__ float __attribute__((__always_inline__, __nodebug__))
+_mm_cvtss_f32(__m128 __a)
+{
+  return __a[0];
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_loadh_pi(__m128 __a, const __m64 *__p)
+{
+  typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
+  struct __mm_loadh_pi_struct {
+    __mm_loadh_pi_v2f32 __u;
+  } __attribute__((__packed__, __may_alias__));
+  __mm_loadh_pi_v2f32 __b = ((struct __mm_loadh_pi_struct*)__p)->__u;
+  __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
+  return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_loadl_pi(__m128 __a, const __m64 *__p)
+{
+  typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
+  struct __mm_loadl_pi_struct {
+    __mm_loadl_pi_v2f32 __u;
+  } __attribute__((__packed__, __may_alias__));
+  __mm_loadl_pi_v2f32 __b = ((struct __mm_loadl_pi_struct*)__p)->__u;
+  __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
+  return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_load_ss(const float *__p)
+{
+  struct __mm_load_ss_struct {
+    float __u;
+  } __attribute__((__packed__, __may_alias__));
+  float __u = ((struct __mm_load_ss_struct*)__p)->__u;
+  return (__m128){ __u, 0, 0, 0 };
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_load1_ps(const float *__p)
+{
+  struct __mm_load1_ps_struct {
+    float __u;
+  } __attribute__((__packed__, __may_alias__));
+  float __u = ((struct __mm_load1_ps_struct*)__p)->__u;
+  return (__m128){ __u, __u, __u, __u };
+}
+
+#define _mm_load_ps1(p) _mm_load1_ps(p)
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_load_ps(const float *__p)
+{
+  return *(__m128*)__p;
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_loadu_ps(const float *__p)
+{
+  struct __loadu_ps {
+    __m128 __v;
+  } __attribute__((__packed__, __may_alias__));
+  return ((struct __loadu_ps*)__p)->__v;
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_loadr_ps(const float *__p)
+{
+  __m128 __a = _mm_load_ps(__p);
+  return __builtin_shufflevector(__a, __a, 3, 2, 1, 0);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_set_ss(float __w)
+{
+  return (__m128){ __w, 0, 0, 0 };
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_set1_ps(float __w)
+{
+  return (__m128){ __w, __w, __w, __w };
+}
+
+/* Microsoft specific. */
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_set_ps1(float __w)
+{
+    return _mm_set1_ps(__w);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_set_ps(float __z, float __y, float __x, float __w)
+{
+  return (__m128){ __w, __x, __y, __z };
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_setr_ps(float __z, float __y, float __x, float __w)
+{
+  return (__m128){ __z, __y, __x, __w };
+}
+
+static __inline__ __m128 __attribute__((__always_inline__))
+_mm_setzero_ps(void)
+{
+  return (__m128){ 0, 0, 0, 0 };
+}
+
+static __inline__ void __attribute__((__always_inline__))
+_mm_storeh_pi(__m64 *__p, __m128 __a)
+{
+  __builtin_ia32_storehps((__v2si *)__p, __a);
+}
+
+static __inline__ void __attribute__((__always_inline__))
+_mm_storel_pi(__m64 *__p, __m128 __a)
+{
+  __builtin_ia32_storelps((__v2si *)__p, __a);
+}
+
+static __inline__ void __attribute__((__always_inline__))
+_mm_store_ss(float *__p, __m128 __a)
+{
+  struct __mm_store_ss_struct {
+    float __u;
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __mm_store_ss_struct*)__p)->__u = __a[0];
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_storeu_ps(float *__p, __m128 __a)
+{
+  __builtin_ia32_storeups(__p, __a);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_store1_ps(float *__p, __m128 __a)
+{
+  __a = __builtin_shufflevector(__a, __a, 0, 0, 0, 0);
+  _mm_storeu_ps(__p, __a);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_store_ps1(float *__p, __m128 __a)
+{
+    return _mm_store1_ps(__p, __a);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_store_ps(float *__p, __m128 __a)
+{
+  *(__m128 *)__p = __a;
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_storer_ps(float *__p, __m128 __a)
+{
+  __a = __builtin_shufflevector(__a, __a, 3, 2, 1, 0);
+  _mm_store_ps(__p, __a);
+}
+
+#define _MM_HINT_T0 3
+#define _MM_HINT_T1 2
+#define _MM_HINT_T2 1
+#define _MM_HINT_NTA 0
+
+#ifndef _MSC_VER
+/* FIXME: We have to #define this because "sel" must be a constant integer, and
+   Sema doesn't do any form of constant propagation yet. */
+
+#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), 0, (sel)))
+#endif
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_stream_pi(__m64 *__p, __m64 __a)
+{
+  __builtin_ia32_movntq(__p, __a);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_stream_ps(float *__p, __m128 __a)
+{
+  __builtin_ia32_movntps(__p, __a);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_sfence(void)
+{
+  __builtin_ia32_sfence();
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_extract_pi16(__m64 __a, int __n)
+{
+  __v4hi __b = (__v4hi)__a;
+  return (unsigned short)__b[__n & 3];
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_insert_pi16(__m64 __a, int __d, int __n)
+{
+   __v4hi __b = (__v4hi)__a;
+   __b[__n & 3] = __d;
+   return (__m64)__b;
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_max_pi16(__m64 __a, __m64 __b)
+{
+  return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_max_pu8(__m64 __a, __m64 __b)
+{
+  return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_min_pi16(__m64 __a, __m64 __b)
+{
+  return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_min_pu8(__m64 __a, __m64 __b)
+{
+  return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_movemask_pi8(__m64 __a)
+{
+  return __builtin_ia32_pmovmskb((__v8qi)__a);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_mulhi_pu16(__m64 __a, __m64 __b)
+{
+  return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b);
+}
+
+#define _mm_shuffle_pi16(a, n) __extension__ ({ \
+  __m64 __a = (a); \
+  (__m64)__builtin_ia32_pshufw((__v4hi)__a, (n)); })
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
+{
+  __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_avg_pu8(__m64 __a, __m64 __b)
+{
+  return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_avg_pu16(__m64 __a, __m64 __b)
+{
+  return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_sad_pu8(__m64 __a, __m64 __b)
+{
+  return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b);
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+_mm_getcsr(void)
+{
+  return __builtin_ia32_stmxcsr();
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_setcsr(unsigned int __i)
+{
+  __builtin_ia32_ldmxcsr(__i);
+}
+
+#define _mm_shuffle_ps(a, b, mask) __extension__ ({ \
+  __m128 __a = (a); \
+  __m128 __b = (b); \
+  (__m128)__builtin_shufflevector((__v4sf)__a, (__v4sf)__b, \
+                                  (mask) & 0x3, ((mask) & 0xc) >> 2, \
+                                  (((mask) & 0x30) >> 4) + 4, \
+                                  (((mask) & 0xc0) >> 6) + 4); })
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_unpackhi_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_shufflevector(__a, __b, 2, 6, 3, 7);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_unpacklo_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_shufflevector(__a, __b, 0, 4, 1, 5);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_move_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_shufflevector(__a, __b, 4, 1, 2, 3);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_movehl_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_shufflevector(__a, __b, 6, 7, 2, 3);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_movelh_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_shufflevector(__a, __b, 0, 1, 4, 5);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtpi16_ps(__m64 __a)
+{
+  __m64 __b, __c;
+  __m128 __r;
+
+  __b = _mm_setzero_si64();
+  __b = _mm_cmpgt_pi16(__b, __a);
+  __c = _mm_unpackhi_pi16(__a, __b);
+  __r = _mm_setzero_ps();
+  __r = _mm_cvtpi32_ps(__r, __c);
+  __r = _mm_movelh_ps(__r, __r);
+  __c = _mm_unpacklo_pi16(__a, __b);
+  __r = _mm_cvtpi32_ps(__r, __c);
+
+  return __r;
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtpu16_ps(__m64 __a)
+{
+  __m64 __b, __c;
+  __m128 __r;
+
+  __b = _mm_setzero_si64();
+  __c = _mm_unpackhi_pi16(__a, __b);
+  __r = _mm_setzero_ps();
+  __r = _mm_cvtpi32_ps(__r, __c);
+  __r = _mm_movelh_ps(__r, __r);
+  __c = _mm_unpacklo_pi16(__a, __b);
+  __r = _mm_cvtpi32_ps(__r, __c);
+
+  return __r;
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtpi8_ps(__m64 __a)
+{
+  __m64 __b;
+  
+  __b = _mm_setzero_si64();
+  __b = _mm_cmpgt_pi8(__b, __a);
+  __b = _mm_unpacklo_pi8(__a, __b);
+
+  return _mm_cvtpi16_ps(__b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtpu8_ps(__m64 __a)
+{
+  __m64 __b;
+  
+  __b = _mm_setzero_si64();
+  __b = _mm_unpacklo_pi8(__a, __b);
+
+  return _mm_cvtpi16_ps(__b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
+{
+  __m128 __c;
+  
+  __c = _mm_setzero_ps();
+  __c = _mm_cvtpi32_ps(__c, __b);
+  __c = _mm_movelh_ps(__c, __c);
+
+  return _mm_cvtpi32_ps(__c, __a);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtps_pi16(__m128 __a)
+{
+  __m64 __b, __c;
+  
+  __b = _mm_cvtps_pi32(__a);
+  __a = _mm_movehl_ps(__a, __a);
+  __c = _mm_cvtps_pi32(__a);
+  
+  return _mm_packs_pi32(__b, __c);
+}
+
+static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtps_pi8(__m128 __a)
+{
+  __m64 __b, __c;
+  
+  __b = _mm_cvtps_pi16(__a);
+  __c = _mm_setzero_si64();
+  
+  return _mm_packs_pi16(__b, __c);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_mm_movemask_ps(__m128 __a)
+{
+  return __builtin_ia32_movmskps(__a);
+}
+
+#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
+
+#define _MM_EXCEPT_INVALID    (0x0001)
+#define _MM_EXCEPT_DENORM     (0x0002)
+#define _MM_EXCEPT_DIV_ZERO   (0x0004)
+#define _MM_EXCEPT_OVERFLOW   (0x0008)
+#define _MM_EXCEPT_UNDERFLOW  (0x0010)
+#define _MM_EXCEPT_INEXACT    (0x0020)
+#define _MM_EXCEPT_MASK       (0x003f)
+
+#define _MM_MASK_INVALID      (0x0080)
+#define _MM_MASK_DENORM       (0x0100)
+#define _MM_MASK_DIV_ZERO     (0x0200)
+#define _MM_MASK_OVERFLOW     (0x0400)
+#define _MM_MASK_UNDERFLOW    (0x0800)
+#define _MM_MASK_INEXACT      (0x1000)
+#define _MM_MASK_MASK         (0x1f80)
+
+#define _MM_ROUND_NEAREST     (0x0000)
+#define _MM_ROUND_DOWN        (0x2000)
+#define _MM_ROUND_UP          (0x4000)
+#define _MM_ROUND_TOWARD_ZERO (0x6000)
+#define _MM_ROUND_MASK        (0x6000)
+
+#define _MM_FLUSH_ZERO_MASK   (0x8000)
+#define _MM_FLUSH_ZERO_ON     (0x8000)
+#define _MM_FLUSH_ZERO_OFF    (0x0000)
+
+#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK)
+#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
+#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
+#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK)
+
+#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))
+#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))
+#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x)))
+#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))
+
+#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
+do { \
+  __m128 tmp3, tmp2, tmp1, tmp0; \
+  tmp0 = _mm_unpacklo_ps((row0), (row1)); \
+  tmp2 = _mm_unpacklo_ps((row2), (row3)); \
+  tmp1 = _mm_unpackhi_ps((row0), (row1)); \
+  tmp3 = _mm_unpackhi_ps((row2), (row3)); \
+  (row0) = _mm_movelh_ps(tmp0, tmp2); \
+  (row1) = _mm_movehl_ps(tmp2, tmp0); \
+  (row2) = _mm_movelh_ps(tmp1, tmp3); \
+  (row3) = _mm_movehl_ps(tmp3, tmp1); \
+} while (0)
+
+/* Aliases for compatibility. */
+#define _m_pextrw _mm_extract_pi16
+#define _m_pinsrw _mm_insert_pi16
+#define _m_pmaxsw _mm_max_pi16
+#define _m_pmaxub _mm_max_pu8
+#define _m_pminsw _mm_min_pi16
+#define _m_pminub _mm_min_pu8
+#define _m_pmovmskb _mm_movemask_pi8
+#define _m_pmulhuw _mm_mulhi_pu16
+#define _m_pshufw _mm_shuffle_pi16
+#define _m_maskmovq _mm_maskmove_si64
+#define _m_pavgb _mm_avg_pu8
+#define _m_pavgw _mm_avg_pu16
+#define _m_psadbw _mm_sad_pu8
+#define _m_ _mm_
+#define _m_ _mm_
+
+/* Ugly hack for backwards-compatibility (compatible with gcc) */
+#ifdef __SSE2__
+#include <emmintrin.h>
+#endif
+
+#endif /* __SSE__ */
+
+#endif /* __XMMINTRIN_H */
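
The SSE wrappers above are thin aliases for vector operators and builtins; note that _mm_set_ps takes its arguments high-to-low while _mm_setr_ps takes them low-to-high. A short sketch using only intrinsics defined in this header (illustrative; file name and the -msse flag are assumptions):

    /* sse_demo.c -- hypothetical sketch; build with: cc -msse sse_demo.c */
    #include <xmmintrin.h>
    #include <stdio.h>

    int main(void)
    {
        __m128 a = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);  /* lanes in memory order */
        __m128 b = _mm_set1_ps(0.5f);                    /* broadcast 0.5 */
        __m128 sum = _mm_add_ps(a, b);                   /* element-wise add */

        float out[4];
        _mm_storeu_ps(out, sum);
        printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);
        return 0;
    }
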
diff --git a/23.0.3/clang-include/xopintrin.h b/23.0.3/clang-include/xopintrin.h
new file mode 100644
index 0000000..cc94ca0
--- /dev/null
+++ b/23.0.3/clang-include/xopintrin.h
@@ -0,0 +1,804 @@
+/*===---- xopintrin.h - XOP intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __X86INTRIN_H
+#error "Never use <xopintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __XOPINTRIN_H
+#define __XOPINTRIN_H
+
+#ifndef __XOP__
+# error "XOP instruction set is not enabled"
+#else
+
+#include <fma4intrin.h>
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_maccs_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacssww((__v8hi)__A, (__v8hi)__B, (__v8hi)__C);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_macc_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacsww((__v8hi)__A, (__v8hi)__B, (__v8hi)__C);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_maccsd_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacsswd((__v8hi)__A, (__v8hi)__B, (__v4si)__C);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_maccd_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacswd((__v8hi)__A, (__v8hi)__B, (__v4si)__C);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_maccs_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacssdd((__v4si)__A, (__v4si)__B, (__v4si)__C);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_macc_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacsdd((__v4si)__A, (__v4si)__B, (__v4si)__C);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_maccslo_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacssdql((__v4si)__A, (__v4si)__B, (__v2di)__C);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_macclo_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacsdql((__v4si)__A, (__v4si)__B, (__v2di)__C);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_maccshi_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacssdqh((__v4si)__A, (__v4si)__B, (__v2di)__C);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_macchi_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacsdqh((__v4si)__A, (__v4si)__B, (__v2di)__C);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_maddsd_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmadcsswd((__v8hi)__A, (__v8hi)__B, (__v4si)__C);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_maddd_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmadcswd((__v8hi)__A, (__v8hi)__B, (__v4si)__C);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_haddw_epi8(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphaddbw((__v16qi)__A);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_haddd_epi8(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphaddbd((__v16qi)__A);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_haddq_epi8(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphaddbq((__v16qi)__A);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_haddd_epi16(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphaddwd((__v8hi)__A);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_haddq_epi16(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphaddwq((__v8hi)__A);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_haddq_epi32(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphadddq((__v4si)__A);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_haddw_epu8(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphaddubw((__v16qi)__A);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_haddd_epu8(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphaddubd((__v16qi)__A);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_haddq_epu8(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphaddubq((__v16qi)__A);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_haddd_epu16(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphadduwd((__v8hi)__A);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_haddq_epu16(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphadduwq((__v8hi)__A);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_haddq_epu32(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphaddudq((__v4si)__A);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_hsubw_epi8(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphsubbw((__v16qi)__A);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_hsubd_epi16(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphsubwd((__v8hi)__A);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_hsubq_epi32(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphsubdq((__v4si)__A);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cmov_si128(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpcmov(__A, __B, __C);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_cmov_si256(__m256i __A, __m256i __B, __m256i __C)
+{
+  return (__m256i)__builtin_ia32_vpcmov_256(__A, __B, __C);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_perm_epi8(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpperm((__v16qi)__A, (__v16qi)__B, (__v16qi)__C);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_rot_epi8(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vprotb((__v16qi)__A, (__v16qi)__B);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_rot_epi16(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vprotw((__v8hi)__A, (__v8hi)__B);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_rot_epi32(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vprotd((__v4si)__A, (__v4si)__B);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_rot_epi64(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vprotq((__v2di)__A, (__v2di)__B);
+}
+
+#define _mm_roti_epi8(A, N) __extension__ ({ \
+  __m128i __A = (A); \
+  (__m128i)__builtin_ia32_vprotbi((__v16qi)__A, (N)); })
+
+#define _mm_roti_epi16(A, N) __extension__ ({ \
+  __m128i __A = (A); \
+  (__m128i)__builtin_ia32_vprotwi((__v8hi)__A, (N)); })
+
+#define _mm_roti_epi32(A, N) __extension__ ({ \
+  __m128i __A = (A); \
+  (__m128i)__builtin_ia32_vprotdi((__v4si)__A, (N)); })
+
+#define _mm_roti_epi64(A, N) __extension__ ({ \
+  __m128i __A = (A); \
+  (__m128i)__builtin_ia32_vprotqi((__v2di)__A, (N)); })
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_shl_epi8(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpshlb((__v16qi)__A, (__v16qi)__B);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_shl_epi16(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpshlw((__v8hi)__A, (__v8hi)__B);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_shl_epi32(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpshld((__v4si)__A, (__v4si)__B);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_shl_epi64(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpshlq((__v2di)__A, (__v2di)__B);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sha_epi8(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpshab((__v16qi)__A, (__v16qi)__B);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sha_epi16(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpshaw((__v8hi)__A, (__v8hi)__B);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sha_epi32(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpshad((__v4si)__A, (__v4si)__B);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sha_epi64(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpshaq((__v2di)__A, (__v2di)__B);
+}
+
+#define _mm_com_epu8(A, B, N) __extension__ ({ \
+  __m128i __A = (A); \
+  __m128i __B = (B); \
+  (__m128i)__builtin_ia32_vpcomub((__v16qi)__A, (__v16qi)__B, (N)); })
+
+#define _mm_com_epu16(A, B, N) __extension__ ({ \
+  __m128i __A = (A); \
+  __m128i __B = (B); \
+  (__m128i)__builtin_ia32_vpcomuw((__v8hi)__A, (__v8hi)__B, (N)); })
+
+#define _mm_com_epu32(A, B, N) __extension__ ({ \
+  __m128i __A = (A); \
+  __m128i __B = (B); \
+  (__m128i)__builtin_ia32_vpcomud((__v4si)__A, (__v4si)__B, (N)); })
+
+#define _mm_com_epu64(A, B, N) __extension__ ({ \
+  __m128i __A = (A); \
+  __m128i __B = (B); \
+  (__m128i)__builtin_ia32_vpcomuq((__v2di)__A, (__v2di)__B, (N)); })
+
+#define _mm_com_epi8(A, B, N) __extension__ ({ \
+  __m128i __A = (A); \
+  __m128i __B = (B); \
+  (__m128i)__builtin_ia32_vpcomb((__v16qi)__A, (__v16qi)__B, (N)); })
+
+#define _mm_com_epi16(A, B, N) __extension__ ({ \
+  __m128i __A = (A); \
+  __m128i __B = (B); \
+  (__m128i)__builtin_ia32_vpcomw((__v8hi)__A, (__v8hi)__B, (N)); })
+
+#define _mm_com_epi32(A, B, N) __extension__ ({ \
+  __m128i __A = (A); \
+  __m128i __B = (B); \
+  (__m128i)__builtin_ia32_vpcomd((__v4si)__A, (__v4si)__B, (N)); })
+
+#define _mm_com_epi64(A, B, N) __extension__ ({ \
+  __m128i __A = (A); \
+  __m128i __B = (B); \
+  (__m128i)__builtin_ia32_vpcomq((__v2di)__A, (__v2di)__B, (N)); })
+
+#define _MM_PCOMCTRL_LT    0
+#define _MM_PCOMCTRL_LE    1
+#define _MM_PCOMCTRL_GT    2
+#define _MM_PCOMCTRL_GE    3
+#define _MM_PCOMCTRL_EQ    4
+#define _MM_PCOMCTRL_NEQ   5
+#define _MM_PCOMCTRL_FALSE 6
+#define _MM_PCOMCTRL_TRUE  7
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comlt_epu8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_LT);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comle_epu8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_LE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comgt_epu8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_GT);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comge_epu8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_GE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comeq_epu8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_EQ);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comneq_epu8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_NEQ);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comfalse_epu8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_FALSE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comtrue_epu8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_TRUE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comlt_epu16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_LT);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comle_epu16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_LE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comgt_epu16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_GT);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comge_epu16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_GE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comeq_epu16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_EQ);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comneq_epu16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_NEQ);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comfalse_epu16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_FALSE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comtrue_epu16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_TRUE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comlt_epu32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_LT);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comle_epu32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_LE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comgt_epu32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_GT);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comge_epu32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_GE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comeq_epu32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_EQ);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comneq_epu32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_NEQ);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comfalse_epu32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_FALSE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comtrue_epu32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_TRUE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comlt_epu64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_LT);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comle_epu64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_LE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comgt_epu64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_GT);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comge_epu64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_GE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comeq_epu64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_EQ);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comneq_epu64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_NEQ);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comfalse_epu64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_FALSE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comtrue_epu64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_TRUE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comlt_epi8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_LT);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comle_epi8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_LE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comgt_epi8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_GT);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comge_epi8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_GE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comeq_epi8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_EQ);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comneq_epi8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_NEQ);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comfalse_epi8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_FALSE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comtrue_epi8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_TRUE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comlt_epi16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_LT);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comle_epi16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_LE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comgt_epi16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_GT);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comge_epi16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_GE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comeq_epi16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_EQ);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comneq_epi16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_NEQ);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comfalse_epi16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_FALSE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comtrue_epi16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_TRUE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comlt_epi32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_LT);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comle_epi32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_LE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comgt_epi32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_GT);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comge_epi32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_GE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comeq_epi32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_EQ);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comneq_epi32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_NEQ);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comfalse_epi32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_FALSE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comtrue_epi32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_TRUE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comlt_epi64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_LT);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comle_epi64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_LE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comgt_epi64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_GT);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comge_epi64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_GE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comeq_epi64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_EQ);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comneq_epi64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_NEQ);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comfalse_epi64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_FALSE);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_comtrue_epi64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_TRUE);
+}
+
+#define _mm_permute2_pd(X, Y, C, I) __extension__ ({ \
+  __m128d __X = (X); \
+  __m128d __Y = (Y); \
+  __m128i __C = (C); \
+  (__m128d)__builtin_ia32_vpermil2pd((__v2df)__X, (__v2df)__Y, \
+                                     (__v2di)__C, (I)); })
+
+#define _mm256_permute2_pd(X, Y, C, I) __extension__ ({ \
+  __m256d __X = (X); \
+  __m256d __Y = (Y); \
+  __m256i __C = (C); \
+  (__m256d)__builtin_ia32_vpermil2pd256((__v4df)__X, (__v4df)__Y, \
+                                        (__v4di)__C, (I)); })
+
+#define _mm_permute2_ps(X, Y, C, I) __extension__ ({ \
+  __m128 __X = (X); \
+  __m128 __Y = (Y); \
+  __m128i __C = (C); \
+  (__m128)__builtin_ia32_vpermil2ps((__v4sf)__X, (__v4sf)__Y, \
+                                    (__v4si)__C, (I)); })
+
+#define _mm256_permute2_ps(X, Y, C, I) __extension__ ({ \
+  __m256 __X = (X); \
+  __m256 __Y = (Y); \
+  __m256i __C = (C); \
+  (__m256)__builtin_ia32_vpermil2ps256((__v8sf)__X, (__v8sf)__Y, \
+                                       (__v8si)__C, (I)); })
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_frcz_ss(__m128 __A)
+{
+  return (__m128)__builtin_ia32_vfrczss((__v4sf)__A);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_frcz_sd(__m128d __A)
+{
+  return (__m128d)__builtin_ia32_vfrczsd((__v2df)__A);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_frcz_ps(__m128 __A)
+{
+  return (__m128)__builtin_ia32_vfrczps((__v4sf)__A);
+}
+
+static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_frcz_pd(__m128d __A)
+{
+  return (__m128d)__builtin_ia32_vfrczpd((__v2df)__A);
+}
+
+static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))
+_mm256_frcz_ps(__m256 __A)
+{
+  return (__m256)__builtin_ia32_vfrczps256((__v8sf)__A);
+}
+
+static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))
+_mm256_frcz_pd(__m256d __A)
+{
+  return (__m256d)__builtin_ia32_vfrczpd256((__v4df)__A);
+}
+
+#endif /* __XOP__ */
+
+#endif /* __XOPINTRIN_H */
diff --git a/23.0.3/include/rs_allocation_data.rsh b/23.0.3/include/rs_allocation_data.rsh
new file mode 100644
index 0000000..69e9baf
--- /dev/null
+++ b/23.0.3/include/rs_allocation_data.rsh
@@ -0,0 +1,3357 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Don't edit this file!  It is auto-generated by frameworks/rs/api/generate.sh.
+
+/*
+ * rs_allocation_data.rsh: Allocation Data Access Functions
+ *
+ * The functions below can be used to get and set the cells that comprise
+ * an allocation.
+ *
+ * - Individual cells are accessed using the rsGetElementAt* and
+ *   rsSetElementAt* functions.
+ * - Multiple cells can be copied using the rsAllocationCopy* and
+ *   rsAllocationV* functions.
+ * - For getting values through a sampler, use rsSample.
+ *
+ * The rsGetElementAt and rsSetElement* functions are somewhat misnamed.
+ * They don't get or set elements, which are akin to data types; they get
+ * or set cells.  Think of them as rsGetCellAt and rsSetCellAt.
+ */
+
+#ifndef RENDERSCRIPT_RS_ALLOCATION_DATA_RSH
+#define RENDERSCRIPT_RS_ALLOCATION_DATA_RSH
+
+/*
+ * rsAllocationCopy1DRange: Copy consecutive cells between allocations
+ *
+ * Copies the specified number of cells from one allocation to another.
+ *
+ * The two allocations must be different.  Using this function to copy within
+ * the same allocation yields undefined results.
+ *
+ * The function does not validate whether the offset plus count exceeds the size
+ * of either allocation.  Be careful!
+ *
+ * This function should only be called between 1D allocations.  Calling it
+ * on other allocations is undefined.
+ *
+ * Parameters:
+ *   dstAlloc: Allocation to copy cells into.
+ *   dstOff: Offset in the destination of the first cell to be copied into.
+ *   dstMip: Mip level in the destination allocation.  0 if mip mapping is not used.
+ *   count: Number of cells to be copied.
+ *   srcAlloc: Source allocation.
+ *   srcOff: Offset in the source of the first cell to be copied.
+ *   srcMip: Mip level in the source allocation.  0 if mip mapping is not used.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+extern void __attribute__((overloadable))
+    rsAllocationCopy1DRange(rs_allocation dstAlloc, uint32_t dstOff, uint32_t dstMip, uint32_t count,
+                            rs_allocation srcAlloc, uint32_t srcOff, uint32_t srcMip);
+#endif
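+
+/*
+ * A minimal usage sketch (illustrative only, not part of the generated API).
+ * It assumes two 1D int allocations, "gSrc" and "gDst", each holding at
+ * least 16 cells and bound to the script from the host side:
+ *
+ *   rs_allocation gSrc;
+ *   rs_allocation gDst;
+ *
+ *   void copyFirst16Cells() {
+ *       // Copy 16 cells of gSrc, starting at cell 0, into gDst at offset 0.
+ *       // Both sides use mip level 0 (no mip mapping).
+ *       rsAllocationCopy1DRange(gDst, 0, 0, 16, gSrc, 0, 0);
+ *   }
+ */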
+
+/*
+ * rsAllocationCopy2DRange: Copy a rectangular region of cells between allocations
+ *
+ * Copies a rectangular region of cells from one allocation to another.
+ * (width * height) cells are copied.
+ *
+ * The two allocations must be different.  Using this function to copy within
+ * the same allocation yields undefined results.
+ *
+ * The function does not validate whether the source or destination region
+ * exceeds the size of its respective allocation.  Be careful!
+ *
+ * This function should only be called between 2D allocations.  Calling it
+ * on other allocations is undefined.
+ *
+ * Parameters:
+ *   dstAlloc: Allocation to copy cells into.
+ *   dstXoff: X offset in the destination of the region to be set.
+ *   dstYoff: Y offset in the destination of the region to be set.
+ *   dstMip: Mip level in the destination allocation.  0 if mip mapping is not used.
+ *   dstFace: Cubemap face of the destination allocation.  Ignored for allocations that aren't cubemaps.
+ *   width: Width of the incoming region to update.
+ *   height: Height of the incoming region to update.
+ *   srcAlloc: Source allocation.
+ *   srcXoff: X offset in the source.
+ *   srcYoff: Y offset in the source.
+ *   srcMip: Mip level in the source allocation.  0 if mip mapping is not used.
+ *   srcFace: Cubemap face of the source allocation.  Ignored for allocations that aren't cubemaps.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+extern void __attribute__((overloadable))
+    rsAllocationCopy2DRange(rs_allocation dstAlloc, uint32_t dstXoff, uint32_t dstYoff,
+                            uint32_t dstMip, rs_allocation_cubemap_face dstFace, uint32_t width,
+                            uint32_t height, rs_allocation srcAlloc, uint32_t srcXoff,
+                            uint32_t srcYoff, uint32_t srcMip, rs_allocation_cubemap_face srcFace);
+#endif
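+
+/*
+ * A minimal usage sketch (illustrative only).  It assumes two 2D uchar4
+ * allocations, "gSrcImage" and "gDstImage", each at least 64x64 cells:
+ *
+ *   rs_allocation gSrcImage;
+ *   rs_allocation gDstImage;
+ *
+ *   void copyTile() {
+ *       // Copy a 32x32 region whose top-left corner is at (16, 16) in
+ *       // gSrcImage into gDstImage at (0, 0).  Mip level 0 on both sides;
+ *       // the face arguments are ignored for non-cubemap allocations.
+ *       rsAllocationCopy2DRange(gDstImage, 0, 0, 0,
+ *                               RS_ALLOCATION_CUBEMAP_FACE_POSITIVE_X,
+ *                               32, 32, gSrcImage, 16, 16, 0,
+ *                               RS_ALLOCATION_CUBEMAP_FACE_POSITIVE_X);
+ *   }
+ */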
+
+/*
+ * rsAllocationVLoadX: Get a vector from an allocation of scalars
+ *
+ * This function returns a vector composed of successive cells of the allocation.
+ * It assumes that the allocation contains scalars.
+ *
+ * The "X" in the name indicates that successive values are extracted by
+ * increasing the X index.  There are currently no functions to get successive
+ * values incrementing other dimensions.  Use multiple calls to rsGetElementAt()
+ * instead.
+ *
+ * For example, when calling rsAllocationVLoadX_int4(a, 20, 30), an int4 composed
+ * of a[20, 30], a[21, 30], a[22, 30], and a[23, 30] is returned.
+ *
+ * When retrieving from a three dimensional allocation, use the x, y, z variant.
+ * Similarly, use the x, y variant for two dimensional allocations and x for the
+ * mono dimensional allocations.
+ *
+ * For efficiency, this function does not validate the inputs.  Trying to wrap
+ * the X index, exceeding the size of the allocation, or using indices incompatible
+ * with the dimensionality of the allocation yields undefined results.
+ *
+ * See also rsAllocationVStoreX().
+ *
+ * Parameters:
+ *   a: Allocation to get the data from.
+ *   x: X offset in the allocation of the first cell to be copied from.
+ *   y: Y offset in the allocation of the first cell to be copied from.
+ *   z: Z offset in the allocation of the first cell to be copied from.
+ */
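+
+/*
+ * Illustrative sketch only: the int4 example above is roughly equivalent to
+ * assembling the vector from four scalar reads, assuming a 2D int allocation
+ * "a" that is at least 24 cells wide and 31 cells tall:
+ *
+ *   int4 loadFour(rs_allocation a) {
+ *       int4 v;
+ *       v.x = rsGetElementAt_int(a, 20, 30);
+ *       v.y = rsGetElementAt_int(a, 21, 30);
+ *       v.z = rsGetElementAt_int(a, 22, 30);
+ *       v.w = rsGetElementAt_int(a, 23, 30);
+ *       return v;  // Same result as rsAllocationVLoadX_int4(a, 20, 30).
+ *   }
+ */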
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern float2 __attribute__((overloadable))
+    rsAllocationVLoadX_float2(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern float3 __attribute__((overloadable))
+    rsAllocationVLoadX_float3(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern float4 __attribute__((overloadable))
+    rsAllocationVLoadX_float4(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern double2 __attribute__((overloadable))
+    rsAllocationVLoadX_double2(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern double3 __attribute__((overloadable))
+    rsAllocationVLoadX_double3(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern double4 __attribute__((overloadable))
+    rsAllocationVLoadX_double4(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern char2 __attribute__((overloadable))
+    rsAllocationVLoadX_char2(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern char3 __attribute__((overloadable))
+    rsAllocationVLoadX_char3(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern char4 __attribute__((overloadable))
+    rsAllocationVLoadX_char4(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern uchar2 __attribute__((overloadable))
+    rsAllocationVLoadX_uchar2(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern uchar3 __attribute__((overloadable))
+    rsAllocationVLoadX_uchar3(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern uchar4 __attribute__((overloadable))
+    rsAllocationVLoadX_uchar4(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern short2 __attribute__((overloadable))
+    rsAllocationVLoadX_short2(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern short3 __attribute__((overloadable))
+    rsAllocationVLoadX_short3(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern short4 __attribute__((overloadable))
+    rsAllocationVLoadX_short4(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern ushort2 __attribute__((overloadable))
+    rsAllocationVLoadX_ushort2(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern ushort3 __attribute__((overloadable))
+    rsAllocationVLoadX_ushort3(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern ushort4 __attribute__((overloadable))
+    rsAllocationVLoadX_ushort4(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern int2 __attribute__((overloadable))
+    rsAllocationVLoadX_int2(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern int3 __attribute__((overloadable))
+    rsAllocationVLoadX_int3(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern int4 __attribute__((overloadable))
+    rsAllocationVLoadX_int4(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern uint2 __attribute__((overloadable))
+    rsAllocationVLoadX_uint2(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern uint3 __attribute__((overloadable))
+    rsAllocationVLoadX_uint3(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern uint4 __attribute__((overloadable))
+    rsAllocationVLoadX_uint4(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern long2 __attribute__((overloadable))
+    rsAllocationVLoadX_long2(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern long3 __attribute__((overloadable))
+    rsAllocationVLoadX_long3(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern long4 __attribute__((overloadable))
+    rsAllocationVLoadX_long4(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern ulong2 __attribute__((overloadable))
+    rsAllocationVLoadX_ulong2(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern ulong3 __attribute__((overloadable))
+    rsAllocationVLoadX_ulong3(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern ulong4 __attribute__((overloadable))
+    rsAllocationVLoadX_ulong4(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern float2 __attribute__((overloadable))
+    rsAllocationVLoadX_float2(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern float3 __attribute__((overloadable))
+    rsAllocationVLoadX_float3(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern float4 __attribute__((overloadable))
+    rsAllocationVLoadX_float4(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern double2 __attribute__((overloadable))
+    rsAllocationVLoadX_double2(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern double3 __attribute__((overloadable))
+    rsAllocationVLoadX_double3(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern double4 __attribute__((overloadable))
+    rsAllocationVLoadX_double4(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern char2 __attribute__((overloadable))
+    rsAllocationVLoadX_char2(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern char3 __attribute__((overloadable))
+    rsAllocationVLoadX_char3(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern char4 __attribute__((overloadable))
+    rsAllocationVLoadX_char4(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern uchar2 __attribute__((overloadable))
+    rsAllocationVLoadX_uchar2(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern uchar3 __attribute__((overloadable))
+    rsAllocationVLoadX_uchar3(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern uchar4 __attribute__((overloadable))
+    rsAllocationVLoadX_uchar4(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern short2 __attribute__((overloadable))
+    rsAllocationVLoadX_short2(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern short3 __attribute__((overloadable))
+    rsAllocationVLoadX_short3(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern short4 __attribute__((overloadable))
+    rsAllocationVLoadX_short4(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern ushort2 __attribute__((overloadable))
+    rsAllocationVLoadX_ushort2(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern ushort3 __attribute__((overloadable))
+    rsAllocationVLoadX_ushort3(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern ushort4 __attribute__((overloadable))
+    rsAllocationVLoadX_ushort4(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern int2 __attribute__((overloadable))
+    rsAllocationVLoadX_int2(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern int3 __attribute__((overloadable))
+    rsAllocationVLoadX_int3(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern int4 __attribute__((overloadable))
+    rsAllocationVLoadX_int4(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern uint2 __attribute__((overloadable))
+    rsAllocationVLoadX_uint2(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern uint3 __attribute__((overloadable))
+    rsAllocationVLoadX_uint3(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern uint4 __attribute__((overloadable))
+    rsAllocationVLoadX_uint4(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern long2 __attribute__((overloadable))
+    rsAllocationVLoadX_long2(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern long3 __attribute__((overloadable))
+    rsAllocationVLoadX_long3(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern long4 __attribute__((overloadable))
+    rsAllocationVLoadX_long4(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern ulong2 __attribute__((overloadable))
+    rsAllocationVLoadX_ulong2(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern ulong3 __attribute__((overloadable))
+    rsAllocationVLoadX_ulong3(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern ulong4 __attribute__((overloadable))
+    rsAllocationVLoadX_ulong4(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern float2 __attribute__((overloadable))
+    rsAllocationVLoadX_float2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern float3 __attribute__((overloadable))
+    rsAllocationVLoadX_float3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern float4 __attribute__((overloadable))
+    rsAllocationVLoadX_float4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern double2 __attribute__((overloadable))
+    rsAllocationVLoadX_double2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern double3 __attribute__((overloadable))
+    rsAllocationVLoadX_double3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern double4 __attribute__((overloadable))
+    rsAllocationVLoadX_double4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern char2 __attribute__((overloadable))
+    rsAllocationVLoadX_char2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern char3 __attribute__((overloadable))
+    rsAllocationVLoadX_char3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern char4 __attribute__((overloadable))
+    rsAllocationVLoadX_char4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern uchar2 __attribute__((overloadable))
+    rsAllocationVLoadX_uchar2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern uchar3 __attribute__((overloadable))
+    rsAllocationVLoadX_uchar3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern uchar4 __attribute__((overloadable))
+    rsAllocationVLoadX_uchar4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern short2 __attribute__((overloadable))
+    rsAllocationVLoadX_short2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern short3 __attribute__((overloadable))
+    rsAllocationVLoadX_short3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern short4 __attribute__((overloadable))
+    rsAllocationVLoadX_short4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern ushort2 __attribute__((overloadable))
+    rsAllocationVLoadX_ushort2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern ushort3 __attribute__((overloadable))
+    rsAllocationVLoadX_ushort3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern ushort4 __attribute__((overloadable))
+    rsAllocationVLoadX_ushort4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern int2 __attribute__((overloadable))
+    rsAllocationVLoadX_int2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern int3 __attribute__((overloadable))
+    rsAllocationVLoadX_int3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern int4 __attribute__((overloadable))
+    rsAllocationVLoadX_int4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern uint2 __attribute__((overloadable))
+    rsAllocationVLoadX_uint2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern uint3 __attribute__((overloadable))
+    rsAllocationVLoadX_uint3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern uint4 __attribute__((overloadable))
+    rsAllocationVLoadX_uint4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern long2 __attribute__((overloadable))
+    rsAllocationVLoadX_long2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern long3 __attribute__((overloadable))
+    rsAllocationVLoadX_long3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern long4 __attribute__((overloadable))
+    rsAllocationVLoadX_long4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern ulong2 __attribute__((overloadable))
+    rsAllocationVLoadX_ulong2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern ulong3 __attribute__((overloadable))
+    rsAllocationVLoadX_ulong3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern ulong4 __attribute__((overloadable))
+    rsAllocationVLoadX_ulong4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+/*
+ * rsAllocationVStoreX: Store a vector into an allocation of scalars
+ *
+ * This function stores the entries of a vector into successive cells of an allocation.
+ * It assumes that the allocation contains scalars.
+ *
+ * The "X" in the name indicates that successive values are stored by increasing
+ * the X index.  There are currently no functions to store successive values
+ * incrementing other dimensions.  Use multiple calls to rsSetElementAt() instead.
+ *
+ * For example, when calling rsAllocationVStoreX_int3(a, v, 20, 30), v.x is stored
+ * at a[20, 30], v.y at a[21, 30], and v.z at a[22, 30].
+ *
+ * When storing into a three dimensional allocation, use the x, y, z variant.
+ * Similarly, use the x, y variant for two dimensional allocations and x for the
+ * mono dimensional allocations.
+ *
+ * For efficiency, this function does not validate the inputs.  Trying to wrap the
+ * X index, exceeding the size of the allocation, or using indexes incompatible
+ * with the dimensionality of the allocation yields undefined results.
+ *
+ * See also rsAllocationVLoadX().
+ *
+ * Parameters:
+ *   a: Allocation to store the data into.
+ *   val: Value to be stored.
+ *   x: X offset in the allocation of the first cell to be copied into.
+ *   y: Y offset in the allocation of the first cell to be copied into.
+ *   z: Z offset in the allocation of the first cell to be copied into.
+ */
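+
+/*
+ * Illustrative sketch only: the int3 example above, written out with scalar
+ * stores, assuming a 2D int allocation "a" that is large enough:
+ *
+ *   void storeThree(rs_allocation a, int3 v) {
+ *       rsSetElementAt_int(a, v.x, 20, 30);
+ *       rsSetElementAt_int(a, v.y, 21, 30);
+ *       rsSetElementAt_int(a, v.z, 22, 30);
+ *       // Same effect as rsAllocationVStoreX_int3(a, v, 20, 30).
+ *   }
+ */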
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_float2(rs_allocation a, float2 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_float3(rs_allocation a, float3 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_float4(rs_allocation a, float4 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_double2(rs_allocation a, double2 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_double3(rs_allocation a, double3 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_double4(rs_allocation a, double4 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_char2(rs_allocation a, char2 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_char3(rs_allocation a, char3 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_char4(rs_allocation a, char4 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_uchar2(rs_allocation a, uchar2 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_uchar3(rs_allocation a, uchar3 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_uchar4(rs_allocation a, uchar4 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_short2(rs_allocation a, short2 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_short3(rs_allocation a, short3 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_short4(rs_allocation a, short4 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_ushort2(rs_allocation a, ushort2 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_ushort3(rs_allocation a, ushort3 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_ushort4(rs_allocation a, ushort4 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_int2(rs_allocation a, int2 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_int3(rs_allocation a, int3 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_int4(rs_allocation a, int4 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_uint2(rs_allocation a, uint2 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_uint3(rs_allocation a, uint3 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_uint4(rs_allocation a, uint4 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_long2(rs_allocation a, long2 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_long3(rs_allocation a, long3 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_long4(rs_allocation a, long4 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_ulong2(rs_allocation a, ulong2 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_ulong3(rs_allocation a, ulong3 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_ulong4(rs_allocation a, ulong4 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_float2(rs_allocation a, float2 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_float3(rs_allocation a, float3 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_float4(rs_allocation a, float4 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_double2(rs_allocation a, double2 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_double3(rs_allocation a, double3 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_double4(rs_allocation a, double4 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_char2(rs_allocation a, char2 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_char3(rs_allocation a, char3 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_char4(rs_allocation a, char4 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_uchar2(rs_allocation a, uchar2 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_uchar3(rs_allocation a, uchar3 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_uchar4(rs_allocation a, uchar4 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_short2(rs_allocation a, short2 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_short3(rs_allocation a, short3 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_short4(rs_allocation a, short4 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_ushort2(rs_allocation a, ushort2 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_ushort3(rs_allocation a, ushort3 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_ushort4(rs_allocation a, ushort4 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_int2(rs_allocation a, int2 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_int3(rs_allocation a, int3 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_int4(rs_allocation a, int4 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_uint2(rs_allocation a, uint2 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_uint3(rs_allocation a, uint3 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_uint4(rs_allocation a, uint4 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_long2(rs_allocation a, long2 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_long3(rs_allocation a, long3 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_long4(rs_allocation a, long4 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_ulong2(rs_allocation a, ulong2 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_ulong3(rs_allocation a, ulong3 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_ulong4(rs_allocation a, ulong4 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_float2(rs_allocation a, float2 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_float3(rs_allocation a, float3 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_float4(rs_allocation a, float4 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_double2(rs_allocation a, double2 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_double3(rs_allocation a, double3 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_double4(rs_allocation a, double4 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_char2(rs_allocation a, char2 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_char3(rs_allocation a, char3 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_char4(rs_allocation a, char4 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_uchar2(rs_allocation a, uchar2 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_uchar3(rs_allocation a, uchar3 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_uchar4(rs_allocation a, uchar4 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_short2(rs_allocation a, short2 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_short3(rs_allocation a, short3 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_short4(rs_allocation a, short4 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_ushort2(rs_allocation a, ushort2 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_ushort3(rs_allocation a, ushort3 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_ushort4(rs_allocation a, ushort4 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_int2(rs_allocation a, int2 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_int3(rs_allocation a, int3 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_int4(rs_allocation a, int4 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_uint2(rs_allocation a, uint2 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_uint3(rs_allocation a, uint3 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_uint4(rs_allocation a, uint4 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_long2(rs_allocation a, long2 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_long3(rs_allocation a, long3 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_long4(rs_allocation a, long4 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_ulong2(rs_allocation a, ulong2 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_ulong3(rs_allocation a, ulong3 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_ulong4(rs_allocation a, ulong4 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+/*
+ * rsGetElementAt: Return a cell from an allocation
+ *
+ * This function extracts a single cell from an allocation.
+ *
+ * When retrieving from a three dimensional allocation, use the x, y, z variant.
+ * Similarly, use the x, y variant for two dimensional allocations and x for the
+ * mono dimensional allocations.
+ *
+ * This function has two styles.  One returns the address of the value using a void*,
+ * the other returns the actual value, e.g. rsGetElementAt() vs. rsGetElementAt_int4().
+ * For primitive types, always use the latter as it is more efficient.
+ */
+extern const void* __attribute__((overloadable))
+    rsGetElementAt(rs_allocation a, uint32_t x);
+
+extern const void* __attribute__((overloadable))
+    rsGetElementAt(rs_allocation a, uint32_t x, uint32_t y);
+
+extern const void* __attribute__((overloadable))
+    rsGetElementAt(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
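+
+/*
+ * Illustrative sketch only, showing the two styles for a 1D float
+ * allocation "a".  The typed form is preferred for primitive types:
+ *
+ *   float readCellViaPointer(rs_allocation a, uint32_t x) {
+ *       // Pointer style: cast and dereference the returned address.
+ *       return ((const float *)rsGetElementAt(a, x))[0];
+ *   }
+ *
+ *   float readCellTyped(rs_allocation a, uint32_t x) {
+ *       // Typed style: returns the value directly and is more efficient.
+ *       return rsGetElementAt_float(a, x);
+ *   }
+ */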
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline float __attribute__((overloadable))
+    rsGetElementAt_float(rs_allocation a, uint32_t x) {
+    return ((float *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline float2 __attribute__((overloadable))
+    rsGetElementAt_float2(rs_allocation a, uint32_t x) {
+    return ((float2 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline float3 __attribute__((overloadable))
+    rsGetElementAt_float3(rs_allocation a, uint32_t x) {
+    return ((float3 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline float4 __attribute__((overloadable))
+    rsGetElementAt_float4(rs_allocation a, uint32_t x) {
+    return ((float4 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline double __attribute__((overloadable))
+    rsGetElementAt_double(rs_allocation a, uint32_t x) {
+    return ((double *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline double2 __attribute__((overloadable))
+    rsGetElementAt_double2(rs_allocation a, uint32_t x) {
+    return ((double2 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline double3 __attribute__((overloadable))
+    rsGetElementAt_double3(rs_allocation a, uint32_t x) {
+    return ((double3 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline double4 __attribute__((overloadable))
+    rsGetElementAt_double4(rs_allocation a, uint32_t x) {
+    return ((double4 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline char __attribute__((overloadable))
+    rsGetElementAt_char(rs_allocation a, uint32_t x) {
+    return ((char *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline char2 __attribute__((overloadable))
+    rsGetElementAt_char2(rs_allocation a, uint32_t x) {
+    return ((char2 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline char3 __attribute__((overloadable))
+    rsGetElementAt_char3(rs_allocation a, uint32_t x) {
+    return ((char3 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline char4 __attribute__((overloadable))
+    rsGetElementAt_char4(rs_allocation a, uint32_t x) {
+    return ((char4 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uchar __attribute__((overloadable))
+    rsGetElementAt_uchar(rs_allocation a, uint32_t x) {
+    return ((uchar *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uchar2 __attribute__((overloadable))
+    rsGetElementAt_uchar2(rs_allocation a, uint32_t x) {
+    return ((uchar2 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uchar3 __attribute__((overloadable))
+    rsGetElementAt_uchar3(rs_allocation a, uint32_t x) {
+    return ((uchar3 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uchar4 __attribute__((overloadable))
+    rsGetElementAt_uchar4(rs_allocation a, uint32_t x) {
+    return ((uchar4 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline short __attribute__((overloadable))
+    rsGetElementAt_short(rs_allocation a, uint32_t x) {
+    return ((short *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline short2 __attribute__((overloadable))
+    rsGetElementAt_short2(rs_allocation a, uint32_t x) {
+    return ((short2 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline short3 __attribute__((overloadable))
+    rsGetElementAt_short3(rs_allocation a, uint32_t x) {
+    return ((short3 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline short4 __attribute__((overloadable))
+    rsGetElementAt_short4(rs_allocation a, uint32_t x) {
+    return ((short4 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ushort __attribute__((overloadable))
+    rsGetElementAt_ushort(rs_allocation a, uint32_t x) {
+    return ((ushort *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ushort2 __attribute__((overloadable))
+    rsGetElementAt_ushort2(rs_allocation a, uint32_t x) {
+    return ((ushort2 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ushort3 __attribute__((overloadable))
+    rsGetElementAt_ushort3(rs_allocation a, uint32_t x) {
+    return ((ushort3 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ushort4 __attribute__((overloadable))
+    rsGetElementAt_ushort4(rs_allocation a, uint32_t x) {
+    return ((ushort4 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline int __attribute__((overloadable))
+    rsGetElementAt_int(rs_allocation a, uint32_t x) {
+    return ((int *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline int2 __attribute__((overloadable))
+    rsGetElementAt_int2(rs_allocation a, uint32_t x) {
+    return ((int2 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline int3 __attribute__((overloadable))
+    rsGetElementAt_int3(rs_allocation a, uint32_t x) {
+    return ((int3 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline int4 __attribute__((overloadable))
+    rsGetElementAt_int4(rs_allocation a, uint32_t x) {
+    return ((int4 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uint __attribute__((overloadable))
+    rsGetElementAt_uint(rs_allocation a, uint32_t x) {
+    return ((uint *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uint2 __attribute__((overloadable))
+    rsGetElementAt_uint2(rs_allocation a, uint32_t x) {
+    return ((uint2 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uint3 __attribute__((overloadable))
+    rsGetElementAt_uint3(rs_allocation a, uint32_t x) {
+    return ((uint3 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uint4 __attribute__((overloadable))
+    rsGetElementAt_uint4(rs_allocation a, uint32_t x) {
+    return ((uint4 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline long __attribute__((overloadable))
+    rsGetElementAt_long(rs_allocation a, uint32_t x) {
+    return ((long *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline long2 __attribute__((overloadable))
+    rsGetElementAt_long2(rs_allocation a, uint32_t x) {
+    return ((long2 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline long3 __attribute__((overloadable))
+    rsGetElementAt_long3(rs_allocation a, uint32_t x) {
+    return ((long3 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline long4 __attribute__((overloadable))
+    rsGetElementAt_long4(rs_allocation a, uint32_t x) {
+    return ((long4 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ulong __attribute__((overloadable))
+    rsGetElementAt_ulong(rs_allocation a, uint32_t x) {
+    return ((ulong *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ulong2 __attribute__((overloadable))
+    rsGetElementAt_ulong2(rs_allocation a, uint32_t x) {
+    return ((ulong2 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ulong3 __attribute__((overloadable))
+    rsGetElementAt_ulong3(rs_allocation a, uint32_t x) {
+    return ((ulong3 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ulong4 __attribute__((overloadable))
+    rsGetElementAt_ulong4(rs_allocation a, uint32_t x) {
+    return ((ulong4 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline float __attribute__((overloadable))
+    rsGetElementAt_float(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((float *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline float2 __attribute__((overloadable))
+    rsGetElementAt_float2(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((float2 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline float3 __attribute__((overloadable))
+    rsGetElementAt_float3(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((float3 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline float4 __attribute__((overloadable))
+    rsGetElementAt_float4(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((float4 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline double __attribute__((overloadable))
+    rsGetElementAt_double(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((double *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline double2 __attribute__((overloadable))
+    rsGetElementAt_double2(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((double2 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline double3 __attribute__((overloadable))
+    rsGetElementAt_double3(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((double3 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline double4 __attribute__((overloadable))
+    rsGetElementAt_double4(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((double4 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline char __attribute__((overloadable))
+    rsGetElementAt_char(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((char *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline char2 __attribute__((overloadable))
+    rsGetElementAt_char2(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((char2 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline char3 __attribute__((overloadable))
+    rsGetElementAt_char3(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((char3 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline char4 __attribute__((overloadable))
+    rsGetElementAt_char4(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((char4 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uchar __attribute__((overloadable))
+    rsGetElementAt_uchar(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((uchar *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uchar2 __attribute__((overloadable))
+    rsGetElementAt_uchar2(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((uchar2 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uchar3 __attribute__((overloadable))
+    rsGetElementAt_uchar3(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((uchar3 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uchar4 __attribute__((overloadable))
+    rsGetElementAt_uchar4(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((uchar4 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline short __attribute__((overloadable))
+    rsGetElementAt_short(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((short *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline short2 __attribute__((overloadable))
+    rsGetElementAt_short2(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((short2 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline short3 __attribute__((overloadable))
+    rsGetElementAt_short3(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((short3 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline short4 __attribute__((overloadable))
+    rsGetElementAt_short4(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((short4 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ushort __attribute__((overloadable))
+    rsGetElementAt_ushort(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((ushort *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ushort2 __attribute__((overloadable))
+    rsGetElementAt_ushort2(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((ushort2 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ushort3 __attribute__((overloadable))
+    rsGetElementAt_ushort3(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((ushort3 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ushort4 __attribute__((overloadable))
+    rsGetElementAt_ushort4(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((ushort4 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline int __attribute__((overloadable))
+    rsGetElementAt_int(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((int *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline int2 __attribute__((overloadable))
+    rsGetElementAt_int2(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((int2 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline int3 __attribute__((overloadable))
+    rsGetElementAt_int3(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((int3 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline int4 __attribute__((overloadable))
+    rsGetElementAt_int4(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((int4 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uint __attribute__((overloadable))
+    rsGetElementAt_uint(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((uint *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uint2 __attribute__((overloadable))
+    rsGetElementAt_uint2(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((uint2 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uint3 __attribute__((overloadable))
+    rsGetElementAt_uint3(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((uint3 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uint4 __attribute__((overloadable))
+    rsGetElementAt_uint4(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((uint4 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline long __attribute__((overloadable))
+    rsGetElementAt_long(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((long *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline long2 __attribute__((overloadable))
+    rsGetElementAt_long2(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((long2 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline long3 __attribute__((overloadable))
+    rsGetElementAt_long3(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((long3 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline long4 __attribute__((overloadable))
+    rsGetElementAt_long4(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((long4 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ulong __attribute__((overloadable))
+    rsGetElementAt_ulong(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((ulong *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ulong2 __attribute__((overloadable))
+    rsGetElementAt_ulong2(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((ulong2 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ulong3 __attribute__((overloadable))
+    rsGetElementAt_ulong3(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((ulong3 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ulong4 __attribute__((overloadable))
+    rsGetElementAt_ulong4(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((ulong4 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline float __attribute__((overloadable))
+    rsGetElementAt_float(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((float *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline float2 __attribute__((overloadable))
+    rsGetElementAt_float2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((float2 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline float3 __attribute__((overloadable))
+    rsGetElementAt_float3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((float3 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline float4 __attribute__((overloadable))
+    rsGetElementAt_float4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((float4 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline double __attribute__((overloadable))
+    rsGetElementAt_double(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((double *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline double2 __attribute__((overloadable))
+    rsGetElementAt_double2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((double2 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline double3 __attribute__((overloadable))
+    rsGetElementAt_double3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((double3 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline double4 __attribute__((overloadable))
+    rsGetElementAt_double4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((double4 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline char __attribute__((overloadable))
+    rsGetElementAt_char(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((char *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline char2 __attribute__((overloadable))
+    rsGetElementAt_char2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((char2 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline char3 __attribute__((overloadable))
+    rsGetElementAt_char3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((char3 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline char4 __attribute__((overloadable))
+    rsGetElementAt_char4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((char4 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uchar __attribute__((overloadable))
+    rsGetElementAt_uchar(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((uchar *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uchar2 __attribute__((overloadable))
+    rsGetElementAt_uchar2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((uchar2 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uchar3 __attribute__((overloadable))
+    rsGetElementAt_uchar3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((uchar3 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uchar4 __attribute__((overloadable))
+    rsGetElementAt_uchar4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((uchar4 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline short __attribute__((overloadable))
+    rsGetElementAt_short(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((short *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline short2 __attribute__((overloadable))
+    rsGetElementAt_short2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((short2 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline short3 __attribute__((overloadable))
+    rsGetElementAt_short3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((short3 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline short4 __attribute__((overloadable))
+    rsGetElementAt_short4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((short4 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ushort __attribute__((overloadable))
+    rsGetElementAt_ushort(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((ushort *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ushort2 __attribute__((overloadable))
+    rsGetElementAt_ushort2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((ushort2 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ushort3 __attribute__((overloadable))
+    rsGetElementAt_ushort3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((ushort3 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ushort4 __attribute__((overloadable))
+    rsGetElementAt_ushort4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((ushort4 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline int __attribute__((overloadable))
+    rsGetElementAt_int(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((int *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline int2 __attribute__((overloadable))
+    rsGetElementAt_int2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((int2 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline int3 __attribute__((overloadable))
+    rsGetElementAt_int3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((int3 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline int4 __attribute__((overloadable))
+    rsGetElementAt_int4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((int4 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uint __attribute__((overloadable))
+    rsGetElementAt_uint(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((uint *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uint2 __attribute__((overloadable))
+    rsGetElementAt_uint2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((uint2 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uint3 __attribute__((overloadable))
+    rsGetElementAt_uint3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((uint3 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uint4 __attribute__((overloadable))
+    rsGetElementAt_uint4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((uint4 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline long __attribute__((overloadable))
+    rsGetElementAt_long(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((long *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline long2 __attribute__((overloadable))
+    rsGetElementAt_long2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((long2 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline long3 __attribute__((overloadable))
+    rsGetElementAt_long3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((long3 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline long4 __attribute__((overloadable))
+    rsGetElementAt_long4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((long4 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ulong __attribute__((overloadable))
+    rsGetElementAt_ulong(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((ulong *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ulong2 __attribute__((overloadable))
+    rsGetElementAt_ulong2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((ulong2 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ulong3 __attribute__((overloadable))
+    rsGetElementAt_ulong3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((ulong3 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ulong4 __attribute__((overloadable))
+    rsGetElementAt_ulong4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((ulong4 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float __attribute__((overloadable))
+    rsGetElementAt_float(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float2 __attribute__((overloadable))
+    rsGetElementAt_float2(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float3 __attribute__((overloadable))
+    rsGetElementAt_float3(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float4 __attribute__((overloadable))
+    rsGetElementAt_float4(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern double __attribute__((overloadable))
+    rsGetElementAt_double(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern double2 __attribute__((overloadable))
+    rsGetElementAt_double2(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern double3 __attribute__((overloadable))
+    rsGetElementAt_double3(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern double4 __attribute__((overloadable))
+    rsGetElementAt_double4(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern char __attribute__((overloadable))
+    rsGetElementAt_char(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern char2 __attribute__((overloadable))
+    rsGetElementAt_char2(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern char3 __attribute__((overloadable))
+    rsGetElementAt_char3(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern char4 __attribute__((overloadable))
+    rsGetElementAt_char4(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uchar __attribute__((overloadable))
+    rsGetElementAt_uchar(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uchar2 __attribute__((overloadable))
+    rsGetElementAt_uchar2(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uchar3 __attribute__((overloadable))
+    rsGetElementAt_uchar3(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uchar4 __attribute__((overloadable))
+    rsGetElementAt_uchar4(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern short __attribute__((overloadable))
+    rsGetElementAt_short(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern short2 __attribute__((overloadable))
+    rsGetElementAt_short2(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern short3 __attribute__((overloadable))
+    rsGetElementAt_short3(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern short4 __attribute__((overloadable))
+    rsGetElementAt_short4(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ushort __attribute__((overloadable))
+    rsGetElementAt_ushort(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ushort2 __attribute__((overloadable))
+    rsGetElementAt_ushort2(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ushort3 __attribute__((overloadable))
+    rsGetElementAt_ushort3(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ushort4 __attribute__((overloadable))
+    rsGetElementAt_ushort4(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern int __attribute__((overloadable))
+    rsGetElementAt_int(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern int2 __attribute__((overloadable))
+    rsGetElementAt_int2(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern int3 __attribute__((overloadable))
+    rsGetElementAt_int3(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern int4 __attribute__((overloadable))
+    rsGetElementAt_int4(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uint __attribute__((overloadable))
+    rsGetElementAt_uint(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uint2 __attribute__((overloadable))
+    rsGetElementAt_uint2(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uint3 __attribute__((overloadable))
+    rsGetElementAt_uint3(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uint4 __attribute__((overloadable))
+    rsGetElementAt_uint4(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern long __attribute__((overloadable))
+    rsGetElementAt_long(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern long2 __attribute__((overloadable))
+    rsGetElementAt_long2(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern long3 __attribute__((overloadable))
+    rsGetElementAt_long3(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern long4 __attribute__((overloadable))
+    rsGetElementAt_long4(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ulong __attribute__((overloadable))
+    rsGetElementAt_ulong(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ulong2 __attribute__((overloadable))
+    rsGetElementAt_ulong2(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ulong3 __attribute__((overloadable))
+    rsGetElementAt_ulong3(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ulong4 __attribute__((overloadable))
+    rsGetElementAt_ulong4(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float __attribute__((overloadable))
+    rsGetElementAt_float(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float2 __attribute__((overloadable))
+    rsGetElementAt_float2(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float3 __attribute__((overloadable))
+    rsGetElementAt_float3(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float4 __attribute__((overloadable))
+    rsGetElementAt_float4(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern double __attribute__((overloadable))
+    rsGetElementAt_double(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern double2 __attribute__((overloadable))
+    rsGetElementAt_double2(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern double3 __attribute__((overloadable))
+    rsGetElementAt_double3(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern double4 __attribute__((overloadable))
+    rsGetElementAt_double4(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern char __attribute__((overloadable))
+    rsGetElementAt_char(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern char2 __attribute__((overloadable))
+    rsGetElementAt_char2(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern char3 __attribute__((overloadable))
+    rsGetElementAt_char3(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern char4 __attribute__((overloadable))
+    rsGetElementAt_char4(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uchar __attribute__((overloadable))
+    rsGetElementAt_uchar(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uchar2 __attribute__((overloadable))
+    rsGetElementAt_uchar2(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uchar3 __attribute__((overloadable))
+    rsGetElementAt_uchar3(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uchar4 __attribute__((overloadable))
+    rsGetElementAt_uchar4(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern short __attribute__((overloadable))
+    rsGetElementAt_short(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern short2 __attribute__((overloadable))
+    rsGetElementAt_short2(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern short3 __attribute__((overloadable))
+    rsGetElementAt_short3(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern short4 __attribute__((overloadable))
+    rsGetElementAt_short4(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ushort __attribute__((overloadable))
+    rsGetElementAt_ushort(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ushort2 __attribute__((overloadable))
+    rsGetElementAt_ushort2(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ushort3 __attribute__((overloadable))
+    rsGetElementAt_ushort3(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ushort4 __attribute__((overloadable))
+    rsGetElementAt_ushort4(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern int __attribute__((overloadable))
+    rsGetElementAt_int(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern int2 __attribute__((overloadable))
+    rsGetElementAt_int2(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern int3 __attribute__((overloadable))
+    rsGetElementAt_int3(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern int4 __attribute__((overloadable))
+    rsGetElementAt_int4(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uint __attribute__((overloadable))
+    rsGetElementAt_uint(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uint2 __attribute__((overloadable))
+    rsGetElementAt_uint2(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uint3 __attribute__((overloadable))
+    rsGetElementAt_uint3(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uint4 __attribute__((overloadable))
+    rsGetElementAt_uint4(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern long __attribute__((overloadable))
+    rsGetElementAt_long(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern long2 __attribute__((overloadable))
+    rsGetElementAt_long2(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern long3 __attribute__((overloadable))
+    rsGetElementAt_long3(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern long4 __attribute__((overloadable))
+    rsGetElementAt_long4(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ulong __attribute__((overloadable))
+    rsGetElementAt_ulong(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ulong2 __attribute__((overloadable))
+    rsGetElementAt_ulong2(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ulong3 __attribute__((overloadable))
+    rsGetElementAt_ulong3(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ulong4 __attribute__((overloadable))
+    rsGetElementAt_ulong4(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float __attribute__((overloadable))
+    rsGetElementAt_float(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float2 __attribute__((overloadable))
+    rsGetElementAt_float2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float3 __attribute__((overloadable))
+    rsGetElementAt_float3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float4 __attribute__((overloadable))
+    rsGetElementAt_float4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern double __attribute__((overloadable))
+    rsGetElementAt_double(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern double2 __attribute__((overloadable))
+    rsGetElementAt_double2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern double3 __attribute__((overloadable))
+    rsGetElementAt_double3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern double4 __attribute__((overloadable))
+    rsGetElementAt_double4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern char __attribute__((overloadable))
+    rsGetElementAt_char(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern char2 __attribute__((overloadable))
+    rsGetElementAt_char2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern char3 __attribute__((overloadable))
+    rsGetElementAt_char3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern char4 __attribute__((overloadable))
+    rsGetElementAt_char4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uchar __attribute__((overloadable))
+    rsGetElementAt_uchar(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uchar2 __attribute__((overloadable))
+    rsGetElementAt_uchar2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uchar3 __attribute__((overloadable))
+    rsGetElementAt_uchar3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uchar4 __attribute__((overloadable))
+    rsGetElementAt_uchar4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern short __attribute__((overloadable))
+    rsGetElementAt_short(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern short2 __attribute__((overloadable))
+    rsGetElementAt_short2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern short3 __attribute__((overloadable))
+    rsGetElementAt_short3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern short4 __attribute__((overloadable))
+    rsGetElementAt_short4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ushort __attribute__((overloadable))
+    rsGetElementAt_ushort(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ushort2 __attribute__((overloadable))
+    rsGetElementAt_ushort2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ushort3 __attribute__((overloadable))
+    rsGetElementAt_ushort3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ushort4 __attribute__((overloadable))
+    rsGetElementAt_ushort4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern int __attribute__((overloadable))
+    rsGetElementAt_int(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern int2 __attribute__((overloadable))
+    rsGetElementAt_int2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern int3 __attribute__((overloadable))
+    rsGetElementAt_int3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern int4 __attribute__((overloadable))
+    rsGetElementAt_int4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uint __attribute__((overloadable))
+    rsGetElementAt_uint(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uint2 __attribute__((overloadable))
+    rsGetElementAt_uint2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uint3 __attribute__((overloadable))
+    rsGetElementAt_uint3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uint4 __attribute__((overloadable))
+    rsGetElementAt_uint4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern long __attribute__((overloadable))
+    rsGetElementAt_long(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern long2 __attribute__((overloadable))
+    rsGetElementAt_long2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern long3 __attribute__((overloadable))
+    rsGetElementAt_long3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern long4 __attribute__((overloadable))
+    rsGetElementAt_long4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ulong __attribute__((overloadable))
+    rsGetElementAt_ulong(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ulong2 __attribute__((overloadable))
+    rsGetElementAt_ulong2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ulong3 __attribute__((overloadable))
+    rsGetElementAt_ulong3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ulong4 __attribute__((overloadable))
+    rsGetElementAt_ulong4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern half __attribute__((overloadable))
+    rsGetElementAt_half(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern half2 __attribute__((overloadable))
+    rsGetElementAt_half2(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern half3 __attribute__((overloadable))
+    rsGetElementAt_half3(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern half4 __attribute__((overloadable))
+    rsGetElementAt_half4(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern half __attribute__((overloadable))
+    rsGetElementAt_half(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern half2 __attribute__((overloadable))
+    rsGetElementAt_half2(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern half3 __attribute__((overloadable))
+    rsGetElementAt_half3(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern half4 __attribute__((overloadable))
+    rsGetElementAt_half4(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern half __attribute__((overloadable))
+    rsGetElementAt_half(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern half2 __attribute__((overloadable))
+    rsGetElementAt_half2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern half3 __attribute__((overloadable))
+    rsGetElementAt_half3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern half4 __attribute__((overloadable))
+    rsGetElementAt_half4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+/*
+ * rsGetElementAtYuv_uchar_U: Get the U component of an allocation of YUVs
+ *
+ * Extracts the U component of a single YUV value from a 2D allocation of YUVs.
+ *
+ * Inside an allocation, Y, U, and V components may be stored in different planes
+ * and at different resolutions.  The x, y coordinates provided here are in the
+ * dimensions of the Y plane.
+ *
+ * See rsGetElementAtYuv_uchar_Y().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uchar __attribute__((overloadable))
+    rsGetElementAtYuv_uchar_U(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+/*
+ * rsGetElementAtYuv_uchar_V: Get the V component of an allocation of YUVs
+ *
+ * Extracts the V component of a single YUV value from a 2D allocation of YUVs.
+ *
+ * Inside an allocation, Y, U, and V components may be stored in different planes
+ * and at different resolutions.  The x, y coordinates provided here are in the
+ * dimensions of the Y plane.
+ *
+ * See rsGetElementAtYuv_uchar_Y().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uchar __attribute__((overloadable))
+    rsGetElementAtYuv_uchar_V(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+/*
+ * rsGetElementAtYuv_uchar_Y: Get the Y component of an allocation of YUVs
+ *
+ * Extracts the Y component of a single YUV value from a 2D allocation of YUVs.
+ *
+ * Inside an allocation, Y, U, and V components may be stored in different planes
+ * and at different resolutions.  The x, y coordinates provided here are in the
+ * dimensions of the Y plane.
+ *
+ * See rsGetElementAtYuv_uchar_U() and rsGetElementAtYuv_uchar_V().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uchar __attribute__((overloadable))
+    rsGetElementAtYuv_uchar_Y(rs_allocation a, uint32_t x, uint32_t y);
+#endif
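+
+/*
+ * A minimal usage sketch (assuming a hypothetical script global
+ * rs_allocation gYuv holding a YUV image set from the host side):
+ *
+ *   // x, y are coordinates in the dimensions of the Y plane, as noted above.
+ *   uchar yy = rsGetElementAtYuv_uchar_Y(gYuv, x, y);
+ *   uchar u  = rsGetElementAtYuv_uchar_U(gYuv, x, y);
+ *   uchar v  = rsGetElementAtYuv_uchar_V(gYuv, x, y);
+ */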
+
+/*
+ * rsSample: Sample a value from a texture allocation
+ *
+ * Fetches a value from a texture allocation in a way described by the sampler.
+ *
+ * If your allocation is 1D, use the variant with float for location.  For 2D,
+ * use the float2 variant.
+ *
+ * See android.renderscript.Sampler for more details.
+ *
+ * Parameters:
+ *   a: Allocation to sample from.
+ *   s: Sampler state.
+ *   location: Location to sample from.
+ *   lod: Mip level to sample from; for fractional values, mip levels will be interpolated if RS_SAMPLER_LINEAR_MIP_LINEAR is used.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+extern float4 __attribute__((overloadable))
+    rsSample(rs_allocation a, rs_sampler s, float location);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+extern float4 __attribute__((overloadable))
+    rsSample(rs_allocation a, rs_sampler s, float location, float lod);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+extern float4 __attribute__((overloadable))
+    rsSample(rs_allocation a, rs_sampler s, float2 location);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+extern float4 __attribute__((overloadable))
+    rsSample(rs_allocation a, rs_sampler s, float2 location, float lod);
+#endif
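+
+/*
+ * A minimal usage sketch (assuming hypothetical script globals
+ * rs_allocation gTex and rs_sampler gSampler set from the host side;
+ * the meaning of location follows the bound sampler's conventions):
+ *
+ *   float2 uv = {0.25f, 0.75f};
+ *   float4 texel  = rsSample(gTex, gSampler, uv);        // base level
+ *   float4 coarse = rsSample(gTex, gSampler, uv, 2.0f);  // explicit mip level
+ */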
+
+/*
+ * rsSetElementAt: Set a cell of an allocation
+ *
+ * This function stores a value into a single cell of an allocation.
+ *
+ * When storing into a three dimensional allocation, use the x, y, z variant.
+ * Similarly, use the x, y variant for two dimensional allocations and x for
+ * one dimensional allocations.
+ *
+ * This function has two styles.  One passes the value to be stored using a void*;
+ * the other takes the actual value as an argument, e.g. rsSetElementAt() vs.
+ * rsSetElementAt_int4().  For primitive types, always use the latter as it is
+ * more efficient.
+ *
+ * See also rsGetElementAt().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt(rs_allocation a, void* ptr, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt(rs_allocation a, void* ptr, uint32_t x, uint32_t y);
+#endif
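+
+/*
+ * A minimal usage sketch (assuming a hypothetical script global
+ * rs_allocation gOut of float4 elements):
+ *
+ *   float4 color = {1.f, 0.f, 0.f, 1.f};
+ *
+ *   // Typed setter: preferred for primitive types.
+ *   rsSetElementAt_float4(gOut, color, x, y);
+ *
+ *   // Generic setter: the value is passed through a void*.
+ *   rsSetElementAt(gOut, &color, x, y);
+ */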
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_float(rs_allocation a, float val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_float2(rs_allocation a, float2 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_float3(rs_allocation a, float3 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_float4(rs_allocation a, float4 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_double(rs_allocation a, double val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_double2(rs_allocation a, double2 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_double3(rs_allocation a, double3 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_double4(rs_allocation a, double4 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_char(rs_allocation a, char val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_char2(rs_allocation a, char2 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_char3(rs_allocation a, char3 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_char4(rs_allocation a, char4 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uchar(rs_allocation a, uchar val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uchar2(rs_allocation a, uchar2 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uchar3(rs_allocation a, uchar3 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uchar4(rs_allocation a, uchar4 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_short(rs_allocation a, short val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_short2(rs_allocation a, short2 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_short3(rs_allocation a, short3 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_short4(rs_allocation a, short4 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ushort(rs_allocation a, ushort val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ushort2(rs_allocation a, ushort2 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ushort3(rs_allocation a, ushort3 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ushort4(rs_allocation a, ushort4 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_int(rs_allocation a, int val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_int2(rs_allocation a, int2 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_int3(rs_allocation a, int3 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_int4(rs_allocation a, int4 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uint(rs_allocation a, uint val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uint2(rs_allocation a, uint2 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uint3(rs_allocation a, uint3 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uint4(rs_allocation a, uint4 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_long(rs_allocation a, long val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_long2(rs_allocation a, long2 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_long3(rs_allocation a, long3 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_long4(rs_allocation a, long4 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ulong(rs_allocation a, ulong val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ulong2(rs_allocation a, ulong2 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ulong3(rs_allocation a, ulong3 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ulong4(rs_allocation a, ulong4 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_float(rs_allocation a, float val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_float2(rs_allocation a, float2 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_float3(rs_allocation a, float3 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_float4(rs_allocation a, float4 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_double(rs_allocation a, double val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_double2(rs_allocation a, double2 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_double3(rs_allocation a, double3 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_double4(rs_allocation a, double4 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_char(rs_allocation a, char val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_char2(rs_allocation a, char2 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_char3(rs_allocation a, char3 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_char4(rs_allocation a, char4 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uchar(rs_allocation a, uchar val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uchar2(rs_allocation a, uchar2 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uchar3(rs_allocation a, uchar3 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uchar4(rs_allocation a, uchar4 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_short(rs_allocation a, short val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_short2(rs_allocation a, short2 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_short3(rs_allocation a, short3 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_short4(rs_allocation a, short4 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ushort(rs_allocation a, ushort val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ushort2(rs_allocation a, ushort2 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ushort3(rs_allocation a, ushort3 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ushort4(rs_allocation a, ushort4 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_int(rs_allocation a, int val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_int2(rs_allocation a, int2 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_int3(rs_allocation a, int3 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_int4(rs_allocation a, int4 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uint(rs_allocation a, uint val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uint2(rs_allocation a, uint2 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uint3(rs_allocation a, uint3 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uint4(rs_allocation a, uint4 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_long(rs_allocation a, long val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_long2(rs_allocation a, long2 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_long3(rs_allocation a, long3 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_long4(rs_allocation a, long4 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ulong(rs_allocation a, ulong val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ulong2(rs_allocation a, ulong2 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ulong3(rs_allocation a, ulong3 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ulong4(rs_allocation a, ulong4 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_float(rs_allocation a, float val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_float2(rs_allocation a, float2 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_float3(rs_allocation a, float3 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_float4(rs_allocation a, float4 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_double(rs_allocation a, double val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_double2(rs_allocation a, double2 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_double3(rs_allocation a, double3 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_double4(rs_allocation a, double4 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_char(rs_allocation a, char val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_char2(rs_allocation a, char2 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_char3(rs_allocation a, char3 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_char4(rs_allocation a, char4 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uchar(rs_allocation a, uchar val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uchar2(rs_allocation a, uchar2 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uchar3(rs_allocation a, uchar3 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uchar4(rs_allocation a, uchar4 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_short(rs_allocation a, short val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_short2(rs_allocation a, short2 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_short3(rs_allocation a, short3 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_short4(rs_allocation a, short4 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ushort(rs_allocation a, ushort val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ushort2(rs_allocation a, ushort2 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ushort3(rs_allocation a, ushort3 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ushort4(rs_allocation a, ushort4 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_int(rs_allocation a, int val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_int2(rs_allocation a, int2 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_int3(rs_allocation a, int3 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_int4(rs_allocation a, int4 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uint(rs_allocation a, uint val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uint2(rs_allocation a, uint2 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uint3(rs_allocation a, uint3 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uint4(rs_allocation a, uint4 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_long(rs_allocation a, long val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_long2(rs_allocation a, long2 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_long3(rs_allocation a, long3 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_long4(rs_allocation a, long4 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ulong(rs_allocation a, ulong val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ulong2(rs_allocation a, ulong2 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ulong3(rs_allocation a, ulong3 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ulong4(rs_allocation a, ulong4 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern void __attribute__((overloadable))
+    rsSetElementAt_half(rs_allocation a, half val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern void __attribute__((overloadable))
+    rsSetElementAt_half2(rs_allocation a, half2 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern void __attribute__((overloadable))
+    rsSetElementAt_half3(rs_allocation a, half3 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern void __attribute__((overloadable))
+    rsSetElementAt_half4(rs_allocation a, half4 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern void __attribute__((overloadable))
+    rsSetElementAt_half(rs_allocation a, half val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern void __attribute__((overloadable))
+    rsSetElementAt_half2(rs_allocation a, half2 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern void __attribute__((overloadable))
+    rsSetElementAt_half3(rs_allocation a, half3 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern void __attribute__((overloadable))
+    rsSetElementAt_half4(rs_allocation a, half4 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern void __attribute__((overloadable))
+    rsSetElementAt_half(rs_allocation a, half val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern void __attribute__((overloadable))
+    rsSetElementAt_half2(rs_allocation a, half2 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern void __attribute__((overloadable))
+    rsSetElementAt_half3(rs_allocation a, half3 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern void __attribute__((overloadable))
+    rsSetElementAt_half4(rs_allocation a, half4 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#endif // RENDERSCRIPT_RS_ALLOCATION_DATA_RSH
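For reference, a minimal sketch of how a script might use the typed setters declared above; the allocation name gOut and the fill() helper are illustrative, not part of the prebuilt headers:

rs_allocation gOut;  // assumed: a 2D float allocation bound from Java

void fill(uint32_t w, uint32_t h, float value) {
    for (uint32_t y = 0; y < h; y++) {
        for (uint32_t x = 0; x < w; x++) {
            // 2D overload; gated on RS_VERSION >= 18 by the declarations above
            rsSetElementAt_float(gOut, value, x, y);
        }
    }
}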
diff --git a/23.0.3/include/rs_atomic.rsh b/23.0.3/include/rs_atomic.rsh
new file mode 100644
index 0000000..0ab26ca
--- /dev/null
+++ b/23.0.3/include/rs_atomic.rsh
@@ -0,0 +1,257 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Don't edit this file!  It is auto-generated by frameworks/rs/api/generate.sh.
+
+/*
+ * rs_atomic.rsh: Atomic Update Functions
+ *
+ * To update values shared between multiple threads, use the functions below.
+ * They ensure that the values are atomically updated, i.e. that the memory
+ * reads, the updates, and the memory writes are done in the right order.
+ *
+ * These functions are slower than their non-atomic equivalents, so use
+ * them only when synchronization is needed.
+ *
+ * Note that in RenderScript, your code is likely to be running in separate
+ * threads even though you did not explicitly create them.  The RenderScript
+ * runtime will very often split the execution of one kernel across multiple
+ * threads.  Updating globals should be done with atomic functions.  If possible,
+ * modify your algorithm to avoid them altogether.
+ */
+
+#ifndef RENDERSCRIPT_RS_ATOMIC_RSH
+#define RENDERSCRIPT_RS_ATOMIC_RSH
+
+/*
+ * rsAtomicAdd: Thread-safe addition
+ *
+ * Atomically adds a value to the value at addr, i.e. *addr += value.
+ *
+ * Parameters:
+ *   addr: Address of the value to modify.
+ *   value: Amount to add.
+ *
+ * Returns: Value of *addr prior to the operation.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+extern int32_t __attribute__((overloadable))
+    rsAtomicAdd(volatile int32_t* addr, int32_t value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+extern int32_t __attribute__((overloadable))
+    rsAtomicAdd(volatile uint32_t* addr, uint32_t value);
+#endif
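A minimal sketch of rsAtomicAdd used as a shared counter across kernel threads; the global and the kernel below are assumptions for illustration, not part of the header:

volatile int32_t gMatches;  // shared across threads; reset from Java before the launch

void root(const float *in, uint32_t x) {
    if (*in > 0.5f) {
        rsAtomicAdd(&gMatches, 1);  // returns the prior value; ignored here
    }
}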
+
+/*
+ * rsAtomicAnd: Thread-safe bitwise and
+ *
+ * Atomically performs a bitwise and of two values, storing the result back at addr,
+ * i.e. *addr &= value.
+ *
+ * Parameters:
+ *   addr: Address of the value to modify.
+ *   value: Value to and with.
+ *
+ * Returns: Value of *addr prior to the operation.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+extern int32_t __attribute__((overloadable))
+    rsAtomicAnd(volatile int32_t* addr, int32_t value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+extern int32_t __attribute__((overloadable))
+    rsAtomicAnd(volatile uint32_t* addr, uint32_t value);
+#endif
+
+/*
+ * rsAtomicCas: Thread-safe compare and set
+ *
+ * If the value at addr matches compareValue then the newValue is written at addr,
+ * i.e. if (*addr == compareValue) { *addr = newValue; }.
+ *
+ * You can check that the value was written by checking that the value returned
+ * by rsAtomicCas() is compareValue.
+ *
+ * Parameters:
+ *   addr: Address of the value to compare and replace if the test passes.
+ *   compareValue: Value to test *addr against.
+ *   newValue: Value to write if the test passes.
+ *
+ * Returns: Value of *addr prior to the operation.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+extern int32_t __attribute__((overloadable))
+    rsAtomicCas(volatile int32_t* addr, int32_t compareValue, int32_t newValue);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+extern uint32_t __attribute__((overloadable))
+    rsAtomicCas(volatile uint32_t* addr, uint32_t compareValue, uint32_t newValue);
+#endif
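As the comment above notes, success is detected by comparing the returned value with compareValue. A sketch of a claim-once helper built on that, with illustrative names:

volatile int32_t gOwner;  // assumed: 0 means unclaimed, initialized from Java

// Returns true only for the single thread whose CAS observed the value 0.
static bool claim(int32_t id) {
    return rsAtomicCas(&gOwner, 0, id) == 0;
}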
+
+/*
+ * rsAtomicDec: Thread-safe decrement
+ *
+ * Atomically subtracts one from the value at addr.  This is equivalent to rsAtomicSub(addr, 1).
+ *
+ * Parameters:
+ *   addr: Address of the value to decrement.
+ *
+ * Returns: Value of *addr prior to the operation.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+extern int32_t __attribute__((overloadable))
+    rsAtomicDec(volatile int32_t* addr);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+extern int32_t __attribute__((overloadable))
+    rsAtomicDec(volatile uint32_t* addr);
+#endif
+
+/*
+ * rsAtomicInc: Thread-safe increment
+ *
+ * Atomically adds one to the value at addr.  This is equivalent to rsAtomicAdd(addr, 1).
+ *
+ * Parameters:
+ *   addr: Address of the value to increment.
+ *
+ * Returns: Value of *addr prior to the operation.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+extern int32_t __attribute__((overloadable))
+    rsAtomicInc(volatile int32_t* addr);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+extern int32_t __attribute__((overloadable))
+    rsAtomicInc(volatile uint32_t* addr);
+#endif
+
+/*
+ * rsAtomicMax: Thread-safe maximum
+ *
+ * Atomically sets the value at addr to the maximum of *addr and value, i.e.
+ * *addr = max(*addr, value).
+ *
+ * Parameters:
+ *   addr: Address of the value to modify.
+ *   value: Comparison value.
+ *
+ * Returns: Value of *addr prior to the operation.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+extern uint32_t __attribute__((overloadable))
+    rsAtomicMax(volatile uint32_t* addr, uint32_t value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+extern int32_t __attribute__((overloadable))
+    rsAtomicMax(volatile int32_t* addr, int32_t value);
+#endif
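A sketch of tracking a running maximum across threads with rsAtomicMax; names are illustrative and the global is assumed to be initialized from Java to the smallest possible int:

volatile int32_t gMaxSeen;

void root(const int *in, uint32_t x) {
    rsAtomicMax(&gMaxSeen, *in);  // prior value returned; ignored here
}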
+
+/*
+ * rsAtomicMin: Thread-safe minimum
+ *
+ * Atomically sets the value at addr to the minimum of *addr and value, i.e.
+ * *addr = min(*addr, value).
+ *
+ * Parameters:
+ *   addr: Address of the value to modify.
+ *   value: Comparison value.
+ *
+ * Returns: Value of *addr prior to the operation.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+extern uint32_t __attribute__((overloadable))
+    rsAtomicMin(volatile uint32_t* addr, uint32_t value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+extern int32_t __attribute__((overloadable))
+    rsAtomicMin(volatile int32_t* addr, int32_t value);
+#endif
+
+/*
+ * rsAtomicOr: Thread-safe bitwise or
+ *
+ * Atomically performs a bitwise or of two values, storing the result at addr,
+ * i.e. *addr |= value.
+ *
+ * Parameters:
+ *   addr: Address of the value to modify.
+ *   value: Value to or with.
+ *
+ * Returns: Value of *addr prior to the operation.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+extern int32_t __attribute__((overloadable))
+    rsAtomicOr(volatile int32_t* addr, int32_t value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+extern int32_t __attribute__((overloadable))
+    rsAtomicOr(volatile uint32_t* addr, uint32_t value);
+#endif
+
+/*
+ * rsAtomicSub: Thread-safe subtraction
+ *
+ * Atomically subtracts a value from the value at addr, i.e. *addr -= value.
+ *
+ * Parameters:
+ *   addr: Address of the value to modify.
+ *   value: Amount to subtract.
+ *
+ * Returns: Value of *addr prior to the operation.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+extern int32_t __attribute__((overloadable))
+    rsAtomicSub(volatile int32_t* addr, int32_t value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+extern int32_t __attribute__((overloadable))
+    rsAtomicSub(volatile uint32_t* addr, uint32_t value);
+#endif
+
+/*
+ * rsAtomicXor: Thread-safe bitwise exclusive or
+ *
+ * Atomically performs a bitwise xor of two values, storing the result at addr,
+ * i.e. *addr ^= value.
+ *
+ * Parameters:
+ *   addr: Address of the value to modify.
+ *   value: Value to xor with.
+ *
+ * Returns: Value of *addr prior to the operation.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+extern int32_t __attribute__((overloadable))
+    rsAtomicXor(volatile int32_t* addr, int32_t value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+extern int32_t __attribute__((overloadable))
+    rsAtomicXor(volatile uint32_t* addr, uint32_t value);
+#endif
+
+#endif // RENDERSCRIPT_RS_ATOMIC_RSH
diff --git a/23.0.3/include/rs_convert.rsh b/23.0.3/include/rs_convert.rsh
new file mode 100644
index 0000000..66e47c6
--- /dev/null
+++ b/23.0.3/include/rs_convert.rsh
@@ -0,0 +1,1308 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Don't edit this file!  It is auto-generated by frameworks/rs/api/generate.sh.
+
+/*
+ * rs_convert.rsh: Conversion Functions
+ *
+ * The functions below convert from one numerical vector type to another, or from one color
+ * representation to another.
+ */
+
+#ifndef RENDERSCRIPT_RS_CONVERT_RSH
+#define RENDERSCRIPT_RS_CONVERT_RSH
+
+/*
+ * convert: Convert numerical vectors
+ *
+ * Converts a vector from one numerical type to another.  The conversions are done entry by entry.
+ *
+ * E.g. calling a = convert_short3(b); is equivalent to doing
+ * a.x = (short)b.x; a.y = (short)b.y; a.z = (short)b.z;.
+ *
+ * Converting floating point values to integer types truncates.
+ *
+ * Converting numbers too large to fit the destination type yields undefined results.
+ * For example, converting a float that contains 1.0e18 to a short is undefined.
+ * Use clamp() to avoid this.
+ */
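Following the advice above, a sketch that clamps before converting; the 0..255 range is an assumption about the pixel data, not something the header imposes:

static uchar4 toPixel(float4 v) {
    // clamp() keeps every lane inside the uchar range before the truncating convert
    return convert_uchar4(clamp(v, 0.f, 255.f));
}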
+extern float2 __attribute__((const, overloadable))
+    convert_float2(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    convert_float3(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    convert_float4(float4 v);
+
+extern float2 __attribute__((const, overloadable))
+    convert_float2(char2 v);
+
+extern float3 __attribute__((const, overloadable))
+    convert_float3(char3 v);
+
+extern float4 __attribute__((const, overloadable))
+    convert_float4(char4 v);
+
+extern float2 __attribute__((const, overloadable))
+    convert_float2(uchar2 v);
+
+extern float3 __attribute__((const, overloadable))
+    convert_float3(uchar3 v);
+
+extern float4 __attribute__((const, overloadable))
+    convert_float4(uchar4 v);
+
+extern float2 __attribute__((const, overloadable))
+    convert_float2(short2 v);
+
+extern float3 __attribute__((const, overloadable))
+    convert_float3(short3 v);
+
+extern float4 __attribute__((const, overloadable))
+    convert_float4(short4 v);
+
+extern float2 __attribute__((const, overloadable))
+    convert_float2(ushort2 v);
+
+extern float3 __attribute__((const, overloadable))
+    convert_float3(ushort3 v);
+
+extern float4 __attribute__((const, overloadable))
+    convert_float4(ushort4 v);
+
+extern float2 __attribute__((const, overloadable))
+    convert_float2(int2 v);
+
+extern float3 __attribute__((const, overloadable))
+    convert_float3(int3 v);
+
+extern float4 __attribute__((const, overloadable))
+    convert_float4(int4 v);
+
+extern float2 __attribute__((const, overloadable))
+    convert_float2(uint2 v);
+
+extern float3 __attribute__((const, overloadable))
+    convert_float3(uint3 v);
+
+extern float4 __attribute__((const, overloadable))
+    convert_float4(uint4 v);
+
+extern char2 __attribute__((const, overloadable))
+    convert_char2(float2 v);
+
+extern char3 __attribute__((const, overloadable))
+    convert_char3(float3 v);
+
+extern char4 __attribute__((const, overloadable))
+    convert_char4(float4 v);
+
+extern char2 __attribute__((const, overloadable))
+    convert_char2(char2 v);
+
+extern char3 __attribute__((const, overloadable))
+    convert_char3(char3 v);
+
+extern char4 __attribute__((const, overloadable))
+    convert_char4(char4 v);
+
+extern char2 __attribute__((const, overloadable))
+    convert_char2(uchar2 v);
+
+extern char3 __attribute__((const, overloadable))
+    convert_char3(uchar3 v);
+
+extern char4 __attribute__((const, overloadable))
+    convert_char4(uchar4 v);
+
+extern char2 __attribute__((const, overloadable))
+    convert_char2(short2 v);
+
+extern char3 __attribute__((const, overloadable))
+    convert_char3(short3 v);
+
+extern char4 __attribute__((const, overloadable))
+    convert_char4(short4 v);
+
+extern char2 __attribute__((const, overloadable))
+    convert_char2(ushort2 v);
+
+extern char3 __attribute__((const, overloadable))
+    convert_char3(ushort3 v);
+
+extern char4 __attribute__((const, overloadable))
+    convert_char4(ushort4 v);
+
+extern char2 __attribute__((const, overloadable))
+    convert_char2(int2 v);
+
+extern char3 __attribute__((const, overloadable))
+    convert_char3(int3 v);
+
+extern char4 __attribute__((const, overloadable))
+    convert_char4(int4 v);
+
+extern char2 __attribute__((const, overloadable))
+    convert_char2(uint2 v);
+
+extern char3 __attribute__((const, overloadable))
+    convert_char3(uint3 v);
+
+extern char4 __attribute__((const, overloadable))
+    convert_char4(uint4 v);
+
+extern uchar2 __attribute__((const, overloadable))
+    convert_uchar2(float2 v);
+
+extern uchar3 __attribute__((const, overloadable))
+    convert_uchar3(float3 v);
+
+extern uchar4 __attribute__((const, overloadable))
+    convert_uchar4(float4 v);
+
+extern uchar2 __attribute__((const, overloadable))
+    convert_uchar2(char2 v);
+
+extern uchar3 __attribute__((const, overloadable))
+    convert_uchar3(char3 v);
+
+extern uchar4 __attribute__((const, overloadable))
+    convert_uchar4(char4 v);
+
+extern uchar2 __attribute__((const, overloadable))
+    convert_uchar2(uchar2 v);
+
+extern uchar3 __attribute__((const, overloadable))
+    convert_uchar3(uchar3 v);
+
+extern uchar4 __attribute__((const, overloadable))
+    convert_uchar4(uchar4 v);
+
+extern uchar2 __attribute__((const, overloadable))
+    convert_uchar2(short2 v);
+
+extern uchar3 __attribute__((const, overloadable))
+    convert_uchar3(short3 v);
+
+extern uchar4 __attribute__((const, overloadable))
+    convert_uchar4(short4 v);
+
+extern uchar2 __attribute__((const, overloadable))
+    convert_uchar2(ushort2 v);
+
+extern uchar3 __attribute__((const, overloadable))
+    convert_uchar3(ushort3 v);
+
+extern uchar4 __attribute__((const, overloadable))
+    convert_uchar4(ushort4 v);
+
+extern uchar2 __attribute__((const, overloadable))
+    convert_uchar2(int2 v);
+
+extern uchar3 __attribute__((const, overloadable))
+    convert_uchar3(int3 v);
+
+extern uchar4 __attribute__((const, overloadable))
+    convert_uchar4(int4 v);
+
+extern uchar2 __attribute__((const, overloadable))
+    convert_uchar2(uint2 v);
+
+extern uchar3 __attribute__((const, overloadable))
+    convert_uchar3(uint3 v);
+
+extern uchar4 __attribute__((const, overloadable))
+    convert_uchar4(uint4 v);
+
+extern short2 __attribute__((const, overloadable))
+    convert_short2(float2 v);
+
+extern short3 __attribute__((const, overloadable))
+    convert_short3(float3 v);
+
+extern short4 __attribute__((const, overloadable))
+    convert_short4(float4 v);
+
+extern short2 __attribute__((const, overloadable))
+    convert_short2(char2 v);
+
+extern short3 __attribute__((const, overloadable))
+    convert_short3(char3 v);
+
+extern short4 __attribute__((const, overloadable))
+    convert_short4(char4 v);
+
+extern short2 __attribute__((const, overloadable))
+    convert_short2(uchar2 v);
+
+extern short3 __attribute__((const, overloadable))
+    convert_short3(uchar3 v);
+
+extern short4 __attribute__((const, overloadable))
+    convert_short4(uchar4 v);
+
+extern short2 __attribute__((const, overloadable))
+    convert_short2(short2 v);
+
+extern short3 __attribute__((const, overloadable))
+    convert_short3(short3 v);
+
+extern short4 __attribute__((const, overloadable))
+    convert_short4(short4 v);
+
+extern short2 __attribute__((const, overloadable))
+    convert_short2(ushort2 v);
+
+extern short3 __attribute__((const, overloadable))
+    convert_short3(ushort3 v);
+
+extern short4 __attribute__((const, overloadable))
+    convert_short4(ushort4 v);
+
+extern short2 __attribute__((const, overloadable))
+    convert_short2(int2 v);
+
+extern short3 __attribute__((const, overloadable))
+    convert_short3(int3 v);
+
+extern short4 __attribute__((const, overloadable))
+    convert_short4(int4 v);
+
+extern short2 __attribute__((const, overloadable))
+    convert_short2(uint2 v);
+
+extern short3 __attribute__((const, overloadable))
+    convert_short3(uint3 v);
+
+extern short4 __attribute__((const, overloadable))
+    convert_short4(uint4 v);
+
+extern ushort2 __attribute__((const, overloadable))
+    convert_ushort2(float2 v);
+
+extern ushort3 __attribute__((const, overloadable))
+    convert_ushort3(float3 v);
+
+extern ushort4 __attribute__((const, overloadable))
+    convert_ushort4(float4 v);
+
+extern ushort2 __attribute__((const, overloadable))
+    convert_ushort2(char2 v);
+
+extern ushort3 __attribute__((const, overloadable))
+    convert_ushort3(char3 v);
+
+extern ushort4 __attribute__((const, overloadable))
+    convert_ushort4(char4 v);
+
+extern ushort2 __attribute__((const, overloadable))
+    convert_ushort2(uchar2 v);
+
+extern ushort3 __attribute__((const, overloadable))
+    convert_ushort3(uchar3 v);
+
+extern ushort4 __attribute__((const, overloadable))
+    convert_ushort4(uchar4 v);
+
+extern ushort2 __attribute__((const, overloadable))
+    convert_ushort2(short2 v);
+
+extern ushort3 __attribute__((const, overloadable))
+    convert_ushort3(short3 v);
+
+extern ushort4 __attribute__((const, overloadable))
+    convert_ushort4(short4 v);
+
+extern ushort2 __attribute__((const, overloadable))
+    convert_ushort2(ushort2 v);
+
+extern ushort3 __attribute__((const, overloadable))
+    convert_ushort3(ushort3 v);
+
+extern ushort4 __attribute__((const, overloadable))
+    convert_ushort4(ushort4 v);
+
+extern ushort2 __attribute__((const, overloadable))
+    convert_ushort2(int2 v);
+
+extern ushort3 __attribute__((const, overloadable))
+    convert_ushort3(int3 v);
+
+extern ushort4 __attribute__((const, overloadable))
+    convert_ushort4(int4 v);
+
+extern ushort2 __attribute__((const, overloadable))
+    convert_ushort2(uint2 v);
+
+extern ushort3 __attribute__((const, overloadable))
+    convert_ushort3(uint3 v);
+
+extern ushort4 __attribute__((const, overloadable))
+    convert_ushort4(uint4 v);
+
+extern int2 __attribute__((const, overloadable))
+    convert_int2(float2 v);
+
+extern int3 __attribute__((const, overloadable))
+    convert_int3(float3 v);
+
+extern int4 __attribute__((const, overloadable))
+    convert_int4(float4 v);
+
+extern int2 __attribute__((const, overloadable))
+    convert_int2(char2 v);
+
+extern int3 __attribute__((const, overloadable))
+    convert_int3(char3 v);
+
+extern int4 __attribute__((const, overloadable))
+    convert_int4(char4 v);
+
+extern int2 __attribute__((const, overloadable))
+    convert_int2(uchar2 v);
+
+extern int3 __attribute__((const, overloadable))
+    convert_int3(uchar3 v);
+
+extern int4 __attribute__((const, overloadable))
+    convert_int4(uchar4 v);
+
+extern int2 __attribute__((const, overloadable))
+    convert_int2(short2 v);
+
+extern int3 __attribute__((const, overloadable))
+    convert_int3(short3 v);
+
+extern int4 __attribute__((const, overloadable))
+    convert_int4(short4 v);
+
+extern int2 __attribute__((const, overloadable))
+    convert_int2(ushort2 v);
+
+extern int3 __attribute__((const, overloadable))
+    convert_int3(ushort3 v);
+
+extern int4 __attribute__((const, overloadable))
+    convert_int4(ushort4 v);
+
+extern int2 __attribute__((const, overloadable))
+    convert_int2(int2 v);
+
+extern int3 __attribute__((const, overloadable))
+    convert_int3(int3 v);
+
+extern int4 __attribute__((const, overloadable))
+    convert_int4(int4 v);
+
+extern int2 __attribute__((const, overloadable))
+    convert_int2(uint2 v);
+
+extern int3 __attribute__((const, overloadable))
+    convert_int3(uint3 v);
+
+extern int4 __attribute__((const, overloadable))
+    convert_int4(uint4 v);
+
+extern uint2 __attribute__((const, overloadable))
+    convert_uint2(float2 v);
+
+extern uint3 __attribute__((const, overloadable))
+    convert_uint3(float3 v);
+
+extern uint4 __attribute__((const, overloadable))
+    convert_uint4(float4 v);
+
+extern uint2 __attribute__((const, overloadable))
+    convert_uint2(char2 v);
+
+extern uint3 __attribute__((const, overloadable))
+    convert_uint3(char3 v);
+
+extern uint4 __attribute__((const, overloadable))
+    convert_uint4(char4 v);
+
+extern uint2 __attribute__((const, overloadable))
+    convert_uint2(uchar2 v);
+
+extern uint3 __attribute__((const, overloadable))
+    convert_uint3(uchar3 v);
+
+extern uint4 __attribute__((const, overloadable))
+    convert_uint4(uchar4 v);
+
+extern uint2 __attribute__((const, overloadable))
+    convert_uint2(short2 v);
+
+extern uint3 __attribute__((const, overloadable))
+    convert_uint3(short3 v);
+
+extern uint4 __attribute__((const, overloadable))
+    convert_uint4(short4 v);
+
+extern uint2 __attribute__((const, overloadable))
+    convert_uint2(ushort2 v);
+
+extern uint3 __attribute__((const, overloadable))
+    convert_uint3(ushort3 v);
+
+extern uint4 __attribute__((const, overloadable))
+    convert_uint4(ushort4 v);
+
+extern uint2 __attribute__((const, overloadable))
+    convert_uint2(int2 v);
+
+extern uint3 __attribute__((const, overloadable))
+    convert_uint3(int3 v);
+
+extern uint4 __attribute__((const, overloadable))
+    convert_uint4(int4 v);
+
+extern uint2 __attribute__((const, overloadable))
+    convert_uint2(uint2 v);
+
+extern uint3 __attribute__((const, overloadable))
+    convert_uint3(uint3 v);
+
+extern uint4 __attribute__((const, overloadable))
+    convert_uint4(uint4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double2 __attribute__((const, overloadable))
+    convert_double2(double2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double3 __attribute__((const, overloadable))
+    convert_double3(double3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double4 __attribute__((const, overloadable))
+    convert_double4(double4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double2 __attribute__((const, overloadable))
+    convert_double2(long2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double3 __attribute__((const, overloadable))
+    convert_double3(long3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double4 __attribute__((const, overloadable))
+    convert_double4(long4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double2 __attribute__((const, overloadable))
+    convert_double2(ulong2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double3 __attribute__((const, overloadable))
+    convert_double3(ulong3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double4 __attribute__((const, overloadable))
+    convert_double4(ulong4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long2 __attribute__((const, overloadable))
+    convert_long2(double2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long3 __attribute__((const, overloadable))
+    convert_long3(double3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long4 __attribute__((const, overloadable))
+    convert_long4(double4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long2 __attribute__((const, overloadable))
+    convert_long2(long2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long3 __attribute__((const, overloadable))
+    convert_long3(long3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long4 __attribute__((const, overloadable))
+    convert_long4(long4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long2 __attribute__((const, overloadable))
+    convert_long2(ulong2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long3 __attribute__((const, overloadable))
+    convert_long3(ulong3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long4 __attribute__((const, overloadable))
+    convert_long4(ulong4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong2 __attribute__((const, overloadable))
+    convert_ulong2(double2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong3 __attribute__((const, overloadable))
+    convert_ulong3(double3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong4 __attribute__((const, overloadable))
+    convert_ulong4(double4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong2 __attribute__((const, overloadable))
+    convert_ulong2(long2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong3 __attribute__((const, overloadable))
+    convert_ulong3(long3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong4 __attribute__((const, overloadable))
+    convert_ulong4(long4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong2 __attribute__((const, overloadable))
+    convert_ulong2(ulong2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong3 __attribute__((const, overloadable))
+    convert_ulong3(ulong3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong4 __attribute__((const, overloadable))
+    convert_ulong4(ulong4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    convert_float2(double2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    convert_float3(double3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    convert_float4(double4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    convert_float2(long2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    convert_float3(long3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    convert_float4(long4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    convert_float2(ulong2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    convert_float3(ulong3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    convert_float4(ulong4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern char2 __attribute__((const, overloadable))
+    convert_char2(double2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern char3 __attribute__((const, overloadable))
+    convert_char3(double3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern char4 __attribute__((const, overloadable))
+    convert_char4(double4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern char2 __attribute__((const, overloadable))
+    convert_char2(long2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern char3 __attribute__((const, overloadable))
+    convert_char3(long3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern char4 __attribute__((const, overloadable))
+    convert_char4(long4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern char2 __attribute__((const, overloadable))
+    convert_char2(ulong2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern char3 __attribute__((const, overloadable))
+    convert_char3(ulong3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern char4 __attribute__((const, overloadable))
+    convert_char4(ulong4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uchar2 __attribute__((const, overloadable))
+    convert_uchar2(double2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uchar3 __attribute__((const, overloadable))
+    convert_uchar3(double3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uchar4 __attribute__((const, overloadable))
+    convert_uchar4(double4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uchar2 __attribute__((const, overloadable))
+    convert_uchar2(long2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uchar3 __attribute__((const, overloadable))
+    convert_uchar3(long3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uchar4 __attribute__((const, overloadable))
+    convert_uchar4(long4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uchar2 __attribute__((const, overloadable))
+    convert_uchar2(ulong2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uchar3 __attribute__((const, overloadable))
+    convert_uchar3(ulong3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uchar4 __attribute__((const, overloadable))
+    convert_uchar4(ulong4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern short2 __attribute__((const, overloadable))
+    convert_short2(double2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern short3 __attribute__((const, overloadable))
+    convert_short3(double3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern short4 __attribute__((const, overloadable))
+    convert_short4(double4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern short2 __attribute__((const, overloadable))
+    convert_short2(long2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern short3 __attribute__((const, overloadable))
+    convert_short3(long3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern short4 __attribute__((const, overloadable))
+    convert_short4(long4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern short2 __attribute__((const, overloadable))
+    convert_short2(ulong2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern short3 __attribute__((const, overloadable))
+    convert_short3(ulong3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern short4 __attribute__((const, overloadable))
+    convert_short4(ulong4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ushort2 __attribute__((const, overloadable))
+    convert_ushort2(double2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ushort3 __attribute__((const, overloadable))
+    convert_ushort3(double3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ushort4 __attribute__((const, overloadable))
+    convert_ushort4(double4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ushort2 __attribute__((const, overloadable))
+    convert_ushort2(long2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ushort3 __attribute__((const, overloadable))
+    convert_ushort3(long3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ushort4 __attribute__((const, overloadable))
+    convert_ushort4(long4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ushort2 __attribute__((const, overloadable))
+    convert_ushort2(ulong2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ushort3 __attribute__((const, overloadable))
+    convert_ushort3(ulong3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ushort4 __attribute__((const, overloadable))
+    convert_ushort4(ulong4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern int2 __attribute__((const, overloadable))
+    convert_int2(double2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern int3 __attribute__((const, overloadable))
+    convert_int3(double3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern int4 __attribute__((const, overloadable))
+    convert_int4(double4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern int2 __attribute__((const, overloadable))
+    convert_int2(long2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern int3 __attribute__((const, overloadable))
+    convert_int3(long3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern int4 __attribute__((const, overloadable))
+    convert_int4(long4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern int2 __attribute__((const, overloadable))
+    convert_int2(ulong2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern int3 __attribute__((const, overloadable))
+    convert_int3(ulong3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern int4 __attribute__((const, overloadable))
+    convert_int4(ulong4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uint2 __attribute__((const, overloadable))
+    convert_uint2(double2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uint3 __attribute__((const, overloadable))
+    convert_uint3(double3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uint4 __attribute__((const, overloadable))
+    convert_uint4(double4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uint2 __attribute__((const, overloadable))
+    convert_uint2(long2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uint3 __attribute__((const, overloadable))
+    convert_uint3(long3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uint4 __attribute__((const, overloadable))
+    convert_uint4(long4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uint2 __attribute__((const, overloadable))
+    convert_uint2(ulong2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uint3 __attribute__((const, overloadable))
+    convert_uint3(ulong3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uint4 __attribute__((const, overloadable))
+    convert_uint4(ulong4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double2 __attribute__((const, overloadable))
+    convert_double2(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double3 __attribute__((const, overloadable))
+    convert_double3(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double4 __attribute__((const, overloadable))
+    convert_double4(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double2 __attribute__((const, overloadable))
+    convert_double2(char2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double3 __attribute__((const, overloadable))
+    convert_double3(char3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double4 __attribute__((const, overloadable))
+    convert_double4(char4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double2 __attribute__((const, overloadable))
+    convert_double2(uchar2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double3 __attribute__((const, overloadable))
+    convert_double3(uchar3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double4 __attribute__((const, overloadable))
+    convert_double4(uchar4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double2 __attribute__((const, overloadable))
+    convert_double2(short2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double3 __attribute__((const, overloadable))
+    convert_double3(short3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double4 __attribute__((const, overloadable))
+    convert_double4(short4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double2 __attribute__((const, overloadable))
+    convert_double2(ushort2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double3 __attribute__((const, overloadable))
+    convert_double3(ushort3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double4 __attribute__((const, overloadable))
+    convert_double4(ushort4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double2 __attribute__((const, overloadable))
+    convert_double2(int2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double3 __attribute__((const, overloadable))
+    convert_double3(int3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double4 __attribute__((const, overloadable))
+    convert_double4(int4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double2 __attribute__((const, overloadable))
+    convert_double2(uint2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double3 __attribute__((const, overloadable))
+    convert_double3(uint3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double4 __attribute__((const, overloadable))
+    convert_double4(uint4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long2 __attribute__((const, overloadable))
+    convert_long2(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long3 __attribute__((const, overloadable))
+    convert_long3(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long4 __attribute__((const, overloadable))
+    convert_long4(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long2 __attribute__((const, overloadable))
+    convert_long2(char2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long3 __attribute__((const, overloadable))
+    convert_long3(char3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long4 __attribute__((const, overloadable))
+    convert_long4(char4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long2 __attribute__((const, overloadable))
+    convert_long2(uchar2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long3 __attribute__((const, overloadable))
+    convert_long3(uchar3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long4 __attribute__((const, overloadable))
+    convert_long4(uchar4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long2 __attribute__((const, overloadable))
+    convert_long2(short2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long3 __attribute__((const, overloadable))
+    convert_long3(short3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long4 __attribute__((const, overloadable))
+    convert_long4(short4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long2 __attribute__((const, overloadable))
+    convert_long2(ushort2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long3 __attribute__((const, overloadable))
+    convert_long3(ushort3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long4 __attribute__((const, overloadable))
+    convert_long4(ushort4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long2 __attribute__((const, overloadable))
+    convert_long2(int2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long3 __attribute__((const, overloadable))
+    convert_long3(int3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long4 __attribute__((const, overloadable))
+    convert_long4(int4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long2 __attribute__((const, overloadable))
+    convert_long2(uint2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long3 __attribute__((const, overloadable))
+    convert_long3(uint3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long4 __attribute__((const, overloadable))
+    convert_long4(uint4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong2 __attribute__((const, overloadable))
+    convert_ulong2(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong3 __attribute__((const, overloadable))
+    convert_ulong3(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong4 __attribute__((const, overloadable))
+    convert_ulong4(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong2 __attribute__((const, overloadable))
+    convert_ulong2(char2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong3 __attribute__((const, overloadable))
+    convert_ulong3(char3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong4 __attribute__((const, overloadable))
+    convert_ulong4(char4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong2 __attribute__((const, overloadable))
+    convert_ulong2(uchar2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong3 __attribute__((const, overloadable))
+    convert_ulong3(uchar3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong4 __attribute__((const, overloadable))
+    convert_ulong4(uchar4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong2 __attribute__((const, overloadable))
+    convert_ulong2(short2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong3 __attribute__((const, overloadable))
+    convert_ulong3(short3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong4 __attribute__((const, overloadable))
+    convert_ulong4(short4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong2 __attribute__((const, overloadable))
+    convert_ulong2(ushort2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong3 __attribute__((const, overloadable))
+    convert_ulong3(ushort3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong4 __attribute__((const, overloadable))
+    convert_ulong4(ushort4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong2 __attribute__((const, overloadable))
+    convert_ulong2(int2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong3 __attribute__((const, overloadable))
+    convert_ulong3(int3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong4 __attribute__((const, overloadable))
+    convert_ulong4(int4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong2 __attribute__((const, overloadable))
+    convert_ulong2(uint2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong3 __attribute__((const, overloadable))
+    convert_ulong3(uint3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong4 __attribute__((const, overloadable))
+    convert_ulong4(uint4 v);
+#endif
+
+/*
+ * rsPackColorTo8888: Create a uchar4 RGBA from floats
+ *
+ * Packs three or four floating point RGBA values into a uchar4.
+ *
+ * The input values are typically between 0.0 and 1.0 inclusive.  For input values outside
+ * of this range, the resulting outputs will be clamped to be between 0 and 255.  As this
+ * clamping may be done after the input is multiplied by 255.f and converted to an integer,
+ * input numbers greater than INT_MAX/255.f or less than INT_MIN/255.f can result in
+ * undefined behavior.
+ *
+ * If the alpha component is not specified, it is assumed to be 1.0, i.e. the result will
+ * have an alpha set to 255.
+ *
+ * Parameters:
+ *   r: Red component.
+ *   g: Green component.
+ *   b: Blue component.
+ *   a: Alpha component.
+ *   color: Vector of 3 or 4 floats containing the R, G, B, and A values.
+ */
+extern uchar4 __attribute__((const, overloadable))
+    rsPackColorTo8888(float r, float g, float b);
+
+extern uchar4 __attribute__((const, overloadable))
+    rsPackColorTo8888(float r, float g, float b, float a);
+
+extern uchar4 __attribute__((const, overloadable))
+    rsPackColorTo8888(float3 color);
+
+extern uchar4 __attribute__((const, overloadable))
+    rsPackColorTo8888(float4 color);
+
+/*
+ * rsUnpackColor8888: Create a float4 RGBA from uchar4
+ *
+ * Unpacks a uchar4 color to float4.  The resulting floats will be between 0.0 and 1.0 inclusive.
+ */
+extern float4 __attribute__((const))
+    rsUnpackColor8888(uchar4 c);
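+
+/*
+ * Editorial usage sketch (not part of the generated header): packing a color
+ * and unpacking it again with the functions declared above.  The variable
+ * names are illustrative only.
+ *
+ *   float4 color = {0.2f, 0.4f, 0.8f, 1.f};
+ *   uchar4 packed = rsPackColorTo8888(color);     // each channel scaled to 0..255
+ *   float4 unpacked = rsUnpackColor8888(packed);  // components back in [0.0, 1.0]
+ */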
+
+/*
+ * rsYuvToRGBA: Convert a YUV value to RGBA
+ *
+ * Converts a color from a YUV representation to RGBA.
+ *
+ * We currently don't provide a function to do the reverse conversion.
+ *
+ * Parameters:
+ *   y: Luminance component.
+ *   u: U chrominance component.
+ *   v: V chrominance component.
+ */
+extern float4 __attribute__((const, overloadable))
+    rsYuvToRGBA_float4(uchar y, uchar u, uchar v);
+
+extern uchar4 __attribute__((const, overloadable))
+    rsYuvToRGBA_uchar4(uchar y, uchar u, uchar v);
+
+#endif // RENDERSCRIPT_RS_CONVERT_RSH
diff --git a/23.0.3/include/rs_core.rsh b/23.0.3/include/rs_core.rsh
new file mode 100644
index 0000000..128c689
--- /dev/null
+++ b/23.0.3/include/rs_core.rsh
@@ -0,0 +1,60 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Don't edit this file!  It is auto-generated by frameworks/rs/api/generate.sh.
+
+/*
+ * rs_core.rsh: Overview
+ *
+ * RenderScript is a high-performance runtime that provides compute operations at the native level.
+ * RenderScript code is compiled on devices at runtime, which also keeps it platform independent.
+ *
+ * This reference documentation describes the RenderScript runtime APIs, which you can utilize
+ * to write RenderScript code in C99. The RenderScript compute header files are automatically
+ * included for you.
+ *
+ * To use RenderScript, you need to utilize the RenderScript runtime APIs documented here as well
+ * as the Android framework APIs for RenderScript.  For documentation on the Android framework
+ * APIs, see the android.renderscript package reference.
+ *
+ * For more information on how to develop with RenderScript and how the runtime and Android
+ * framework APIs interact, see the RenderScript developer guide and the RenderScript samples.
+ */
+
+#ifndef RENDERSCRIPT_RS_CORE_RSH
+#define RENDERSCRIPT_RS_CORE_RSH
+
+#define RS_KERNEL __attribute__((kernel))
+
+#include "stdbool.h"
+
+#include "rs_value_types.rsh"
+#include "rs_object_types.rsh"
+
+#include "rs_allocation_data.rsh"
+#include "rs_atomic.rsh"
+#include "rs_convert.rsh"
+#include "rs_debug.rsh"
+#include "rs_for_each.rsh"
+#include "rs_io.rsh"
+#include "rs_math.rsh"
+#include "rs_matrix.rsh"
+#include "rs_object_info.rsh"
+#include "rs_quaternion.rsh"
+#include "rs_time.rsh"
+#include "rs_vector_math.rsh"
+
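+/*
+ * Editorial usage sketch (not part of the generated header): a minimal kernel
+ * declared with the RS_KERNEL attribute defined above.  The function name is
+ * illustrative only.
+ *
+ *   uchar4 RS_KERNEL setOpaque(uchar4 in) {
+ *     uchar4 out = in;
+ *     out.a = 255;
+ *     return out;
+ *   }
+ */
+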
+#endif // RENDERSCRIPT_RS_CORE_RSH
diff --git a/23.0.3/include/rs_debug.rsh b/23.0.3/include/rs_debug.rsh
new file mode 100644
index 0000000..f75a4cf
--- /dev/null
+++ b/23.0.3/include/rs_debug.rsh
@@ -0,0 +1,249 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Don't edit this file!  It is auto-generated by frameworks/rs/api/generate.sh.
+
+/*
+ * rs_debug.rsh: Debugging Functions
+ *
+ * The functions below are intended to be used during application development.
+ * They should not be used in shipping applications.
+ */
+
+#ifndef RENDERSCRIPT_RS_DEBUG_RSH
+#define RENDERSCRIPT_RS_DEBUG_RSH
+
+#define RS_DEBUG(a) rsDebug(#a, a)
+#define RS_DEBUG_MARKER rsDebug(__FILE__, __LINE__)
+
+/*
+ * rsDebug: Log a message and values
+ *
+ * This function prints a message to the standard log, followed by the provided values.
+ *
+ * This function is intended for debugging only and should not be used in shipping
+ * applications.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, double a);
+
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, int a);
+
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, uint a);
+
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, long a);
+
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, ulong a);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, int2 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, int3 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, int4 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, uint2 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, uint3 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, uint4 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, long2 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, long3 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, long4 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, ulong2 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, ulong3 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, ulong4 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, double2 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, double3 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, double4 a);
+#endif
+
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, float a);
+
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, float2 a);
+
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, float3 a);
+
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, float4 a);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, char a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, char2 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, char3 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, char4 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, uchar a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, uchar2 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, uchar3 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, uchar4 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, short a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, short2 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, short3 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, short4 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, ushort a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, ushort2 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, ushort3 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, ushort4 a);
+#endif
+
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, float a, float b);
+
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, float a, float b, float c);
+
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, float a, float b, float c, float d);
+
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, long long a);
+
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, unsigned long long a);
+
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, const void* a);
+
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, const rs_matrix4x4* a);
+
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, const rs_matrix3x3* a);
+
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, const rs_matrix2x2* a);
+
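+/*
+ * Editorial usage sketch (not part of the generated header): logging values
+ * during development with the declarations above.  Variable names are
+ * illustrative only.
+ *
+ *   float4 color = {0.2f, 0.4f, 0.8f, 1.f};
+ *   rsDebug("color is", color);  // message followed by the four components
+ *   RS_DEBUG(color);             // same, using the variable name as the message
+ *   RS_DEBUG_MARKER;             // logs the current file name and line number
+ */
+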
+#endif // RENDERSCRIPT_RS_DEBUG_RSH
diff --git a/23.0.3/include/rs_for_each.rsh b/23.0.3/include/rs_for_each.rsh
new file mode 100644
index 0000000..9507c0c
--- /dev/null
+++ b/23.0.3/include/rs_for_each.rsh
@@ -0,0 +1,384 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Don't edit this file!  It is auto-generated by frameworks/rs/api/generate.sh.
+
+/*
+ * rs_for_each.rsh: Kernel Invocation Functions and Types
+ *
+ * The rsForEach() function can be used to invoke the root kernel of a script.
+ *
+ * The other functions are used to get the characteristics of the invocation of
+ * an executing kernel, like dimensions and current indexes.  These functions take
+ * a rs_kernel_context as an argument.
+ */
+
+#ifndef RENDERSCRIPT_RS_FOR_EACH_RSH
+#define RENDERSCRIPT_RS_FOR_EACH_RSH
+
+/*
+ * rs_for_each_strategy_t: Suggested cell processing order
+ *
+ * This type is used to suggest how the invoked kernel should iterate over the cells of the
+ * allocations.  This is a hint only.  Implementations may not follow the suggestion.
+ *
+ * This hint can improve the caching behavior of the running kernel, e.g. the cache
+ * locality when the processing is distributed over multiple cores.
+ */
+typedef enum rs_for_each_strategy {
+    RS_FOR_EACH_STRATEGY_SERIAL = 0, // Prefer contiguous memory regions.
+    RS_FOR_EACH_STRATEGY_DONT_CARE = 1, // No preferences.
+    RS_FOR_EACH_STRATEGY_DST_LINEAR = 2, // Prefer DST.
+    RS_FOR_EACH_STRATEGY_TILE_SMALL = 3, // Prefer processing small rectangular regions.
+    RS_FOR_EACH_STRATEGY_TILE_MEDIUM = 4, // Prefer processing medium rectangular regions.
+    RS_FOR_EACH_STRATEGY_TILE_LARGE = 5 // Prefer processing large rectangular regions.
+} rs_for_each_strategy_t;
+
+/*
+ * rs_kernel_context: Handle to a kernel invocation context
+ *
+ * The kernel context contains common characteristics of the allocations being iterated
+ * over, like dimensions, and rarely used indexes, like the Array0 index or the current
+ * level of detail.
+ *
+ * A kernel may be executed in parallel over multiple threads.  Each thread will have its
+ * own context.
+ *
+ * You can access the context by adding a special parameter named "context" and of type
+ * rs_kernel_context to your kernel function.  See rsGetDimX() and rsGetArray0() for examples.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+typedef const struct rs_kernel_context_t * rs_kernel_context;
+#endif
+
+/*
+ * rs_script_call_t: Cell iteration information
+ *
+ * This structure is used to provide iteration information to a rsForEach call.
+ * It is currently used to restrict processing to a subset of cells.  In future
+ * versions, it will also be used to provide hints on how to best iterate over
+ * the cells.
+ *
+ * The Start fields are inclusive and the End fields are exclusive.  E.g. to iterate
+ * over cells 4, 5, 6, and 7 in the X dimension, set xStart to 4 and xEnd to 8.
+ */
+typedef struct rs_script_call {
+    rs_for_each_strategy_t strategy; // Currently ignored.  In the future, will be suggested cell iteration strategy.
+    uint32_t xStart; // Starting index in the X dimension.
+    uint32_t xEnd; // Ending index (exclusive) in the X dimension.
+    uint32_t yStart; // Starting index in the Y dimension.
+    uint32_t yEnd; // Ending index (exclusive) in the Y dimension.
+    uint32_t zStart; // Starting index in the Z dimension.
+    uint32_t zEnd; // Ending index (exclusive) in the Z dimension.
+    uint32_t arrayStart; // Starting index in the Array0 dimension.
+    uint32_t arrayEnd; // Ending index (exclusive) in the Array0 dimension.
+    uint32_t array1Start; // Starting index in the Array1 dimension.
+    uint32_t array1End; // Ending index (exclusive) in the Array1 dimension.
+    uint32_t array2Start; // Starting index in the Array2 dimension.
+    uint32_t array2End; // Ending index (exclusive) in the Array2 dimension.
+    uint32_t array3Start; // Starting index in the Array3 dimension.
+    uint32_t array3End; // Ending index (exclusive) in the Array3 dimension.
+} rs_script_call_t;
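+
+/*
+ * Editorial usage sketch (not part of the generated header): restricting an
+ * rsForEach launch to a sub-range of cells.  Assumes an API level where the
+ * rsForEach overload taking a rs_script_call_t* is available; gScript, gInput
+ * and gOutput are illustrative globals.
+ *
+ *   rs_script_call_t sc = {0};
+ *   sc.xStart = 4;
+ *   sc.xEnd = 8;    // processes cells 4, 5, 6 and 7 in the X dimension
+ *   sc.yStart = 0;
+ *   sc.yEnd = rsAllocationGetDimY(gOutput);
+ *   rsForEach(gScript, gInput, gOutput, NULL, 0, &sc);
+ */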
+
+/*
+ * rsForEach: Invoke the root kernel of a script
+ *
+ * Invoke the kernel named "root" of the specified script.  Like other kernels, this root()
+ * function will be invoked repeatedly over the cells of the specified allocation, filling
+ * the output allocation with the results.
+ *
+ * When rsForEach is called, the root script is launched immediately.  rsForEach returns
+ * only when the script has completed and the output allocation is ready to use.
+ *
+ * The rs_script argument is typically initialized using a global variable set from Java.
+ *
+ * The kernel can be invoked with just an input allocation or just an output allocation.
+ * This can be done by defining an rs_allocation variable and not initializing it.  E.g.
+ * rs_script gCustomScript;
+ * void specializedProcessing(rs_allocation in) {
+ *   rs_allocation ignoredOut;
+ *   rsForEach(gCustomScript, in, ignoredOut);
+ * }
+ *
+ * If both input and output allocations are specified, they must have the same dimensions.
+ *
+ * Parameters:
+ *   script: Script to call.
+ *   input: Allocation to source data from.
+ *   output: Allocation to write data into.
+ *   usrData: User defined data to pass to the script.  May be NULL.
+ *   sc: Extra control information used to select a sub-region of the allocation to be processed or suggest a walking strategy.  May be NULL.
+ *   usrDataLen: Size of the usrData structure.  This will be used to perform a shallow copy of the data if necessary.
+ */
+#if !defined(RS_VERSION) || (RS_VERSION <= 13)
+extern void __attribute__((overloadable))
+    rsForEach(rs_script script, rs_allocation input, rs_allocation output, const void* usrData,
+              const rs_script_call_t* sc);
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 13)
+extern void __attribute__((overloadable))
+    rsForEach(rs_script script, rs_allocation input, rs_allocation output, const void* usrData);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 14) && (RS_VERSION <= 20))
+extern void __attribute__((overloadable))
+    rsForEach(rs_script script, rs_allocation input, rs_allocation output, const void* usrData,
+              size_t usrDataLen, const rs_script_call_t* sc);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 14) && (RS_VERSION <= 20))
+extern void __attribute__((overloadable))
+    rsForEach(rs_script script, rs_allocation input, rs_allocation output, const void* usrData,
+              size_t usrDataLen);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+extern void __attribute__((overloadable))
+    rsForEach(rs_script script, rs_allocation input, rs_allocation output);
+#endif
+
+/*
+ * rsGetArray0: Index in the Array0 dimension for the specified context
+ *
+ * Returns the index in the Array0 dimension of the cell being processed, as specified
+ * by the supplied context.
+ *
+ * This context is created when a kernel is launched and updated at each iteration.
+ * It contains common characteristics of the allocations being iterated over and rarely
+ * used indexes, like the Array0 index.
+ *
+ * You can access the context by adding a special parameter named "context" and of
+ * type rs_kernel_context to your kernel function.  E.g.
+ * short RS_KERNEL myKernel(short value, uint32_t x, rs_kernel_context context) {
+ *   // The current indexes in the common x, y, z, and w dimensions are accessed by
+ *   // adding these variables as arguments.  For the more rarely used indexes
+ *   // to the other dimensions, extract them from the context:
+ *   uint32_t index_a0 = rsGetArray0(context);
+ *   //...
+ * }
+ *
+ * This function returns 0 if the Array0 dimension is not present.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern uint32_t __attribute__((overloadable))
+    rsGetArray0(rs_kernel_context context);
+#endif
+
+/*
+ * rsGetArray1: Index in the Array1 dimension for the specified context
+ *
+ * Returns the index in the Array1 dimension of the cell being processed, as specified
+ * by the supplied context.  See rsGetArray0() for an explanation of the context.
+ *
+ * Returns 0 if the Array1 dimension is not present.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern uint32_t __attribute__((overloadable))
+    rsGetArray1(rs_kernel_context context);
+#endif
+
+/*
+ * rsGetArray2: Index in the Array2 dimension for the specified context
+ *
+ * Returns the index in the Array2 dimension of the cell being processed,
+ * as specified by the supplied context.  See rsGetArray0() for an explanation
+ * of the context.
+ *
+ * Returns 0 if the Array2 dimension is not present.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern uint32_t __attribute__((overloadable))
+    rsGetArray2(rs_kernel_context context);
+#endif
+
+/*
+ * rsGetArray3: Index in the Array3 dimension for the specified context
+ *
+ * Returns the index in the Array3 dimension of the cell being processed, as specified
+ * by the supplied context.  See rsGetArray0() for an explanation of the context.
+ *
+ * Returns 0 if the Array3 dimension is not present.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern uint32_t __attribute__((overloadable))
+    rsGetArray3(rs_kernel_context context);
+#endif
+
+/*
+ * rsGetDimArray0: Size of the Array0 dimension for the specified context
+ *
+ * Returns the size of the Array0 dimension for the specified context.
+ * See rsGetDimX() for an explanation of the context.
+ *
+ * Returns 0 if the Array0 dimension is not present.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern uint32_t __attribute__((overloadable))
+    rsGetDimArray0(rs_kernel_context context);
+#endif
+
+/*
+ * rsGetDimArray1: Size of the Array1 dimension for the specified context
+ *
+ * Returns the size of the Array1 dimension for the specified context.
+ * See rsGetDimX() for an explanation of the context.
+ *
+ * Returns 0 if the Array1 dimension is not present.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern uint32_t __attribute__((overloadable))
+    rsGetDimArray1(rs_kernel_context context);
+#endif
+
+/*
+ * rsGetDimArray2: Size of the Array2 dimension for the specified context
+ *
+ * Returns the size of the Array2 dimension for the specified context.
+ * See rsGetDimX() for an explanation of the context.
+ *
+ * Returns 0 if the Array2 dimension is not present.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern uint32_t __attribute__((overloadable))
+    rsGetDimArray2(rs_kernel_context context);
+#endif
+
+/*
+ * rsGetDimArray3: Size of the Array3 dimension for the specified context
+ *
+ * Returns the size of the Array3 dimension for the specified context.
+ * See rsGetDimX() for an explanation of the context.
+ *
+ * Returns 0 if the Array3 dimension is not present.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern uint32_t __attribute__((overloadable))
+    rsGetDimArray3(rs_kernel_context context);
+#endif
+
+/*
+ * rsGetDimHasFaces: Presence of more than one face for the specified context
+ *
+ * If the context refers to a cubemap, this function returns true if there's more than
+ * one face present.  In all other cases, it returns false.  See rsGetDimX() for an
+ * explanation of the context.
+ *
+ * rsAllocationGetDimFaces() is similar but returns 0 or 1 instead of a bool.
+ *
+ * Returns: Returns true if more than one face is present, false otherwise.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern bool __attribute__((overloadable))
+    rsGetDimHasFaces(rs_kernel_context context);
+#endif
+
+/*
+ * rsGetDimLod: Number of levels of detail for the specified context
+ *
+ * Returns the number of levels of detail for the specified context.  This is useful
+ * for mipmaps.  See rsGetDimX() for an explanation of the context.
+ *
+ * Returns 0 if Level of Detail is not used.
+ *
+ * rsAllocationGetDimLOD() is similar but returns 0 or 1 instead of the actual
+ * number of levels.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern uint32_t __attribute__((overloadable))
+    rsGetDimLod(rs_kernel_context context);
+#endif
+
+/*
+ * rsGetDimX: Size of the X dimension for the specified context
+ *
+ * Returns the size of the X dimension for the specified context.
+ *
+ * This context is created when a kernel is launched.  It contains common
+ * characteristics of the allocations being iterated over by the kernel in
+ * a very efficient structure.  It also contains rarely used indexes.
+ *
+ * You can access it by adding a special parameter named "context" and of
+ * type rs_kernel_context to your kernel function.  E.g.
+ * int4 RS_KERNEL myKernel(int4 value, rs_kernel_context context) {
+ *   uint32_t size = rsGetDimX(context); //...
+ *
+ * To get the dimension of a specific allocation, use rsAllocationGetDimX().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern uint32_t __attribute__((overloadable))
+    rsGetDimX(rs_kernel_context context);
+#endif
+
+/*
+ * rsGetDimY: Size of the Y dimension for the specified context
+ *
+ * Returns the size of the Y dimension for the specified context.
+ * See rsGetDimX() for an explanation of the context.
+ *
+ * Returns 0 if the Y dimension is not present.
+ *
+ * To get the dimension of a specific allocation, use rsAllocationGetDimY().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern uint32_t __attribute__((overloadable))
+    rsGetDimY(rs_kernel_context context);
+#endif
+
+/*
+ * rsGetDimZ: Size of the Z dimension for the specified context
+ *
+ * Returns the size of the Z dimension for the specified context.
+ * See rsGetDimX() for an explanation of the context.
+ *
+ * Returns 0 if the Z dimension is not present.
+ *
+ * To get the dimension of a specific allocation, use rsAllocationGetDimZ().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern uint32_t __attribute__((overloadable))
+    rsGetDimZ(rs_kernel_context context);
+#endif
+
+/*
+ * rsGetFace: Coordinate of the Face for the specified context
+ *
+ * Returns the face on which the cell being processed is found, as specified by the
+ * supplied context.  See rsGetArray0() for an explanation of the context.
+ *
+ * Returns RS_ALLOCATION_CUBEMAP_FACE_POSITIVE_X if the face dimension is not
+ * present.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern rs_allocation_cubemap_face __attribute__((overloadable))
+    rsGetFace(rs_kernel_context context);
+#endif
+
+/*
+ * rsGetLod: Index in the Levels of Detail dimension for the specified context
+ *
+ * Returns the index in the Levels of Detail dimension of the cell being processed,
+ * as specified by the supplied context.  See rsGetArray0() for an explanation of
+ * the context.
+ *
+ * Returns 0 if the Levels of Detail dimension is not present.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern uint32_t __attribute__((overloadable))
+    rsGetLod(rs_kernel_context context);
+#endif
+
+#endif // RENDERSCRIPT_RS_FOR_EACH_RSH
diff --git a/23.0.3/include/rs_graphics.rsh b/23.0.3/include/rs_graphics.rsh
new file mode 100644
index 0000000..2a78018
--- /dev/null
+++ b/23.0.3/include/rs_graphics.rsh
@@ -0,0 +1,1522 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Don't edit this file!  It is auto-generated by frameworks/rs/api/generate.sh.
+
+/*
+ * rs_graphics.rsh: Graphics Functions and Types
+ *
+ * The graphics subsystem of RenderScript was removed at API level 23.
+ */
+
+#ifndef RENDERSCRIPT_RS_GRAPHICS_RSH
+#define RENDERSCRIPT_RS_GRAPHICS_RSH
+
+#ifdef __LP64__
+// TODO We need to fix some of the builds before enabling this error:
+// #error "RenderScript graphics is deprecated and not supported in 64bit mode."
+#endif
+
+// TODO we seem to assume order for the other headers too.
+#include "rs_object_types.rsh"
+
+/*
+ * rs_blend_src_func: Blend source function
+ *
+ * DEPRECATED.  Do not use.
+ *
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 16) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+typedef enum __attribute__((
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+deprecated
+#endif
+)) {
+    RS_BLEND_SRC_ZERO = 0,
+    RS_BLEND_SRC_ONE = 1,
+    RS_BLEND_SRC_DST_COLOR = 2,
+    RS_BLEND_SRC_ONE_MINUS_DST_COLOR = 3,
+    RS_BLEND_SRC_SRC_ALPHA = 4,
+    RS_BLEND_SRC_ONE_MINUS_SRC_ALPHA = 5,
+    RS_BLEND_SRC_DST_ALPHA = 6,
+    RS_BLEND_SRC_ONE_MINUS_DST_ALPHA = 7,
+    RS_BLEND_SRC_SRC_ALPHA_SATURATE = 8,
+    RS_BLEND_SRC_INVALID = 100
+} rs_blend_src_func;
+#endif
+#endif
+
+/*
+ * rs_blend_dst_func: Blend destination function
+ *
+ * DEPRECATED.  Do not use.
+ *
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 16) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+typedef enum __attribute__((
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+deprecated
+#endif
+)) {
+    RS_BLEND_DST_ZERO = 0,
+    RS_BLEND_DST_ONE = 1,
+    RS_BLEND_DST_SRC_COLOR = 2,
+    RS_BLEND_DST_ONE_MINUS_SRC_COLOR = 3,
+    RS_BLEND_DST_SRC_ALPHA = 4,
+    RS_BLEND_DST_ONE_MINUS_SRC_ALPHA = 5,
+    RS_BLEND_DST_DST_ALPHA = 6,
+    RS_BLEND_DST_ONE_MINUS_DST_ALPHA = 7,
+    RS_BLEND_DST_INVALID = 100
+} rs_blend_dst_func;
+#endif
+#endif
+
+/*
+ * rs_cull_mode: Culling mode
+ *
+ * DEPRECATED.  Do not use.
+ *
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 16) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+typedef enum __attribute__((
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+deprecated
+#endif
+)) {
+    RS_CULL_BACK = 0,
+    RS_CULL_FRONT = 1,
+    RS_CULL_NONE = 2,
+    RS_CULL_INVALID = 100
+} rs_cull_mode;
+#endif
+#endif
+
+/*
+ * rs_depth_func: Depth function
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Specifies conditional drawing depending on the comparison of the incoming
+ * depth to that found in the depth buffer.
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 16) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+typedef enum __attribute__((
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+deprecated
+#endif
+)) {
+    RS_DEPTH_FUNC_ALWAYS = 0, // Always drawn
+    RS_DEPTH_FUNC_LESS = 1, // Drawn if the incoming depth value is less than that in the depth buffer
+    RS_DEPTH_FUNC_LEQUAL = 2, // Drawn if the incoming depth value is less or equal to that in the depth buffer
+    RS_DEPTH_FUNC_GREATER = 3, // Drawn if the incoming depth value is greater than that in the depth buffer
+    RS_DEPTH_FUNC_GEQUAL = 4, // Drawn if the incoming depth value is greater or equal to that in the depth buffer
+    RS_DEPTH_FUNC_EQUAL = 5, // Drawn if the incoming depth value is equal to that in the depth buffer
+    RS_DEPTH_FUNC_NOTEQUAL = 6, // Drawn if the incoming depth value is not equal to that in the depth buffer
+    RS_DEPTH_FUNC_INVALID = 100 // Invalid depth function
+} rs_depth_func;
+#endif
+#endif
+
+/*
+ * rs_primitive: How to interpret mesh vertex data
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Describes the way mesh vertex data is interpreted when rendering.
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 16) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+typedef enum __attribute__((
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+deprecated
+#endif
+)) {
+    RS_PRIMITIVE_POINT = 0, // Vertex data will be rendered as a series of points
+    RS_PRIMITIVE_LINE = 1, // Vertex pairs will be rendered as lines
+    RS_PRIMITIVE_LINE_STRIP = 2, // Vertex data will be rendered as a connected line strip
+    RS_PRIMITIVE_TRIANGLE = 3, // Vertices will be rendered as individual triangles
+    RS_PRIMITIVE_TRIANGLE_STRIP = 4, // Vertices will be rendered as a connected triangle strip defined by the first three vertices with each additional triangle defined by a new vertex
+    RS_PRIMITIVE_TRIANGLE_FAN = 5, // Vertices will be rendered as a sequence of triangles that all share the first vertex as the origin
+    RS_PRIMITIVE_INVALID = 100 // Invalid primitive
+} rs_primitive;
+#endif
+#endif
+
+/*
+ * rs_font: Handle to a Font
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Opaque handle to a RenderScript font object.
+ * See: android.renderscript.Font
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+typedef _RS_HANDLE __attribute__((
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+deprecated
+#endif
+)) rs_font;
+#endif
+#endif
+
+/*
+ * rs_mesh: Handle to a Mesh
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Opaque handle to a RenderScript mesh object.
+ * See: android.renderscript.Mesh
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+typedef _RS_HANDLE __attribute__((
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+deprecated
+#endif
+)) rs_mesh;
+#endif
+#endif
+
+/*
+ * rs_program_fragment: Handle to a ProgramFragment
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Opaque handle to a RenderScript ProgramFragment object.
+ * See: android.renderscript.ProgramFragment
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+typedef _RS_HANDLE __attribute__((
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+deprecated
+#endif
+)) rs_program_fragment;
+#endif
+#endif
+
+/*
+ * rs_program_vertex: Handle to a ProgramVertex
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Opaque handle to a RenderScript ProgramVertex object.
+ * See: android.renderscript.ProgramVertex
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+typedef _RS_HANDLE __attribute__((
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+deprecated
+#endif
+)) rs_program_vertex;
+#endif
+#endif
+
+/*
+ * rs_program_raster: Handle to a ProgramRaster
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Opaque handle to a RenderScript ProgramRaster object.
+ * See: android.renderscript.ProgramRaster
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+typedef _RS_HANDLE __attribute__((
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+deprecated
+#endif
+)) rs_program_raster;
+#endif
+#endif
+
+/*
+ * rs_program_store: Handle to a ProgramStore
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Opaque handle to a RenderScript ProgramStore object.
+ * See: android.renderscript.ProgramStore
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+typedef _RS_HANDLE __attribute__((
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+deprecated
+#endif
+)) rs_program_store;
+#endif
+#endif
+
+/*
+ * rsClearObject: Release an object
+ *
+ * Tells the runtime that this handle will no longer be used to access the related
+ * object.  If this was the last handle to that object, resource recovery may happen.
+ *
+ * After calling this function, *dst will be set to an empty handle.  See rsIsObject().
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (RS_VERSION <= 22)
+extern void __attribute__((overloadable))
+    rsClearObject(rs_mesh* dst);
+#endif
+#endif
+
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (RS_VERSION <= 22)
+extern void __attribute__((overloadable))
+    rsClearObject(rs_program_fragment* dst);
+#endif
+#endif
+
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (RS_VERSION <= 22)
+extern void __attribute__((overloadable))
+    rsClearObject(rs_program_vertex* dst);
+#endif
+#endif
+
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (RS_VERSION <= 22)
+extern void __attribute__((overloadable))
+    rsClearObject(rs_program_raster* dst);
+#endif
+#endif
+
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (RS_VERSION <= 22)
+extern void __attribute__((overloadable))
+    rsClearObject(rs_program_store* dst);
+#endif
+#endif
+
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (RS_VERSION <= 22)
+extern void __attribute__((overloadable))
+    rsClearObject(rs_font* dst);
+#endif
+#endif
+
+/*
+ * rsIsObject: Check for an empty handle
+ *
+ * Returns true if the handle contains a non-null reference.
+ *
+ * This function does not validate that the internal pointer used in the handle
+ * points to an actual valid object; it only checks for null.
+ *
+ * This function can be used to check the Element returned by rsElementGetSubElement()
+ * or see if rsClearObject() has been called on a handle.
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (RS_VERSION <= 22)
+extern bool __attribute__((overloadable))
+    rsIsObject(rs_mesh v);
+#endif
+#endif
+
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (RS_VERSION <= 22)
+extern bool __attribute__((overloadable))
+    rsIsObject(rs_program_fragment v);
+#endif
+#endif
+
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (RS_VERSION <= 22)
+extern bool __attribute__((overloadable))
+    rsIsObject(rs_program_vertex v);
+#endif
+#endif
+
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (RS_VERSION <= 22)
+extern bool __attribute__((overloadable))
+    rsIsObject(rs_program_raster v);
+#endif
+#endif
+
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (RS_VERSION <= 22)
+extern bool __attribute__((overloadable))
+    rsIsObject(rs_program_store v);
+#endif
+#endif
+
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (RS_VERSION <= 22)
+extern bool __attribute__((overloadable))
+    rsIsObject(rs_font v);
+#endif
+#endif
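+
+/*
+ * Editorial usage sketch (not part of the generated header): checking whether
+ * a deprecated graphics handle has been set before using it.  gFont is an
+ * illustrative global.
+ *
+ *   rs_font gFont;  // set from Java before the script runs
+ *
+ *   if (!rsIsObject(gFont)) {
+ *     return;  // handle is still empty, or was released with rsClearObject()
+ *   }
+ */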
+
+/*
+ * rsSetObject: For internal use.
+ *
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (RS_VERSION <= 22)
+extern void __attribute__((overloadable))
+    rsSetObject(rs_mesh* dst, rs_mesh src);
+#endif
+#endif
+
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (RS_VERSION <= 22)
+extern void __attribute__((overloadable))
+    rsSetObject(rs_program_fragment* dst, rs_program_fragment src);
+#endif
+#endif
+
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (RS_VERSION <= 22)
+extern void __attribute__((overloadable))
+    rsSetObject(rs_program_vertex* dst, rs_program_vertex src);
+#endif
+#endif
+
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (RS_VERSION <= 22)
+extern void __attribute__((overloadable))
+    rsSetObject(rs_program_raster* dst, rs_program_raster src);
+#endif
+#endif
+
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (RS_VERSION <= 22)
+extern void __attribute__((overloadable))
+    rsSetObject(rs_program_store* dst, rs_program_store src);
+#endif
+#endif
+
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (RS_VERSION <= 22)
+extern void __attribute__((overloadable))
+    rsSetObject(rs_font* dst, rs_font src);
+#endif
+#endif
+
+/*
+ * rsgAllocationSyncAll: Sync the contents of an allocation
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Sync the contents of an allocation.
+ *
+ * If the source is specified, sync from the memory space specified by source.
+ *
+ * If the source is not specified, sync from its SCRIPT memory space to its HW
+ * memory spaces.
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgAllocationSyncAll(rs_allocation alloc);
+#endif
+#endif
+
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 14) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgAllocationSyncAll(rs_allocation alloc, rs_allocation_usage_type source);
+#endif
+#endif
+
+/*
+ * rsgBindColorTarget: Set the color target
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Set the color target used for all subsequent rendering calls
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 14) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgBindColorTarget(rs_allocation colorTarget, uint slot);
+#endif
+#endif
+
+/*
+ * rsgBindConstant: Bind a constant allocation
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Bind a new Allocation object to a ProgramFragment or ProgramVertex.
+ * The Allocation must be a valid constant input for the Program.
+ *
+ * Parameters:
+ *   ps: program fragment object
+ *   slot: index of the constant buffer on the program
+ *   c: constants to bind
+ *   pv: program vertex object
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgBindConstant(rs_program_fragment ps, uint slot, rs_allocation c);
+#endif
+#endif
+
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgBindConstant(rs_program_vertex pv, uint slot, rs_allocation c);
+#endif
+#endif
+
+/*
+ * rsgBindDepthTarget: Set the depth target
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Set the depth target used for all subsequent rendering calls
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 14) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgBindDepthTarget(rs_allocation depthTarget);
+#endif
+#endif
+
+/*
+ * rsgBindFont: Bind a font object
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Binds the font object to be used for all subsequent font rendering calls
+ *
+ * Parameters:
+ *   font: object to bind
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgBindFont(rs_font font);
+#endif
+#endif
+
+/*
+ * rsgBindProgramFragment: Bind a ProgramFragment
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Bind a new ProgramFragment to the rendering context.
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgBindProgramFragment(rs_program_fragment pf);
+#endif
+#endif
+
+/*
+ * rsgBindProgramRaster: Bind a ProgramRaster
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Bind a new ProgramRaster to the rendering context.
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgBindProgramRaster(rs_program_raster pr);
+#endif
+#endif
+
+/*
+ * rsgBindProgramStore: Bind a ProgramStore
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Bind a new ProgramStore to the rendering context.
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgBindProgramStore(rs_program_store ps);
+#endif
+#endif
+
+/*
+ * rsgBindProgramVertex: Bind a ProgramVertex
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Bind a new ProgramVertex to the rendering context.
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgBindProgramVertex(rs_program_vertex pv);
+#endif
+#endif
+
+/*
+ * rsgBindSampler: Bind a sampler
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Bind a new Sampler object to a ProgramFragment.  The sampler will
+ * operate on the texture bound at the matching slot.
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgBindSampler(rs_program_fragment fragment, uint slot, rs_sampler sampler);
+#endif
+#endif
+
+/*
+ * rsgBindTexture: Bind a texture allocation
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Bind a new Allocation object to a ProgramFragment.  The
+ * Allocation must be a valid texture for the Program.  The sampling
+ * of the texture will be controlled by the Sampler bound at the
+ * matching slot.
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgBindTexture(rs_program_fragment v, uint slot, rs_allocation alloc);
+#endif
+#endif
+
+/*
+ * rsgClearAllRenderTargets: Clear all color and depth targets
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Clear all color and depth targets and resume rendering into
+ * the framebuffer
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 14) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgClearAllRenderTargets(void);
+#endif
+#endif
+
+/*
+ * rsgClearColor: Clear the specified color from the surface
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Clears the rendering surface to the specified color.
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgClearColor(float r, float g, float b, float a);
+#endif
+#endif
+
+/*
+ * rsgClearColorTarget: Clear the color target
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Clear the previously set color target
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 14) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgClearColorTarget(uint slot);
+#endif
+#endif
+
+/*
+ * rsgClearDepth: Clear the depth surface
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Clears the depth surface to the specified value.
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgClearDepth(float value);
+#endif
+#endif
+
+/*
+ * rsgClearDepthTarget: Clear the depth target
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Clear the previously set depth target
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 14) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgClearDepthTarget(void);
+#endif
+#endif
+
+/*
+ * rsgDrawMesh: Draw a mesh
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Draw a mesh using the current context state.
+ *
+ * If primitiveIndex is specified, draw part of a mesh using the current context state.
+ *
+ * If start and len are also specified, draw specified index range of part of a mesh using the current context state.
+ *
+ * Otherwise the whole mesh is rendered.
+ *
+ * Parameters:
+ *   ism: mesh object to render
+ *   primitiveIndex: for meshes that contain multiple primitive groups this parameter specifies the index of the group to draw.
+ *   start: starting index in the range
+ *   len: number of indices to draw
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgDrawMesh(rs_mesh ism);
+#endif
+#endif
+
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgDrawMesh(rs_mesh ism, uint primitiveIndex);
+#endif
+#endif
+
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgDrawMesh(rs_mesh ism, uint primitiveIndex, uint start, uint len);
+#endif
+#endif
+
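+/*
+ * Illustrative sketch (gMesh is a hypothetical rs_mesh script global bound
+ * from the host application):
+ *
+ *   rs_mesh gMesh;
+ *
+ *   // Draw the whole mesh, then redraw only primitive group 0.
+ *   rsgDrawMesh(gMesh);
+ *   rsgDrawMesh(gMesh, 0);
+ */
+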
+/*
+ * rsgDrawQuad: Draw a quad
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Low performance utility function for drawing a simple quad.  Not intended for
+ * drawing large quantities of geometry.
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgDrawQuad(float x1, float y1, float z1, float x2, float y2, float z2, float x3, float y3,
+                float z3, float x4, float y4, float z4);
+#endif
+#endif
+
+/*
+ * rsgDrawQuadTexCoords: Draw a textured quad
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Low performance utility function for drawing a textured quad.  Not intended
+ * for drawing large quantities of geometry.
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgDrawQuadTexCoords(float x1, float y1, float z1, float u1, float v1, float x2, float y2,
+                         float z2, float u2, float v2, float x3, float y3, float z3, float u3,
+                         float v3, float x4, float y4, float z4, float u4, float v4);
+#endif
+#endif
+
+/*
+ * rsgDrawRect: Draw a rectangle
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Low performance utility function for drawing a simple rectangle.  Not
+ * intended for drawing large quantities of geometry.
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgDrawRect(float x1, float y1, float x2, float y2, float z);
+#endif
+#endif
+
+/*
+ * rsgDrawSpriteScreenspace: Draw rectangles in screenspace
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Low performance function for drawing rectangles in screenspace.  This
+ * function uses the default passthrough ProgramVertex.  Any bound ProgramVertex
+ * is ignored.  This function has considerable overhead and should not be used
+ * for drawing in shipping applications.
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgDrawSpriteScreenspace(float x, float y, float z, float w, float h);
+#endif
+#endif
+
+/*
+ * rsgDrawText: Draw a text string
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Draws a text string at the specified location.
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgDrawText(const char* text, int x, int y);
+#endif
+#endif
+
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgDrawText(rs_allocation alloc, int x, int y);
+#endif
+#endif
+
+/*
+ * rsgFinish: End rendering commands
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Force RenderScript to finish all rendering commands
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 14) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern uint __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgFinish(void);
+#endif
+#endif
+
+/*
+ * rsgFontColor: Set the font color
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Sets the font color for all subsequent rendering calls
+ *
+ * Parameters:
+ *   r: red component
+ *   g: green component
+ *   b: blue component
+ *   a: alpha component
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgFontColor(float r, float g, float b, float a);
+#endif
+#endif
+
+/*
+ * rsgGetHeight: Get the surface height
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Get the height of the current rendering surface.
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern uint __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgGetHeight(void);
+#endif
+#endif
+
+/*
+ * rsgGetWidth: Get the surface width
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Get the width of the current rendering surface.
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern uint __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgGetWidth(void);
+#endif
+#endif
+
+/*
+ * rsgMeasureText: Get the bounding box for a text string
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Returns the bounding box of the text relative to (0, 0).
+ * Any of left, right, top, or bottom may be NULL.
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgMeasureText(const char* text, int* left, int* right, int* top, int* bottom);
+#endif
+#endif
+
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgMeasureText(rs_allocation alloc, int* left, int* right, int* top, int* bottom);
+#endif
+#endif
+
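+/*
+ * Illustrative sketch (assumes this runs inside a graphics root() callback):
+ * measure a string first, then draw it roughly centered on the surface.
+ *
+ *   int left = 0, right = 0, top = 0, bottom = 0;
+ *   rsgMeasureText("Hello", &left, &right, &top, &bottom);
+ *   int x = ((int)rsgGetWidth() - (right - left)) / 2;
+ *   int y = ((int)rsgGetHeight() - (top - bottom)) / 2;
+ *   rsgDrawText("Hello", x, y);
+ */
+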
+/*
+ * rsgMeshComputeBoundingBox: Compute a bounding box
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Computes an axis aligned bounding box of a mesh object
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgMeshComputeBoundingBox(rs_mesh mesh, float* minX, float* minY, float* minZ, float* maxX,
+                              float* maxY, float* maxZ);
+#endif
+#endif
+
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+static inline void __attribute__((always_inline, overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgMeshComputeBoundingBox(rs_mesh mesh, float3* bBoxMin, float3* bBoxMax) {
+    float x1, y1, z1, x2, y2, z2;
+    rsgMeshComputeBoundingBox(mesh, &x1, &y1, &z1, &x2, &y2, &z2);
+    bBoxMin->x = x1;
+    bBoxMin->y = y1;
+    bBoxMin->z = z1;
+    bBoxMax->x = x2;
+    bBoxMax->y = y2;
+    bBoxMax->z = z2;
+}
+#endif
+#endif
+
+/*
+ * rsgMeshGetIndexAllocation: Return an allocation containing index data
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Returns an allocation containing index data or a null
+ * allocation if only the primitive is specified
+ *
+ * Parameters:
+ *   m: mesh to get data from
+ *   index: index of the index allocation
+ *
+ * Returns: allocation containing index data
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 16) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern rs_allocation __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgMeshGetIndexAllocation(rs_mesh m, uint32_t index);
+#endif
+#endif
+
+/*
+ * rsgMeshGetPrimitive: Return the primitive
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Returns the primitive describing how a part of the mesh is
+ * rendered
+ *
+ * Parameters:
+ *   m: mesh to get data from
+ *   index: index of the primitive
+ *
+ * Returns: primitive describing how the mesh is rendered
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 16) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern rs_primitive __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgMeshGetPrimitive(rs_mesh m, uint32_t index);
+#endif
+#endif
+
+/*
+ * rsgMeshGetPrimitiveCount: Return the number of index sets
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Meshes can have multiple index sets; this function returns
+ * how many the mesh contains.
+ *
+ * Parameters:
+ *   m: mesh to get data from
+ *
+ * Returns: number of primitive groups in the mesh. This would include simple primitives as well as allocations containing index data
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 16) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern uint32_t __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgMeshGetPrimitiveCount(rs_mesh m);
+#endif
+#endif
+
+/*
+ * rsgMeshGetVertexAllocation: Return a vertex allocation
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Returns an allocation that is part of the mesh and contains
+ * vertex data, e.g. positions, normals, texcoords
+ *
+ * Parameters:
+ *   m: mesh to get data from
+ *   index: index of the vertex allocation
+ *
+ * Returns: allocation containing vertex data
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 16) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern rs_allocation __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgMeshGetVertexAllocation(rs_mesh m, uint32_t index);
+#endif
+#endif
+
+/*
+ * rsgMeshGetVertexAllocationCount: Return the number of vertex allocations
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Returns the number of allocations in the mesh that contain
+ * vertex data
+ *
+ * Parameters:
+ *   m: mesh to get data from
+ *
+ * Returns: number of allocations in the mesh that contain vertex data
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 16) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern uint32_t __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgMeshGetVertexAllocationCount(rs_mesh m);
+#endif
+#endif
+
+/*
+ * rsgProgramFragmentConstantColor: Set the constant color for a fixed function emulation program
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Set the constant color for a fixed function emulation program.
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgProgramFragmentConstantColor(rs_program_fragment pf, float r, float g, float b, float a);
+#endif
+#endif
+
+/*
+ * rsgProgramVertexGetProjectionMatrix: Get the projection matrix for a fixed function vertex program
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Get the projection matrix for a currently bound fixed function
+ * vertex program. Calling this function with a custom vertex shader
+ * would result in an error.
+ *
+ * Parameters:
+ *   proj: matrix to store the current projection matrix into
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgProgramVertexGetProjectionMatrix(rs_matrix4x4* proj);
+#endif
+#endif
+
+/*
+ * rsgProgramVertexLoadModelMatrix: Load the model matrix for a bound fixed function vertex program
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Load the model matrix for a currently bound fixed function
+ * vertex program. Calling this function with a custom vertex shader
+ * would result in an error.
+ *
+ * Parameters:
+ *   model: model matrix
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgProgramVertexLoadModelMatrix(const rs_matrix4x4* model);
+#endif
+#endif
+
+/*
+ * rsgProgramVertexLoadProjectionMatrix: Load the projection matrix for a bound fixed function vertex program
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Load the projection matrix for a currently bound fixed function
+ * vertex program. Calling this function with a custom vertex shader
+ * would result in an error.
+ *
+ * Parameters:
+ *   proj: projection matrix
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgProgramVertexLoadProjectionMatrix(const rs_matrix4x4* proj);
+#endif
+#endif
+
+/*
+ * rsgProgramVertexLoadTextureMatrix: Load the texture matrix for a bound fixed function vertex program
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Load the texture matrix for a currently bound fixed function
+ * vertex program. Calling this function with a custom vertex shader
+ * would result in an error.
+ *
+ * Parameters:
+ *   tex: texture matrix
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgProgramVertexLoadTextureMatrix(const rs_matrix4x4* tex);
+#endif
+#endif
+
+/*
+ * rsgProgramRasterGetCullMode: Get program raster cull mode
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Get program raster cull mode
+ *
+ * Parameters:
+ *   pr: program raster to query
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 16) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern rs_cull_mode __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgProgramRasterGetCullMode(rs_program_raster pr);
+#endif
+#endif
+
+/*
+ * rsgProgramRasterIsPointSpriteEnabled: Get program raster point sprite state
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Get program raster point sprite state
+ *
+ * Parameters:
+ *   pr: program raster to query
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 16) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern bool __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgProgramRasterIsPointSpriteEnabled(rs_program_raster pr);
+#endif
+#endif
+
+/*
+ * rsgProgramStoreGetBlendDstFunc: Get program store blend destination function
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Get program store blend destination function
+ *
+ * Parameters:
+ *   ps: program store to query
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 16) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern rs_blend_dst_func __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgProgramStoreGetBlendDstFunc(rs_program_store ps);
+#endif
+#endif
+
+/*
+ * rsgProgramStoreGetBlendSrcFunc: Get program store blend source function
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Get program store blend source function
+ *
+ * Parameters:
+ *   ps: program store to query
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 16) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern rs_blend_src_func __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgProgramStoreGetBlendSrcFunc(rs_program_store ps);
+#endif
+#endif
+
+/*
+ * rsgProgramStoreGetDepthFunc: Get program store depth function
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Get program store depth function
+ *
+ * Parameters:
+ *   ps: program store to query
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 16) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern rs_depth_func __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgProgramStoreGetDepthFunc(rs_program_store ps);
+#endif
+#endif
+
+/*
+ * rsgProgramStoreIsColorMaskAlphaEnabled: Get program store alpha component color mask
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Get program store alpha component color mask
+ *
+ * Parameters:
+ *   ps: program store to query
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 16) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern bool __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgProgramStoreIsColorMaskAlphaEnabled(rs_program_store ps);
+#endif
+#endif
+
+/*
+ * rsgProgramStoreIsColorMaskBlueEnabled: Get program store blue component color mask
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Get program store blue component color mask
+ *
+ * Parameters:
+ *   ps: program store to query
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 16) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern bool __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgProgramStoreIsColorMaskBlueEnabled(rs_program_store ps);
+#endif
+#endif
+
+/*
+ * rsgProgramStoreIsColorMaskGreenEnabled: Get program store green component color mask
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Get program store green component color mask
+ *
+ * Parameters:
+ *   ps: program store to query
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 16) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern bool __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgProgramStoreIsColorMaskGreenEnabled(rs_program_store ps);
+#endif
+#endif
+
+/*
+ * rsgProgramStoreIsColorMaskRedEnabled: Get program store red component color mask
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Get program store red component color mask
+ *
+ * Parameters:
+ *   ps: program store to query
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 16) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern bool __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgProgramStoreIsColorMaskRedEnabled(rs_program_store ps);
+#endif
+#endif
+
+/*
+ * rsgProgramStoreIsDepthMaskEnabled: Get program store depth mask
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Get program store depth mask
+ *
+ * Parameters:
+ *   ps: program store to query
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 16) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern bool __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgProgramStoreIsDepthMaskEnabled(rs_program_store ps);
+#endif
+#endif
+
+/*
+ * rsgProgramStoreIsDitherEnabled: Get program store dither state
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Get program store dither state
+ *
+ * Parameters:
+ *   ps: program store to query
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 16) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern bool __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgProgramStoreIsDitherEnabled(rs_program_store ps);
+#endif
+#endif
+
+#endif // RENDERSCRIPT_RS_GRAPHICS_RSH
diff --git a/23.0.3/include/rs_io.rsh b/23.0.3/include/rs_io.rsh
new file mode 100644
index 0000000..d14af11
--- /dev/null
+++ b/23.0.3/include/rs_io.rsh
@@ -0,0 +1,99 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Don't edit this file!  It is auto-generated by frameworks/rs/api/generate.sh.
+
+/*
+ * rs_io.rsh: Input/Output Functions
+ *
+ * These functions are used to:
+ * - Send information to the Java client, and
+ * - Send the processed allocation or receive the next allocation to process.
+ */
+
+#ifndef RENDERSCRIPT_RS_IO_RSH
+#define RENDERSCRIPT_RS_IO_RSH
+
+/*
+ * rsAllocationIoReceive: Receive new content from the queue
+ *
+ * Receive a new set of contents from the queue.
+ *
+ * Parameters:
+ *   a: Allocation to work on.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+extern void __attribute__((overloadable))
+    rsAllocationIoReceive(rs_allocation a);
+#endif
+
+/*
+ * rsAllocationIoSend: Send new content to the queue
+ *
+ * Send the contents of the Allocation to the queue.
+ *
+ * Parameters:
+ *   a: Allocation to work on.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+extern void __attribute__((overloadable))
+    rsAllocationIoSend(rs_allocation a);
+#endif
+
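+/*
+ * Illustrative sketch (gIn and gOut are hypothetical rs_allocation globals
+ * created on the host with IO_INPUT / IO_OUTPUT usage):
+ *
+ *   rs_allocation gIn;
+ *   rs_allocation gOut;
+ *
+ *   void process(void) {
+ *       rsAllocationIoReceive(gIn);   // latch the newest buffer from the queue
+ *       // ... run a kernel from gIn into gOut ...
+ *       rsAllocationIoSend(gOut);     // push the result to the output queue
+ *   }
+ */
+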
+/*
+ * rsSendToClient: Send a message to the client, non-blocking
+ *
+ * Sends a message back to the client.  This call does not block.
+ * It returns true if the message was sent and false if the
+ * message queue is full.
+ *
+ * A message ID is required.  The data payload is optional.
+ *
+ * See RenderScript.RSMessageHandler.
+ *
+ * Parameters:
+ *   data: Application specific data.
+ *   len: Length of the data, in bytes.
+ */
+extern bool __attribute__((overloadable))
+    rsSendToClient(int cmdID);
+
+extern bool __attribute__((overloadable))
+    rsSendToClient(int cmdID, const void* data, uint len);
+
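+/*
+ * Illustrative sketch: report a value back to the Java client without
+ * blocking (RS_MSG_PROGRESS and the payload layout are hypothetical and
+ * defined by the application).
+ *
+ *   #define RS_MSG_PROGRESS 1
+ *
+ *   void reportProgress(int percent) {
+ *       if (!rsSendToClient(RS_MSG_PROGRESS, &percent, sizeof(percent))) {
+ *           // Queue full; the message was dropped.
+ *       }
+ *   }
+ */
+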
+/*
+ * rsSendToClientBlocking: Send a message to the client, blocking
+ *
+ * Sends a message back to the client.  This function will block
+ * until there is room on the message queue for this message.
+ * This function may return before the message has been delivered and
+ * processed by the client.
+ *
+ * A message ID is required.  The data payload is optional.
+ *
+ * See RenderScript.RSMessageHandler.
+ *
+ * Parameters:
+ *   data: Application specific data.
+ *   len: Length of the data, in bytes.
+ */
+extern void __attribute__((overloadable))
+    rsSendToClientBlocking(int cmdID);
+
+extern void __attribute__((overloadable))
+    rsSendToClientBlocking(int cmdID, const void* data, uint len);
+
+#endif // RENDERSCRIPT_RS_IO_RSH
diff --git a/23.0.3/include/rs_math.rsh b/23.0.3/include/rs_math.rsh
new file mode 100644
index 0000000..9b39680
--- /dev/null
+++ b/23.0.3/include/rs_math.rsh
@@ -0,0 +1,4177 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Don't edit this file!  It is auto-generated by frameworks/rs/api/generate.sh.
+
+/*
+ * rs_math.rsh: Mathematical Constants and Functions
+ *
+ * The mathematical functions below can be applied to scalars and vectors.   When applied
+ * to vectors, the returned value is a vector of the function applied to each entry of the input.
+ *
+ * For example:
+ * float3 a, b;
+ * // The following call sets
+ * //   a.x to sin(b.x),
+ * //   a.y to sin(b.y), and
+ * //   a.z to sin(b.z).
+ * a = sin(b);
+ *
+ *
+ * See Vector Math Functions for functions like distance() and length() that interpret
+ * instead the input as a single vector in n-dimensional space.
+ *
+ * The precision of the mathematical operations on 32 bit floats is affected by the pragmas
+ * rs_fp_relaxed and rs_fp_full.  Under rs_fp_relaxed, subnormal values may be flushed to zero and
+ * rounding may be done towards zero.  In comparison, rs_fp_full requires correct handling of
+ * subnormal values, i.e. smaller than 1.17549435e-38f.  rs_fp_full also requires round to nearest
+ * with ties to even.
+ *
+ * Different precision/speed tradeoffs can be achieved by using variants of the common math
+ * functions.  Functions with a name starting with
+ * - native_: May have custom hardware implementations with weaker precision.  Additionally,
+ *   subnormal values may be flushed to zero, rounding towards zero may be used, and NaN and
+ *   infinity input may not be handled correctly.
+ * - half_: May perform internal computations using 16 bit floats.  Additionally, subnormal
+ *   values may be flushed to zero, and rounding towards zero may be used.
+ *
+ */
+
+#ifndef RENDERSCRIPT_RS_MATH_RSH
+#define RENDERSCRIPT_RS_MATH_RSH
+
+/*
+ * M_1_PI: 1 / pi, as a 32 bit float
+ *
+ * The inverse of pi, as a 32 bit float.
+ */
+#define M_1_PI 0.318309886183790671537767526745028724f
+
+/*
+ * M_2_PI: 2 / pi, as a 32 bit float
+ *
+ * 2 divided by pi, as a 32 bit float.
+ */
+#define M_2_PI 0.636619772367581343075535053490057448f
+
+/*
+ * M_2_PIl: 2 / pi, as a 32 bit float
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * 2 divided by pi, as a 32 bit float.
+ */
+#define M_2_PIl 0.636619772367581343075535053490057448f
+
+/*
+ * M_2_SQRTPI: 2 / sqrt(pi), as a 32 bit float
+ *
+ * 2 divided by the square root of pi, as a 32 bit float.
+ */
+#define M_2_SQRTPI 1.128379167095512573896158903121545172f
+
+/*
+ * M_E: e, as a 32 bit float
+ *
+ * The number e, the base of the natural logarithm, as a 32 bit float.
+ */
+#define M_E 2.718281828459045235360287471352662498f
+
+/*
+ * M_LN10: log_e(10), as a 32 bit float
+ *
+ * The natural logarithm of 10, as a 32 bit float.
+ */
+#define M_LN10 2.302585092994045684017991454684364208f
+
+/*
+ * M_LN2: log_e(2), as a 32 bit float
+ *
+ * The natural logarithm of 2, as a 32 bit float.
+ */
+#define M_LN2 0.693147180559945309417232121458176568f
+
+/*
+ * M_LOG10E: log_10(e), as a 32 bit float
+ *
+ * The logarithm base 10 of e, as a 32 bit float.
+ */
+#define M_LOG10E 0.434294481903251827651128918916605082f
+
+/*
+ * M_LOG2E: log_2(e), as a 32 bit float
+ *
+ * The logarithm base 2 of e, as a 32 bit float.
+ */
+#define M_LOG2E 1.442695040888963407359924681001892137f
+
+/*
+ * M_PI: pi, as a 32 bit float
+ *
+ * The constant pi, as a 32 bit float.
+ */
+#define M_PI 3.141592653589793238462643383279502884f
+
+/*
+ * M_PI_2: pi / 2, as a 32 bit float
+ *
+ * Pi divided by 2, as a 32 bit float.
+ */
+#define M_PI_2 1.570796326794896619231321691639751442f
+
+/*
+ * M_PI_4: pi / 4, as a 32 bit float
+ *
+ * Pi divided by 4, as a 32 bit float.
+ */
+#define M_PI_4 0.785398163397448309615660845819875721f
+
+/*
+ * M_SQRT1_2: 1 / sqrt(2), as a 32 bit float
+ *
+ * The inverse of the square root of 2, as a 32 bit float.
+ */
+#define M_SQRT1_2 0.707106781186547524400844362104849039f
+
+/*
+ * M_SQRT2: sqrt(2), as a 32 bit float
+ *
+ * The square root of 2, as a 32 bit float.
+ */
+#define M_SQRT2 1.414213562373095048801688724209698079f
+
+/*
+ * abs: Absolute value of an integer
+ *
+ * Returns the absolute value of an integer.
+ *
+ * For floats, use fabs().
+ */
+extern uchar __attribute__((const, overloadable))
+    abs(char v);
+
+extern uchar2 __attribute__((const, overloadable))
+    abs(char2 v);
+
+extern uchar3 __attribute__((const, overloadable))
+    abs(char3 v);
+
+extern uchar4 __attribute__((const, overloadable))
+    abs(char4 v);
+
+extern ushort __attribute__((const, overloadable))
+    abs(short v);
+
+extern ushort2 __attribute__((const, overloadable))
+    abs(short2 v);
+
+extern ushort3 __attribute__((const, overloadable))
+    abs(short3 v);
+
+extern ushort4 __attribute__((const, overloadable))
+    abs(short4 v);
+
+extern uint __attribute__((const, overloadable))
+    abs(int v);
+
+extern uint2 __attribute__((const, overloadable))
+    abs(int2 v);
+
+extern uint3 __attribute__((const, overloadable))
+    abs(int3 v);
+
+extern uint4 __attribute__((const, overloadable))
+    abs(int4 v);
+
+/*
+ * acos: Inverse cosine
+ *
+ * Returns the inverse cosine, in radians.
+ *
+ * See also native_acos().
+ */
+extern float __attribute__((const, overloadable))
+    acos(float v);
+
+extern float2 __attribute__((const, overloadable))
+    acos(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    acos(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    acos(float4 v);
+
+/*
+ * acosh: Inverse hyperbolic cosine
+ *
+ * Returns the inverse hyperbolic cosine, in radians.
+ *
+ * See also native_acosh().
+ */
+extern float __attribute__((const, overloadable))
+    acosh(float v);
+
+extern float2 __attribute__((const, overloadable))
+    acosh(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    acosh(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    acosh(float4 v);
+
+/*
+ * acospi: Inverse cosine divided by pi
+ *
+ * Returns the inverse cosine in radians, divided by pi.
+ *
+ * To get an inverse cosine measured in degrees, use acospi(a) * 180.f.
+ *
+ * See also native_acospi().
+ */
+extern float __attribute__((const, overloadable))
+    acospi(float v);
+
+extern float2 __attribute__((const, overloadable))
+    acospi(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    acospi(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    acospi(float4 v);
+
+/*
+ * asin: Inverse sine
+ *
+ * Returns the inverse sine, in radians.
+ *
+ * See also native_asin().
+ */
+extern float __attribute__((const, overloadable))
+    asin(float v);
+
+extern float2 __attribute__((const, overloadable))
+    asin(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    asin(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    asin(float4 v);
+
+/*
+ * asinh: Inverse hyperbolic sine
+ *
+ * Returns the inverse hyperbolic sine, in radians.
+ *
+ * See also native_asinh().
+ */
+extern float __attribute__((const, overloadable))
+    asinh(float v);
+
+extern float2 __attribute__((const, overloadable))
+    asinh(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    asinh(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    asinh(float4 v);
+
+/*
+ * asinpi: Inverse sine divided by pi
+ *
+ * Returns the inverse sine in radians, divided by pi.
+ *
+ * To get an inverse sine measured in degrees, use asinpi(a) * 180.f.
+ *
+ * See also native_asinpi().
+ */
+extern float __attribute__((const, overloadable))
+    asinpi(float v);
+
+extern float2 __attribute__((const, overloadable))
+    asinpi(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    asinpi(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    asinpi(float4 v);
+
+/*
+ * atan: Inverse tangent
+ *
+ * Returns the inverse tangent, in radians.
+ *
+ * See also native_atan().
+ */
+extern float __attribute__((const, overloadable))
+    atan(float v);
+
+extern float2 __attribute__((const, overloadable))
+    atan(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    atan(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    atan(float4 v);
+
+/*
+ * atan2: Inverse tangent of a ratio
+ *
+ * Returns the inverse tangent of (numerator / denominator), in radians.
+ *
+ * See also native_atan2().
+ *
+ * Parameters:
+ *   numerator: Numerator.
+ *   denominator: Denominator.  Can be 0.
+ */
+extern float __attribute__((const, overloadable))
+    atan2(float numerator, float denominator);
+
+extern float2 __attribute__((const, overloadable))
+    atan2(float2 numerator, float2 denominator);
+
+extern float3 __attribute__((const, overloadable))
+    atan2(float3 numerator, float3 denominator);
+
+extern float4 __attribute__((const, overloadable))
+    atan2(float4 numerator, float4 denominator);
+
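+/*
+ * Illustrative sketch: convert a Cartesian vector to its polar angle.
+ *
+ *   float2 p = {1.f, 1.f};
+ *   float angle = atan2(p.y, p.x);     // pi / 4 radians
+ *   float angleDeg = degrees(angle);   // 45 degrees
+ */
+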
+/*
+ * atan2pi: Inverse tangent of a ratio, divided by pi
+ *
+ * Returns the inverse tangent of (numerator / denominator), in radians, divided by pi.
+ *
+ * To get an inverse tangent measured in degrees, use atan2pi(n, d) * 180.f.
+ *
+ * See also native_atan2pi().
+ *
+ * Parameters:
+ *   numerator: Numerator.
+ *   denominator: Denominator.  Can be 0.
+ */
+extern float __attribute__((const, overloadable))
+    atan2pi(float numerator, float denominator);
+
+extern float2 __attribute__((const, overloadable))
+    atan2pi(float2 numerator, float2 denominator);
+
+extern float3 __attribute__((const, overloadable))
+    atan2pi(float3 numerator, float3 denominator);
+
+extern float4 __attribute__((const, overloadable))
+    atan2pi(float4 numerator, float4 denominator);
+
+/*
+ * atanh: Inverse hyperbolic tangent
+ *
+ * Returns the inverse hyperbolic tangent, in radians.
+ *
+ * See also native_atanh().
+ */
+extern float __attribute__((const, overloadable))
+    atanh(float v);
+
+extern float2 __attribute__((const, overloadable))
+    atanh(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    atanh(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    atanh(float4 v);
+
+/*
+ * atanpi: Inverse tangent divided by pi
+ *
+ * Returns the inverse tangent in radians, divided by pi.
+ *
+ * To get an inverse tangent measured in degrees, use atanpi(a) * 180.f.
+ *
+ * See also native_atanpi().
+ */
+extern float __attribute__((const, overloadable))
+    atanpi(float v);
+
+extern float2 __attribute__((const, overloadable))
+    atanpi(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    atanpi(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    atanpi(float4 v);
+
+/*
+ * cbrt: Cube root
+ *
+ * Returns the cube root.
+ *
+ * See also native_cbrt().
+ */
+extern float __attribute__((const, overloadable))
+    cbrt(float v);
+
+extern float2 __attribute__((const, overloadable))
+    cbrt(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    cbrt(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    cbrt(float4 v);
+
+/*
+ * ceil: Smallest integer not less than a value
+ *
+ * Returns the smallest integer not less than a value.
+ *
+ * For example, ceil(1.2f) returns 2.f, and ceil(-1.2f) returns -1.f.
+ *
+ * See also floor().
+ */
+extern float __attribute__((const, overloadable))
+    ceil(float v);
+
+extern float2 __attribute__((const, overloadable))
+    ceil(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    ceil(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    ceil(float4 v);
+
+/*
+ * clamp: Restrain a value to a range
+ *
+ * Clamps a value to a specified high and low bound.  clamp() returns min_value
+ * if value < min_value, max_value if value > max_value, otherwise value.
+ *
+ * There are two variants of clamp: one where the min and max are scalars applied
+ * to all entries of the value, the other where the min and max are also vectors.
+ *
+ * If min_value is greater than max_value, the results are undefined.
+ *
+ * Parameters:
+ *   value: Value to be clamped.
+ *   min_value: Lower bound, a scalar or matching vector.
+ *   max_value: Upper bound, must match the type of min_value.
+ */
+extern float __attribute__((const, overloadable))
+    clamp(float value, float min_value, float max_value);
+
+extern float2 __attribute__((const, overloadable))
+    clamp(float2 value, float2 min_value, float2 max_value);
+
+extern float3 __attribute__((const, overloadable))
+    clamp(float3 value, float3 min_value, float3 max_value);
+
+extern float4 __attribute__((const, overloadable))
+    clamp(float4 value, float4 min_value, float4 max_value);
+
+extern float2 __attribute__((const, overloadable))
+    clamp(float2 value, float min_value, float max_value);
+
+extern float3 __attribute__((const, overloadable))
+    clamp(float3 value, float min_value, float max_value);
+
+extern float4 __attribute__((const, overloadable))
+    clamp(float4 value, float min_value, float max_value);
+
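+/*
+ * Illustrative sketch: the scalar-bound variants apply the same limits to
+ * every component of a vector, e.g. clamping a color to the [0, 1] range.
+ *
+ *   float3 color = {1.2f, -0.1f, 0.5f};
+ *   float3 clamped = clamp(color, 0.f, 1.f);  // {1.f, 0.f, 0.5f}
+ */
+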
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern char __attribute__((const, overloadable))
+    clamp(char value, char min_value, char max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern char2 __attribute__((const, overloadable))
+    clamp(char2 value, char2 min_value, char2 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern char3 __attribute__((const, overloadable))
+    clamp(char3 value, char3 min_value, char3 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern char4 __attribute__((const, overloadable))
+    clamp(char4 value, char4 min_value, char4 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern uchar __attribute__((const, overloadable))
+    clamp(uchar value, uchar min_value, uchar max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern uchar2 __attribute__((const, overloadable))
+    clamp(uchar2 value, uchar2 min_value, uchar2 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern uchar3 __attribute__((const, overloadable))
+    clamp(uchar3 value, uchar3 min_value, uchar3 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern uchar4 __attribute__((const, overloadable))
+    clamp(uchar4 value, uchar4 min_value, uchar4 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern short __attribute__((const, overloadable))
+    clamp(short value, short min_value, short max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern short2 __attribute__((const, overloadable))
+    clamp(short2 value, short2 min_value, short2 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern short3 __attribute__((const, overloadable))
+    clamp(short3 value, short3 min_value, short3 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern short4 __attribute__((const, overloadable))
+    clamp(short4 value, short4 min_value, short4 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern ushort __attribute__((const, overloadable))
+    clamp(ushort value, ushort min_value, ushort max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern ushort2 __attribute__((const, overloadable))
+    clamp(ushort2 value, ushort2 min_value, ushort2 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern ushort3 __attribute__((const, overloadable))
+    clamp(ushort3 value, ushort3 min_value, ushort3 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern ushort4 __attribute__((const, overloadable))
+    clamp(ushort4 value, ushort4 min_value, ushort4 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern int __attribute__((const, overloadable))
+    clamp(int value, int min_value, int max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern int2 __attribute__((const, overloadable))
+    clamp(int2 value, int2 min_value, int2 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern int3 __attribute__((const, overloadable))
+    clamp(int3 value, int3 min_value, int3 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern int4 __attribute__((const, overloadable))
+    clamp(int4 value, int4 min_value, int4 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern uint __attribute__((const, overloadable))
+    clamp(uint value, uint min_value, uint max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern uint2 __attribute__((const, overloadable))
+    clamp(uint2 value, uint2 min_value, uint2 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern uint3 __attribute__((const, overloadable))
+    clamp(uint3 value, uint3 min_value, uint3 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern uint4 __attribute__((const, overloadable))
+    clamp(uint4 value, uint4 min_value, uint4 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern long __attribute__((const, overloadable))
+    clamp(long value, long min_value, long max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern long2 __attribute__((const, overloadable))
+    clamp(long2 value, long2 min_value, long2 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern long3 __attribute__((const, overloadable))
+    clamp(long3 value, long3 min_value, long3 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern long4 __attribute__((const, overloadable))
+    clamp(long4 value, long4 min_value, long4 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern ulong __attribute__((const, overloadable))
+    clamp(ulong value, ulong min_value, ulong max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern ulong2 __attribute__((const, overloadable))
+    clamp(ulong2 value, ulong2 min_value, ulong2 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern ulong3 __attribute__((const, overloadable))
+    clamp(ulong3 value, ulong3 min_value, ulong3 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern ulong4 __attribute__((const, overloadable))
+    clamp(ulong4 value, ulong4 min_value, ulong4 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern char2 __attribute__((const, overloadable))
+    clamp(char2 value, char min_value, char max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern char3 __attribute__((const, overloadable))
+    clamp(char3 value, char min_value, char max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern char4 __attribute__((const, overloadable))
+    clamp(char4 value, char min_value, char max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern uchar2 __attribute__((const, overloadable))
+    clamp(uchar2 value, uchar min_value, uchar max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern uchar3 __attribute__((const, overloadable))
+    clamp(uchar3 value, uchar min_value, uchar max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern uchar4 __attribute__((const, overloadable))
+    clamp(uchar4 value, uchar min_value, uchar max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern short2 __attribute__((const, overloadable))
+    clamp(short2 value, short min_value, short max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern short3 __attribute__((const, overloadable))
+    clamp(short3 value, short min_value, short max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern short4 __attribute__((const, overloadable))
+    clamp(short4 value, short min_value, short max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern ushort2 __attribute__((const, overloadable))
+    clamp(ushort2 value, ushort min_value, ushort max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern ushort3 __attribute__((const, overloadable))
+    clamp(ushort3 value, ushort min_value, ushort max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern ushort4 __attribute__((const, overloadable))
+    clamp(ushort4 value, ushort min_value, ushort max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern int2 __attribute__((const, overloadable))
+    clamp(int2 value, int min_value, int max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern int3 __attribute__((const, overloadable))
+    clamp(int3 value, int min_value, int max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern int4 __attribute__((const, overloadable))
+    clamp(int4 value, int min_value, int max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern uint2 __attribute__((const, overloadable))
+    clamp(uint2 value, uint min_value, uint max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern uint3 __attribute__((const, overloadable))
+    clamp(uint3 value, uint min_value, uint max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern uint4 __attribute__((const, overloadable))
+    clamp(uint4 value, uint min_value, uint max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern long2 __attribute__((const, overloadable))
+    clamp(long2 value, long min_value, long max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern long3 __attribute__((const, overloadable))
+    clamp(long3 value, long min_value, long max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern long4 __attribute__((const, overloadable))
+    clamp(long4 value, long min_value, long max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern ulong2 __attribute__((const, overloadable))
+    clamp(ulong2 value, ulong min_value, ulong max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern ulong3 __attribute__((const, overloadable))
+    clamp(ulong3 value, ulong min_value, ulong max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern ulong4 __attribute__((const, overloadable))
+    clamp(ulong4 value, ulong min_value, ulong max_value);
+#endif
+
+/*
+ * clz: Number of leading 0 bits
+ *
+ * Returns the number of leading 0-bits in a value.
+ *
+ * For example, clz((char)0x03) returns 6.
+ */
+extern char __attribute__((const, overloadable))
+    clz(char value);
+
+extern char2 __attribute__((const, overloadable))
+    clz(char2 value);
+
+extern char3 __attribute__((const, overloadable))
+    clz(char3 value);
+
+extern char4 __attribute__((const, overloadable))
+    clz(char4 value);
+
+extern uchar __attribute__((const, overloadable))
+    clz(uchar value);
+
+extern uchar2 __attribute__((const, overloadable))
+    clz(uchar2 value);
+
+extern uchar3 __attribute__((const, overloadable))
+    clz(uchar3 value);
+
+extern uchar4 __attribute__((const, overloadable))
+    clz(uchar4 value);
+
+extern short __attribute__((const, overloadable))
+    clz(short value);
+
+extern short2 __attribute__((const, overloadable))
+    clz(short2 value);
+
+extern short3 __attribute__((const, overloadable))
+    clz(short3 value);
+
+extern short4 __attribute__((const, overloadable))
+    clz(short4 value);
+
+extern ushort __attribute__((const, overloadable))
+    clz(ushort value);
+
+extern ushort2 __attribute__((const, overloadable))
+    clz(ushort2 value);
+
+extern ushort3 __attribute__((const, overloadable))
+    clz(ushort3 value);
+
+extern ushort4 __attribute__((const, overloadable))
+    clz(ushort4 value);
+
+extern int __attribute__((const, overloadable))
+    clz(int value);
+
+extern int2 __attribute__((const, overloadable))
+    clz(int2 value);
+
+extern int3 __attribute__((const, overloadable))
+    clz(int3 value);
+
+extern int4 __attribute__((const, overloadable))
+    clz(int4 value);
+
+extern uint __attribute__((const, overloadable))
+    clz(uint value);
+
+extern uint2 __attribute__((const, overloadable))
+    clz(uint2 value);
+
+extern uint3 __attribute__((const, overloadable))
+    clz(uint3 value);
+
+extern uint4 __attribute__((const, overloadable))
+    clz(uint4 value);
+
+/*
+ * copysign: Copies the sign of a number to another
+ *
+ * Copies the sign from sign_value to magnitude_value.
+ *
+ * The value returned is either magnitude_value or -magnitude_value.
+ *
+ * For example, copysign(4.0f, -2.7f) returns -4.0f and copysign(-4.0f, 2.7f) returns 4.0f.
+ */
+extern float __attribute__((const, overloadable))
+    copysign(float magnitude_value, float sign_value);
+
+extern float2 __attribute__((const, overloadable))
+    copysign(float2 magnitude_value, float2 sign_value);
+
+extern float3 __attribute__((const, overloadable))
+    copysign(float3 magnitude_value, float3 sign_value);
+
+extern float4 __attribute__((const, overloadable))
+    copysign(float4 magnitude_value, float4 sign_value);
+
+/*
+ * cos: Cosine
+ *
+ * Returns the cosine of an angle measured in radians.
+ *
+ * See also native_cos().
+ */
+extern float __attribute__((const, overloadable))
+    cos(float v);
+
+extern float2 __attribute__((const, overloadable))
+    cos(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    cos(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    cos(float4 v);
+
+/*
+ * cosh: Hyperbolic cosine
+ *
+ * Returns the hyperbolic cosine of v, where v is measured in radians.
+ *
+ * See also native_cosh().
+ */
+extern float __attribute__((const, overloadable))
+    cosh(float v);
+
+extern float2 __attribute__((const, overloadable))
+    cosh(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    cosh(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    cosh(float4 v);
+
+/*
+ * cospi: Cosine of a number multiplied by pi
+ *
+ * Returns the cosine of (v * pi), where (v * pi) is measured in radians.
+ *
+ * To get the cosine of a value measured in degrees, call cospi(v / 180.f).
+ *
+ * See also native_cospi().
+ */
+extern float __attribute__((const, overloadable))
+    cospi(float v);
+
+extern float2 __attribute__((const, overloadable))
+    cospi(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    cospi(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    cospi(float4 v);
+
+/*
+ * degrees: Converts radians into degrees
+ *
+ * Converts from radians to degrees.
+ */
+extern float __attribute__((const, overloadable))
+    degrees(float v);
+
+extern float2 __attribute__((const, overloadable))
+    degrees(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    degrees(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    degrees(float4 v);
+
+/*
+ * erf: Mathematical error function
+ *
+ * Returns the error function.
+ */
+extern float __attribute__((const, overloadable))
+    erf(float v);
+
+extern float2 __attribute__((const, overloadable))
+    erf(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    erf(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    erf(float4 v);
+
+/*
+ * erfc: Mathematical complementary error function
+ *
+ * Returns the complementary error function.
+ */
+extern float __attribute__((const, overloadable))
+    erfc(float v);
+
+extern float2 __attribute__((const, overloadable))
+    erfc(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    erfc(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    erfc(float4 v);
+
+/*
+ * exp: e raised to a number
+ *
+ * Returns e raised to v, i.e. e ^ v.
+ *
+ * See also native_exp().
+ */
+extern float __attribute__((const, overloadable))
+    exp(float v);
+
+extern float2 __attribute__((const, overloadable))
+    exp(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    exp(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    exp(float4 v);
+
+/*
+ * exp10: 10 raised to a number
+ *
+ * Returns 10 raised to v, i.e. 10.f ^ v.
+ *
+ * See also native_exp10().
+ */
+extern float __attribute__((const, overloadable))
+    exp10(float v);
+
+extern float2 __attribute__((const, overloadable))
+    exp10(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    exp10(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    exp10(float4 v);
+
+/*
+ * exp2: 2 raised to a number
+ *
+ * Returns 2 raised to v, i.e. 2.f ^ v.
+ *
+ * See also native_exp2().
+ */
+extern float __attribute__((const, overloadable))
+    exp2(float v);
+
+extern float2 __attribute__((const, overloadable))
+    exp2(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    exp2(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    exp2(float4 v);
+
+/*
+ * expm1: e raised to a number minus one
+ *
+ * Returns e raised to v minus 1, i.e. (e ^ v) - 1.
+ *
+ * See also native_expm1().
+ */
+extern float __attribute__((const, overloadable))
+    expm1(float v);
+
+extern float2 __attribute__((const, overloadable))
+    expm1(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    expm1(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    expm1(float4 v);
+
+/*
+ * fabs: Absolute value of a float
+ *
+ * Returns the absolute value of the float v.
+ *
+ * For integers, use abs().
+ */
+extern float __attribute__((const, overloadable))
+    fabs(float v);
+
+extern float2 __attribute__((const, overloadable))
+    fabs(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    fabs(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    fabs(float4 v);
+
+/*
+ * fdim: Positive difference between two values
+ *
+ * Returns the positive difference between two values.
+ *
+ * If a > b, returns (a - b); otherwise returns 0.f.
+ */
+extern float __attribute__((const, overloadable))
+    fdim(float a, float b);
+
+extern float2 __attribute__((const, overloadable))
+    fdim(float2 a, float2 b);
+
+extern float3 __attribute__((const, overloadable))
+    fdim(float3 a, float3 b);
+
+extern float4 __attribute__((const, overloadable))
+    fdim(float4 a, float4 b);
+
+/*
+ * floor: Smallest integer not greater than a value
+ *
+ * Returns the smallest integer not greater than a value.
+ *
+ * For example, floor(1.2f) returns 1.f, and floor(-1.2f) returns -2.f.
+ *
+ * See also ceil().
+ */
+extern float __attribute__((const, overloadable))
+    floor(float v);
+
+extern float2 __attribute__((const, overloadable))
+    floor(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    floor(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    floor(float4 v);
+
+/*
+ * fma: Multiply and add
+ *
+ * Multiply and add.  Returns (multiplicand1 * multiplicand2) + offset.
+ *
+ * This function is similar to mad().  fma() retains full precision of the multiplied result
+ * and rounds only after the addition.  mad() rounds after the multiplication and the addition.
+ * This extra precision is not guaranteed in rs_fp_relaxed mode.
+ */
+extern float __attribute__((const, overloadable))
+    fma(float multiplicand1, float multiplicand2, float offset);
+
+extern float2 __attribute__((const, overloadable))
+    fma(float2 multiplicand1, float2 multiplicand2, float2 offset);
+
+extern float3 __attribute__((const, overloadable))
+    fma(float3 multiplicand1, float3 multiplicand2, float3 offset);
+
+extern float4 __attribute__((const, overloadable))
+    fma(float4 multiplicand1, float4 multiplicand2, float4 offset);
+
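+/*
+ * Illustrative sketch: fma() and mad() compute the same expression, but fma()
+ * rounds only once, after the addition (mad() is declared later in this
+ * header).
+ *
+ *   float a = 1.0000001f, b = 3.f, c = -3.f;
+ *   float r1 = fma(a, b, c);   // rounds once, after the addition
+ *   float r2 = mad(a, b, c);   // may also round after the multiplication
+ */
+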
+/*
+ * fmax: Maximum of two floats
+ *
+ * Returns the maximum of a and b, i.e. (a < b ? b : a).
+ *
+ * The max() function returns identical results but can be applied to more data types.
+ */
+extern float __attribute__((const, overloadable))
+    fmax(float a, float b);
+
+extern float2 __attribute__((const, overloadable))
+    fmax(float2 a, float2 b);
+
+extern float3 __attribute__((const, overloadable))
+    fmax(float3 a, float3 b);
+
+extern float4 __attribute__((const, overloadable))
+    fmax(float4 a, float4 b);
+
+extern float2 __attribute__((const, overloadable))
+    fmax(float2 a, float b);
+
+extern float3 __attribute__((const, overloadable))
+    fmax(float3 a, float b);
+
+extern float4 __attribute__((const, overloadable))
+    fmax(float4 a, float b);
+
+/*
+ * fmin: Minimum of two floats
+ *
+ * Returns the minimum of a and b, i.e. (a > b ? b : a).
+ *
+ * The min() function returns identical results but can be applied to more data types.
+ */
+extern float __attribute__((const, overloadable))
+    fmin(float a, float b);
+
+extern float2 __attribute__((const, overloadable))
+    fmin(float2 a, float2 b);
+
+extern float3 __attribute__((const, overloadable))
+    fmin(float3 a, float3 b);
+
+extern float4 __attribute__((const, overloadable))
+    fmin(float4 a, float4 b);
+
+extern float2 __attribute__((const, overloadable))
+    fmin(float2 a, float b);
+
+extern float3 __attribute__((const, overloadable))
+    fmin(float3 a, float b);
+
+extern float4 __attribute__((const, overloadable))
+    fmin(float4 a, float b);
+
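+/*
+ * Illustrative usage sketch, not part of the upstream header: the mixed
+ * vector/scalar overloads above apply a single scalar bound to every
+ * component.  The helper name below is hypothetical.
+ */
+static inline float4 example_clamp_to_scalar_range_(float4 v, float lo, float hi) {
+    // fmax/fmin broadcast the scalar bounds across all four components.
+    return fmin(fmax(v, lo), hi);
+}
+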
+/*
+ * fmod: Modulo
+ *
+ * Returns the remainder of (numerator / denominator), where the quotient is rounded towards zero.
+ *
+ * The function remainder() is similar but rounds toward the closest integer.
+ * For example, fmod(-3.8f, 2.f) returns -1.8f (-3.8f - -1.f * 2.f)
+ * while remainder(-3.8f, 2.f) returns 0.2f (-3.8f - -2.f * 2.f).
+ */
+extern float __attribute__((const, overloadable))
+    fmod(float numerator, float denominator);
+
+extern float2 __attribute__((const, overloadable))
+    fmod(float2 numerator, float2 denominator);
+
+extern float3 __attribute__((const, overloadable))
+    fmod(float3 numerator, float3 denominator);
+
+extern float4 __attribute__((const, overloadable))
+    fmod(float4 numerator, float4 denominator);
+
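+/*
+ * Illustrative usage sketch, not part of the upstream header: fmod() keeps the
+ * sign of the numerator because the quotient is truncated toward zero.  The
+ * helper name is hypothetical.
+ */
+static inline float example_fmod_sketch_(void) {
+    // -3.8f / 2.f truncates to -1, so the remainder is -3.8f - (-1.f * 2.f).
+    return fmod(-3.8f, 2.f);  // -1.8f
+}
+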
+/*
+ * fract: Positive fractional part
+ *
+ * Returns the positive fractional part of v, i.e. v - floor(v).
+ *
+ * For example, fract(1.3f, &val) returns 0.3f and sets val to 1.f.
+ * fract(-1.3f, &val) returns 0.7f and sets val to -2.f.
+ *
+ * Parameters:
+ *   v: Input value.
+ *   floor: If floor is not null, *floor will be set to the floor of v.
+ */
+extern float __attribute__((overloadable))
+    fract(float v, float* floor);
+
+extern float2 __attribute__((overloadable))
+    fract(float2 v, float2* floor);
+
+extern float3 __attribute__((overloadable))
+    fract(float3 v, float3* floor);
+
+extern float4 __attribute__((overloadable))
+    fract(float4 v, float4* floor);
+
+static inline float __attribute__((const, overloadable))
+    fract(float v) {
+    float unused;
+    return fract(v, &unused);
+}
+
+static inline float2 __attribute__((const, overloadable))
+    fract(float2 v) {
+    float2 unused;
+    return fract(v, &unused);
+}
+
+static inline float3 __attribute__((const, overloadable))
+    fract(float3 v) {
+    float3 unused;
+    return fract(v, &unused);
+}
+
+static inline float4 __attribute__((const, overloadable))
+    fract(float4 v) {
+    float4 unused;
+    return fract(v, &unused);
+}
+
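+/*
+ * Illustrative usage sketch, not part of the upstream header: the two-argument
+ * form of fract() returns the fractional part and writes floor(v) through the
+ * pointer, so the two outputs always recombine into the input.  The helper
+ * name is hypothetical.
+ */
+static inline float example_fract_recombine_(float v) {
+    float whole;
+    float part = fract(v, &whole);  // for v == -1.3f: part == 0.7f, whole == -2.f
+    return whole + part;            // equals v
+}
+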
+/*
+ * frexp: Binary mantissa and exponent
+ *
+ * Returns the binary mantissa and exponent of v, i.e. v == mantissa * 2 ^ exponent.
+ *
+ * The mantissa is always between 0.5 (inclusive) and 1.0 (exclusive).
+ *
+ * See ldexp() for the reverse operation.  See also logb() and ilogb().
+ *
+ * Parameters:
+ *   v: Input value.
+ *   exponent: If exponent is not null, *exponent will be set to the exponent of v.
+ */
+extern float __attribute__((overloadable))
+    frexp(float v, int* exponent);
+
+extern float2 __attribute__((overloadable))
+    frexp(float2 v, int2* exponent);
+
+extern float3 __attribute__((overloadable))
+    frexp(float3 v, int3* exponent);
+
+extern float4 __attribute__((overloadable))
+    frexp(float4 v, int4* exponent);
+
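+/*
+ * Illustrative usage sketch, not part of the upstream header: frexp() splits a
+ * value into a mantissa in [0.5, 1.0) and an integer exponent.  The helper
+ * name is hypothetical.
+ */
+static inline float example_frexp_mantissa_(float v, int* exponent_out) {
+    // For v == 8.5f this yields a mantissa of 0.53125f and *exponent_out == 4.
+    return frexp(v, exponent_out);
+}
+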
+/*
+ * half_recip: Reciprocal computed to 16 bit precision
+ *
+ * Returns the approximate reciprocal of a value.
+ *
+ * The precision is that of a 16 bit floating point value.
+ *
+ * See also native_recip().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float __attribute__((const, overloadable))
+    half_recip(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float2 __attribute__((const, overloadable))
+    half_recip(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float3 __attribute__((const, overloadable))
+    half_recip(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float4 __attribute__((const, overloadable))
+    half_recip(float4 v);
+#endif
+
+/*
+ * half_rsqrt: Reciprocal of a square root computed to 16 bit precision
+ *
+ * Returns the approximate value of (1.f / sqrt(value)).
+ *
+ * The precision is that of a 16 bit floating point value.
+ *
+ * See also rsqrt(), native_rsqrt().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float __attribute__((const, overloadable))
+    half_rsqrt(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float2 __attribute__((const, overloadable))
+    half_rsqrt(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float3 __attribute__((const, overloadable))
+    half_rsqrt(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float4 __attribute__((const, overloadable))
+    half_rsqrt(float4 v);
+#endif
+
+/*
+ * half_sqrt: Square root computed to 16 bit precision
+ *
+ * Returns the approximate square root of a value.
+ *
+ * The precision is that of a 16 bit floating point value.
+ *
+ * See also sqrt(), native_sqrt().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float __attribute__((const, overloadable))
+    half_sqrt(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float2 __attribute__((const, overloadable))
+    half_sqrt(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float3 __attribute__((const, overloadable))
+    half_sqrt(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float4 __attribute__((const, overloadable))
+    half_sqrt(float4 v);
+#endif
+
+/*
+ * hypot: Hypotenuse
+ *
+ * Returns the hypotenuse, i.e. sqrt(a * a + b * b).
+ *
+ * See also native_hypot().
+ */
+extern float __attribute__((const, overloadable))
+    hypot(float a, float b);
+
+extern float2 __attribute__((const, overloadable))
+    hypot(float2 a, float2 b);
+
+extern float3 __attribute__((const, overloadable))
+    hypot(float3 a, float3 b);
+
+extern float4 __attribute__((const, overloadable))
+    hypot(float4 a, float4 b);
+
+/*
+ * ilogb: Base two exponent
+ *
+ * Returns the base two exponent of a value, where the mantissa is between
+ * 1.f (inclusive) and 2.f (exclusive).
+ *
+ * For example, ilogb(8.5f) returns 3.
+ *
+ * Because of the difference in mantissa, this number is one less than is returned by frexp().
+ *
+ * logb() is similar but returns a float.
+ */
+extern int __attribute__((const, overloadable))
+    ilogb(float v);
+
+extern int2 __attribute__((const, overloadable))
+    ilogb(float2 v);
+
+extern int3 __attribute__((const, overloadable))
+    ilogb(float3 v);
+
+extern int4 __attribute__((const, overloadable))
+    ilogb(float4 v);
+
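+/*
+ * Illustrative usage sketch, not part of the upstream header: because ilogb()
+ * normalizes the mantissa to [1, 2) while frexp() normalizes it to [0.5, 1),
+ * the two reported exponents differ by exactly one.  The helper name is
+ * hypothetical.
+ */
+static inline int example_ilogb_vs_frexp_(float v) {
+    int frexp_exponent;
+    frexp(v, &frexp_exponent);
+    // For v == 8.5f: ilogb(v) == 3 and frexp_exponent == 4.
+    return frexp_exponent - ilogb(v);  // 1 for normal, non-zero inputs
+}
+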
+/*
+ * ldexp: Creates a floating point from mantissa and exponent
+ *
+ * Returns the floating point created from the mantissa and exponent,
+ * i.e. (mantissa * 2 ^ exponent).
+ *
+ * See frexp() for the reverse operation.
+ *
+ * Parameters:
+ *   mantissa: Mantissa.
+ *   exponent: Exponent, a single component or matching vector.
+ */
+extern float __attribute__((const, overloadable))
+    ldexp(float mantissa, int exponent);
+
+extern float2 __attribute__((const, overloadable))
+    ldexp(float2 mantissa, int2 exponent);
+
+extern float3 __attribute__((const, overloadable))
+    ldexp(float3 mantissa, int3 exponent);
+
+extern float4 __attribute__((const, overloadable))
+    ldexp(float4 mantissa, int4 exponent);
+
+extern float2 __attribute__((const, overloadable))
+    ldexp(float2 mantissa, int exponent);
+
+extern float3 __attribute__((const, overloadable))
+    ldexp(float3 mantissa, int exponent);
+
+extern float4 __attribute__((const, overloadable))
+    ldexp(float4 mantissa, int exponent);
+
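+/*
+ * Illustrative usage sketch, not part of the upstream header: ldexp() is the
+ * inverse of frexp(), so decomposing and reassembling reproduces the input.
+ * The helper name is hypothetical.
+ */
+static inline float example_frexp_ldexp_roundtrip_(float v) {
+    int exponent;
+    float mantissa = frexp(v, &exponent);
+    return ldexp(mantissa, exponent);  // equals v
+}
+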
+/*
+ * lgamma: Natural logarithm of the gamma function
+ *
+ * Returns the natural logarithm of the absolute value of the gamma function,
+ * i.e. log(fabs(tgamma(v))).
+ *
+ * See also tgamma().
+ *
+ * Parameters:
+ *   sign_of_gamma: If sign_of_gamma is not null, *sign_of_gamma will be set to -1.f if the gamma of v is negative, otherwise to 1.f.
+ */
+extern float __attribute__((const, overloadable))
+    lgamma(float v);
+
+extern float2 __attribute__((const, overloadable))
+    lgamma(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    lgamma(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    lgamma(float4 v);
+
+extern float __attribute__((overloadable))
+    lgamma(float v, int* sign_of_gamma);
+
+extern float2 __attribute__((overloadable))
+    lgamma(float2 v, int2* sign_of_gamma);
+
+extern float3 __attribute__((overloadable))
+    lgamma(float3 v, int3* sign_of_gamma);
+
+extern float4 __attribute__((overloadable))
+    lgamma(float4 v, int4* sign_of_gamma);
+
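+/*
+ * Illustrative sketch, not part of the upstream header: the two-argument
+ * overload of lgamma() reports the sign of gamma separately, which lets the
+ * signed magnitude be reconstructed from the logarithm.  The helper name is
+ * hypothetical, and exp() is assumed to be declared earlier in this header.
+ */
+static inline float example_signed_gamma_magnitude_(float v) {
+    int sign;                          // receives -1 or 1
+    float log_abs_gamma = lgamma(v, &sign);
+    return sign * exp(log_abs_gamma);  // approximately the gamma of v
+}
+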
+/*
+ * log: Natural logarithm
+ *
+ * Returns the natural logarithm.
+ *
+ * See also native_log().
+ */
+extern float __attribute__((const, overloadable))
+    log(float v);
+
+extern float2 __attribute__((const, overloadable))
+    log(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    log(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    log(float4 v);
+
+/*
+ * log10: Base 10 logarithm
+ *
+ * Returns the base 10 logarithm.
+ *
+ * See also native_log10().
+ */
+extern float __attribute__((const, overloadable))
+    log10(float v);
+
+extern float2 __attribute__((const, overloadable))
+    log10(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    log10(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    log10(float4 v);
+
+/*
+ * log1p: Natural logarithm of a value plus 1
+ *
+ * Returns the natural logarithm of (v + 1.f).
+ *
+ * See also native_log1p().
+ */
+extern float __attribute__((const, overloadable))
+    log1p(float v);
+
+extern float2 __attribute__((const, overloadable))
+    log1p(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    log1p(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    log1p(float4 v);
+
+/*
+ * log2: Base 2 logarithm
+ *
+ * Returns the base 2 logarithm.
+ *
+ * See also native_log2().
+ */
+extern float __attribute__((const, overloadable))
+    log2(float v);
+
+extern float2 __attribute__((const, overloadable))
+    log2(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    log2(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    log2(float4 v);
+
+/*
+ * logb: Base two exponent
+ *
+ * Returns the base two exponent of a value, where the mantissa is between
+ * 1.f (inclusive) and 2.f (exclusive).
+ *
+ * For example, logb(8.5f) returns 3.f.
+ *
+ * Because of the difference in mantissa, this number is one less than is returned by frexp().
+ *
+ * ilogb() is similar but returns an integer.
+ */
+extern float __attribute__((const, overloadable))
+    logb(float v);
+
+extern float2 __attribute__((const, overloadable))
+    logb(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    logb(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    logb(float4 v);
+
+/*
+ * mad: Multiply and add
+ *
+ * Multiply and add.  Returns (multiplicand1 * multiplicand2) + offset.
+ *
+ * This function is similar to fma().  fma() retains full precision of the multiplied result
+ * and rounds only after the addition.  mad() rounds after the multiplication and the addition.
+ * In rs_fp_relaxed mode, mad() may not do the rounding after multiplication.
+ */
+extern float __attribute__((const, overloadable))
+    mad(float multiplicand1, float multiplicand2, float offset);
+
+extern float2 __attribute__((const, overloadable))
+    mad(float2 multiplicand1, float2 multiplicand2, float2 offset);
+
+extern float3 __attribute__((const, overloadable))
+    mad(float3 multiplicand1, float3 multiplicand2, float3 offset);
+
+extern float4 __attribute__((const, overloadable))
+    mad(float4 multiplicand1, float4 multiplicand2, float4 offset);
+
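+/*
+ * Illustrative sketch, not part of the upstream header: fma() and mad()
+ * compute the same expression and differ only in when rounding happens, so
+ * mad() may be faster while fma() is the more precise choice.  The helper
+ * name is hypothetical.
+ */
+static inline float example_fma_vs_mad_(float m1, float m2, float offset) {
+    float precise = fma(m1, m2, offset);  // rounds once, after the addition
+    float fast    = mad(m1, m2, offset);  // may also round after the multiplication
+    return precise - fast;                // typically 0.f or only a few ULPs apart
+}
+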
+/*
+ * max: Maximum
+ *
+ * Returns the maximum value of two arguments.
+ */
+extern float __attribute__((const, overloadable))
+    max(float a, float b);
+
+extern float2 __attribute__((const, overloadable))
+    max(float2 a, float2 b);
+
+extern float3 __attribute__((const, overloadable))
+    max(float3 a, float3 b);
+
+extern float4 __attribute__((const, overloadable))
+    max(float4 a, float4 b);
+
+extern float2 __attribute__((const, overloadable))
+    max(float2 a, float b);
+
+extern float3 __attribute__((const, overloadable))
+    max(float3 a, float b);
+
+extern float4 __attribute__((const, overloadable))
+    max(float4 a, float b);
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline char __attribute__((const, overloadable))
+    max(char a, char b) {
+    return (a > b ? a : b);
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline uchar __attribute__((const, overloadable))
+    max(uchar a, uchar b) {
+    return (a > b ? a : b);
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline short __attribute__((const, overloadable))
+    max(short a, short b) {
+    return (a > b ? a : b);
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline ushort __attribute__((const, overloadable))
+    max(ushort a, ushort b) {
+    return (a > b ? a : b);
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline int __attribute__((const, overloadable))
+    max(int a, int b) {
+    return (a > b ? a : b);
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline uint __attribute__((const, overloadable))
+    max(uint a, uint b) {
+    return (a > b ? a : b);
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline char2 __attribute__((const, overloadable))
+    max(char2 a, char2 b) {
+    char2 tmp;
+    tmp.x = (a.x > b.x ? a.x : b.x);
+    tmp.y = (a.y > b.y ? a.y : b.y);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline uchar2 __attribute__((const, overloadable))
+    max(uchar2 a, uchar2 b) {
+    uchar2 tmp;
+    tmp.x = (a.x > b.x ? a.x : b.x);
+    tmp.y = (a.y > b.y ? a.y : b.y);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline short2 __attribute__((const, overloadable))
+    max(short2 a, short2 b) {
+    short2 tmp;
+    tmp.x = (a.x > b.x ? a.x : b.x);
+    tmp.y = (a.y > b.y ? a.y : b.y);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline ushort2 __attribute__((const, overloadable))
+    max(ushort2 a, ushort2 b) {
+    ushort2 tmp;
+    tmp.x = (a.x > b.x ? a.x : b.x);
+    tmp.y = (a.y > b.y ? a.y : b.y);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline int2 __attribute__((const, overloadable))
+    max(int2 a, int2 b) {
+    int2 tmp;
+    tmp.x = (a.x > b.x ? a.x : b.x);
+    tmp.y = (a.y > b.y ? a.y : b.y);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline uint2 __attribute__((const, overloadable))
+    max(uint2 a, uint2 b) {
+    uint2 tmp;
+    tmp.x = (a.x > b.x ? a.x : b.x);
+    tmp.y = (a.y > b.y ? a.y : b.y);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline char3 __attribute__((const, overloadable))
+    max(char3 a, char3 b) {
+    char3 tmp;
+    tmp.x = (a.x > b.x ? a.x : b.x);
+    tmp.y = (a.y > b.y ? a.y : b.y);
+    tmp.z = (a.z > b.z ? a.z : b.z);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline uchar3 __attribute__((const, overloadable))
+    max(uchar3 a, uchar3 b) {
+    uchar3 tmp;
+    tmp.x = (a.x > b.x ? a.x : b.x);
+    tmp.y = (a.y > b.y ? a.y : b.y);
+    tmp.z = (a.z > b.z ? a.z : b.z);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline short3 __attribute__((const, overloadable))
+    max(short3 a, short3 b) {
+    short3 tmp;
+    tmp.x = (a.x > b.x ? a.x : b.x);
+    tmp.y = (a.y > b.y ? a.y : b.y);
+    tmp.z = (a.z > b.z ? a.z : b.z);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline ushort3 __attribute__((const, overloadable))
+    max(ushort3 a, ushort3 b) {
+    ushort3 tmp;
+    tmp.x = (a.x > b.x ? a.x : b.x);
+    tmp.y = (a.y > b.y ? a.y : b.y);
+    tmp.z = (a.z > b.z ? a.z : b.z);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline int3 __attribute__((const, overloadable))
+    max(int3 a, int3 b) {
+    int3 tmp;
+    tmp.x = (a.x > b.x ? a.x : b.x);
+    tmp.y = (a.y > b.y ? a.y : b.y);
+    tmp.z = (a.z > b.z ? a.z : b.z);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline uint3 __attribute__((const, overloadable))
+    max(uint3 a, uint3 b) {
+    uint3 tmp;
+    tmp.x = (a.x > b.x ? a.x : b.x);
+    tmp.y = (a.y > b.y ? a.y : b.y);
+    tmp.z = (a.z > b.z ? a.z : b.z);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline char4 __attribute__((const, overloadable))
+    max(char4 a, char4 b) {
+    char4 tmp;
+    tmp.x = (a.x > b.x ? a.x : b.x);
+    tmp.y = (a.y > b.y ? a.y : b.y);
+    tmp.z = (a.z > b.z ? a.z : b.z);
+    tmp.w = (a.w > b.w ? a.w : b.w);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline uchar4 __attribute__((const, overloadable))
+    max(uchar4 a, uchar4 b) {
+    uchar4 tmp;
+    tmp.x = (a.x > b.x ? a.x : b.x);
+    tmp.y = (a.y > b.y ? a.y : b.y);
+    tmp.z = (a.z > b.z ? a.z : b.z);
+    tmp.w = (a.w > b.w ? a.w : b.w);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline short4 __attribute__((const, overloadable))
+    max(short4 a, short4 b) {
+    short4 tmp;
+    tmp.x = (a.x > b.x ? a.x : b.x);
+    tmp.y = (a.y > b.y ? a.y : b.y);
+    tmp.z = (a.z > b.z ? a.z : b.z);
+    tmp.w = (a.w > b.w ? a.w : b.w);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline ushort4 __attribute__((const, overloadable))
+    max(ushort4 a, ushort4 b) {
+    ushort4 tmp;
+    tmp.x = (a.x > b.x ? a.x : b.x);
+    tmp.y = (a.y > b.y ? a.y : b.y);
+    tmp.z = (a.z > b.z ? a.z : b.z);
+    tmp.w = (a.w > b.w ? a.w : b.w);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline int4 __attribute__((const, overloadable))
+    max(int4 a, int4 b) {
+    int4 tmp;
+    tmp.x = (a.x > b.x ? a.x : b.x);
+    tmp.y = (a.y > b.y ? a.y : b.y);
+    tmp.z = (a.z > b.z ? a.z : b.z);
+    tmp.w = (a.w > b.w ? a.w : b.w);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline uint4 __attribute__((const, overloadable))
+    max(uint4 a, uint4 b) {
+    uint4 tmp;
+    tmp.x = (a.x > b.x ? a.x : b.x);
+    tmp.y = (a.y > b.y ? a.y : b.y);
+    tmp.z = (a.z > b.z ? a.z : b.z);
+    tmp.w = (a.w > b.w ? a.w : b.w);
+    return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern char __attribute__((const, overloadable))
+    max(char a, char b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern char2 __attribute__((const, overloadable))
+    max(char2 a, char2 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern char3 __attribute__((const, overloadable))
+    max(char3 a, char3 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern char4 __attribute__((const, overloadable))
+    max(char4 a, char4 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uchar __attribute__((const, overloadable))
+    max(uchar a, uchar b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uchar2 __attribute__((const, overloadable))
+    max(uchar2 a, uchar2 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uchar3 __attribute__((const, overloadable))
+    max(uchar3 a, uchar3 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uchar4 __attribute__((const, overloadable))
+    max(uchar4 a, uchar4 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern short __attribute__((const, overloadable))
+    max(short a, short b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern short2 __attribute__((const, overloadable))
+    max(short2 a, short2 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern short3 __attribute__((const, overloadable))
+    max(short3 a, short3 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern short4 __attribute__((const, overloadable))
+    max(short4 a, short4 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ushort __attribute__((const, overloadable))
+    max(ushort a, ushort b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ushort2 __attribute__((const, overloadable))
+    max(ushort2 a, ushort2 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ushort3 __attribute__((const, overloadable))
+    max(ushort3 a, ushort3 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ushort4 __attribute__((const, overloadable))
+    max(ushort4 a, ushort4 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern int __attribute__((const, overloadable))
+    max(int a, int b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern int2 __attribute__((const, overloadable))
+    max(int2 a, int2 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern int3 __attribute__((const, overloadable))
+    max(int3 a, int3 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern int4 __attribute__((const, overloadable))
+    max(int4 a, int4 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uint __attribute__((const, overloadable))
+    max(uint a, uint b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uint2 __attribute__((const, overloadable))
+    max(uint2 a, uint2 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uint3 __attribute__((const, overloadable))
+    max(uint3 a, uint3 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uint4 __attribute__((const, overloadable))
+    max(uint4 a, uint4 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long __attribute__((const, overloadable))
+    max(long a, long b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long2 __attribute__((const, overloadable))
+    max(long2 a, long2 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long3 __attribute__((const, overloadable))
+    max(long3 a, long3 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long4 __attribute__((const, overloadable))
+    max(long4 a, long4 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong __attribute__((const, overloadable))
+    max(ulong a, ulong b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong2 __attribute__((const, overloadable))
+    max(ulong2 a, ulong2 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong3 __attribute__((const, overloadable))
+    max(ulong3 a, ulong3 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong4 __attribute__((const, overloadable))
+    max(ulong4 a, ulong4 b);
+#endif
+
+/*
+ * min: Minimum
+ *
+ * Returns the minimum value of two arguments.
+ */
+extern float __attribute__((const, overloadable))
+    min(float a, float b);
+
+extern float2 __attribute__((const, overloadable))
+    min(float2 a, float2 b);
+
+extern float3 __attribute__((const, overloadable))
+    min(float3 a, float3 b);
+
+extern float4 __attribute__((const, overloadable))
+    min(float4 a, float4 b);
+
+extern float2 __attribute__((const, overloadable))
+    min(float2 a, float b);
+
+extern float3 __attribute__((const, overloadable))
+    min(float3 a, float b);
+
+extern float4 __attribute__((const, overloadable))
+    min(float4 a, float b);
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline char __attribute__((const, overloadable))
+    min(char a, char b) {
+    return (a < b ? a : b);
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline uchar __attribute__((const, overloadable))
+    min(uchar a, uchar b) {
+    return (a < b ? a : b);
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline short __attribute__((const, overloadable))
+    min(short a, short b) {
+    return (a < b ? a : b);
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline ushort __attribute__((const, overloadable))
+    min(ushort a, ushort b) {
+    return (a < b ? a : b);
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline int __attribute__((const, overloadable))
+    min(int a, int b) {
+    return (a < b ? a : b);
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline uint __attribute__((const, overloadable))
+    min(uint a, uint b) {
+    return (a < b ? a : b);
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline char2 __attribute__((const, overloadable))
+    min(char2 a, char2 b) {
+    char2 tmp;
+    tmp.x = (a.x < b.x ? a.x : b.x);
+    tmp.y = (a.y < b.y ? a.y : b.y);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline uchar2 __attribute__((const, overloadable))
+    min(uchar2 a, uchar2 b) {
+    uchar2 tmp;
+    tmp.x = (a.x < b.x ? a.x : b.x);
+    tmp.y = (a.y < b.y ? a.y : b.y);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline short2 __attribute__((const, overloadable))
+    min(short2 a, short2 b) {
+    short2 tmp;
+    tmp.x = (a.x < b.x ? a.x : b.x);
+    tmp.y = (a.y < b.y ? a.y : b.y);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline ushort2 __attribute__((const, overloadable))
+    min(ushort2 a, ushort2 b) {
+    ushort2 tmp;
+    tmp.x = (a.x < b.x ? a.x : b.x);
+    tmp.y = (a.y < b.y ? a.y : b.y);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline int2 __attribute__((const, overloadable))
+    min(int2 a, int2 b) {
+    int2 tmp;
+    tmp.x = (a.x < b.x ? a.x : b.x);
+    tmp.y = (a.y < b.y ? a.y : b.y);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline uint2 __attribute__((const, overloadable))
+    min(uint2 a, uint2 b) {
+    uint2 tmp;
+    tmp.x = (a.x < b.x ? a.x : b.x);
+    tmp.y = (a.y < b.y ? a.y : b.y);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline char3 __attribute__((const, overloadable))
+    min(char3 a, char3 b) {
+    char3 tmp;
+    tmp.x = (a.x < b.x ? a.x : b.x);
+    tmp.y = (a.y < b.y ? a.y : b.y);
+    tmp.z = (a.z < b.z ? a.z : b.z);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline uchar3 __attribute__((const, overloadable))
+    min(uchar3 a, uchar3 b) {
+    uchar3 tmp;
+    tmp.x = (a.x < b.x ? a.x : b.x);
+    tmp.y = (a.y < b.y ? a.y : b.y);
+    tmp.z = (a.z < b.z ? a.z : b.z);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline short3 __attribute__((const, overloadable))
+    min(short3 a, short3 b) {
+    short3 tmp;
+    tmp.x = (a.x < b.x ? a.x : b.x);
+    tmp.y = (a.y < b.y ? a.y : b.y);
+    tmp.z = (a.z < b.z ? a.z : b.z);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline ushort3 __attribute__((const, overloadable))
+    min(ushort3 a, ushort3 b) {
+    ushort3 tmp;
+    tmp.x = (a.x < b.x ? a.x : b.x);
+    tmp.y = (a.y < b.y ? a.y : b.y);
+    tmp.z = (a.z < b.z ? a.z : b.z);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline int3 __attribute__((const, overloadable))
+    min(int3 a, int3 b) {
+    int3 tmp;
+    tmp.x = (a.x < b.x ? a.x : b.x);
+    tmp.y = (a.y < b.y ? a.y : b.y);
+    tmp.z = (a.z < b.z ? a.z : b.z);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline uint3 __attribute__((const, overloadable))
+    min(uint3 a, uint3 b) {
+    uint3 tmp;
+    tmp.x = (a.x < b.x ? a.x : b.x);
+    tmp.y = (a.y < b.y ? a.y : b.y);
+    tmp.z = (a.z < b.z ? a.z : b.z);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline char4 __attribute__((const, overloadable))
+    min(char4 a, char4 b) {
+    char4 tmp;
+    tmp.x = (a.x < b.x ? a.x : b.x);
+    tmp.y = (a.y < b.y ? a.y : b.y);
+    tmp.z = (a.z < b.z ? a.z : b.z);
+    tmp.w = (a.w < b.w ? a.w : b.w);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline uchar4 __attribute__((const, overloadable))
+    min(uchar4 a, uchar4 b) {
+    uchar4 tmp;
+    tmp.x = (a.x < b.x ? a.x : b.x);
+    tmp.y = (a.y < b.y ? a.y : b.y);
+    tmp.z = (a.z < b.z ? a.z : b.z);
+    tmp.w = (a.w < b.w ? a.w : b.w);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline short4 __attribute__((const, overloadable))
+    min(short4 a, short4 b) {
+    short4 tmp;
+    tmp.x = (a.x < b.x ? a.x : b.x);
+    tmp.y = (a.y < b.y ? a.y : b.y);
+    tmp.z = (a.z < b.z ? a.z : b.z);
+    tmp.w = (a.w < b.w ? a.w : b.w);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline ushort4 __attribute__((const, overloadable))
+    min(ushort4 a, ushort4 b) {
+    ushort4 tmp;
+    tmp.x = (a.x < b.x ? a.x : b.x);
+    tmp.y = (a.y < b.y ? a.y : b.y);
+    tmp.z = (a.z < b.z ? a.z : b.z);
+    tmp.w = (a.w < b.w ? a.w : b.w);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline int4 __attribute__((const, overloadable))
+    min(int4 a, int4 b) {
+    int4 tmp;
+    tmp.x = (a.x < b.x ? a.x : b.x);
+    tmp.y = (a.y < b.y ? a.y : b.y);
+    tmp.z = (a.z < b.z ? a.z : b.z);
+    tmp.w = (a.w < b.w ? a.w : b.w);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline uint4 __attribute__((const, overloadable))
+    min(uint4 a, uint4 b) {
+    uint4 tmp;
+    tmp.x = (a.x < b.x ? a.x : b.x);
+    tmp.y = (a.y < b.y ? a.y : b.y);
+    tmp.z = (a.z < b.z ? a.z : b.z);
+    tmp.w = (a.w < b.w ? a.w : b.w);
+    return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern char __attribute__((const, overloadable))
+    min(char a, char b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern char2 __attribute__((const, overloadable))
+    min(char2 a, char2 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern char3 __attribute__((const, overloadable))
+    min(char3 a, char3 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern char4 __attribute__((const, overloadable))
+    min(char4 a, char4 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uchar __attribute__((const, overloadable))
+    min(uchar a, uchar b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uchar2 __attribute__((const, overloadable))
+    min(uchar2 a, uchar2 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uchar3 __attribute__((const, overloadable))
+    min(uchar3 a, uchar3 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uchar4 __attribute__((const, overloadable))
+    min(uchar4 a, uchar4 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern short __attribute__((const, overloadable))
+    min(short a, short b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern short2 __attribute__((const, overloadable))
+    min(short2 a, short2 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern short3 __attribute__((const, overloadable))
+    min(short3 a, short3 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern short4 __attribute__((const, overloadable))
+    min(short4 a, short4 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ushort __attribute__((const, overloadable))
+    min(ushort a, ushort b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ushort2 __attribute__((const, overloadable))
+    min(ushort2 a, ushort2 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ushort3 __attribute__((const, overloadable))
+    min(ushort3 a, ushort3 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ushort4 __attribute__((const, overloadable))
+    min(ushort4 a, ushort4 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern int __attribute__((const, overloadable))
+    min(int a, int b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern int2 __attribute__((const, overloadable))
+    min(int2 a, int2 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern int3 __attribute__((const, overloadable))
+    min(int3 a, int3 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern int4 __attribute__((const, overloadable))
+    min(int4 a, int4 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uint __attribute__((const, overloadable))
+    min(uint a, uint b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uint2 __attribute__((const, overloadable))
+    min(uint2 a, uint2 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uint3 __attribute__((const, overloadable))
+    min(uint3 a, uint3 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uint4 __attribute__((const, overloadable))
+    min(uint4 a, uint4 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long __attribute__((const, overloadable))
+    min(long a, long b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long2 __attribute__((const, overloadable))
+    min(long2 a, long2 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long3 __attribute__((const, overloadable))
+    min(long3 a, long3 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long4 __attribute__((const, overloadable))
+    min(long4 a, long4 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong __attribute__((const, overloadable))
+    min(ulong a, ulong b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong2 __attribute__((const, overloadable))
+    min(ulong2 a, ulong2 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong3 __attribute__((const, overloadable))
+    min(ulong3 a, ulong3 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong4 __attribute__((const, overloadable))
+    min(ulong4 a, ulong4 b);
+#endif
+
+/*
+ * mix: Mixes two values
+ *
+ * Returns start + ((stop - start) * fraction).
+ *
+ * This can be useful for mixing two values.  For example, to create a new color that is
+ * 40% color1 and 60% color2, use mix(color1, color2, 0.6f).
+ */
+extern float __attribute__((const, overloadable))
+    mix(float start, float stop, float fraction);
+
+extern float2 __attribute__((const, overloadable))
+    mix(float2 start, float2 stop, float2 fraction);
+
+extern float3 __attribute__((const, overloadable))
+    mix(float3 start, float3 stop, float3 fraction);
+
+extern float4 __attribute__((const, overloadable))
+    mix(float4 start, float4 stop, float4 fraction);
+
+extern float2 __attribute__((const, overloadable))
+    mix(float2 start, float2 stop, float fraction);
+
+extern float3 __attribute__((const, overloadable))
+    mix(float3 start, float3 stop, float fraction);
+
+extern float4 __attribute__((const, overloadable))
+    mix(float4 start, float4 stop, float fraction);
+
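+/*
+ * Illustrative usage sketch, not part of the upstream header: mix() is a
+ * linear interpolation, so a fraction of 0.6f yields 40% of the first value
+ * and 60% of the second.  The helper name is hypothetical.
+ */
+static inline float4 example_blend_colors_(float4 color1, float4 color2) {
+    return mix(color1, color2, 0.6f);  // 0.4f * color1 + 0.6f * color2
+}
+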
+/*
+ * modf: Integral and fractional components
+ *
+ * Returns the integral and fractional components of a number.
+ *
+ * Both components will have the same sign as x.  For example, for an input of -3.72f,
+ * *integral_part will be set to -3.f and -.72f will be returned.
+ *
+ * Parameters:
+ *   v: Source value.
+ *   integral_part: *integral_part will be set to the integral portion of the number.
+ *
+ * Returns: Floating point portion of the value.
+ */
+extern float __attribute__((overloadable))
+    modf(float v, float* integral_part);
+
+extern float2 __attribute__((overloadable))
+    modf(float2 v, float2* integral_part);
+
+extern float3 __attribute__((overloadable))
+    modf(float3 v, float3* integral_part);
+
+extern float4 __attribute__((overloadable))
+    modf(float4 v, float4* integral_part);
+
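+/*
+ * Illustrative usage sketch, not part of the upstream header: modf() splits a
+ * value into integral and fractional parts that carry the same sign.  The
+ * helper name is hypothetical.
+ */
+static inline float example_modf_split_(float v, float* integral_out) {
+    // For v == -3.72f: *integral_out == -3.f and the return value is -.72f.
+    return modf(v, integral_out);
+}
+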
+/*
+ * nan: Not a Number
+ *
+ * Returns a NaN value (Not a Number).
+ *
+ * Parameters:
+ *   v: Not used.
+ */
+extern float __attribute__((const, overloadable))
+    nan(uint v);
+
+/*
+ * native_acos: Approximate inverse cosine
+ *
+ * Returns the approximate inverse cosine, in radians.
+ *
+ * This function yields undefined results for input values less than -1 or greater than 1.
+ *
+ * See also acos().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_acos(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_acos(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_acos(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_acos(float4 v);
+#endif
+
+/*
+ * native_acosh: Approximate inverse hyperbolic cosine
+ *
+ * Returns the approximate inverse hyperbolic cosine, in radians.
+ *
+ * See also acosh().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_acosh(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_acosh(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_acosh(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_acosh(float4 v);
+#endif
+
+/*
+ * native_acospi: Approximate inverse cosine divided by pi
+ *
+ * Returns the approximate inverse cosine in radians, divided by pi.
+ *
+ * To get an inverse cosine measured in degrees, use acospi(a) * 180.f.
+ *
+ * This function yields undefined results for input values less than -1 or greater than 1.
+ *
+ * See also acospi().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_acospi(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_acospi(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_acospi(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_acospi(float4 v);
+#endif
+
+/*
+ * native_asin: Approximate inverse sine
+ *
+ * Returns the approximate inverse sine, in radians.
+ *
+ * This function yields undefined results for input values less than -1 or greater than 1.
+ *
+ * See also asin().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_asin(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_asin(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_asin(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_asin(float4 v);
+#endif
+
+/*
+ * native_asinh: Approximate inverse hyperbolic sine
+ *
+ * Returns the approximate inverse hyperbolic sine, in radians.
+ *
+ * See also asinh().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_asinh(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_asinh(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_asinh(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_asinh(float4 v);
+#endif
+
+/*
+ * native_asinpi: Approximate inverse sine divided by pi
+ *
+ * Returns the approximate inverse sine in radians, divided by pi.
+ *
+ * To get an inverse sine measured in degrees, use asinpi(a) * 180.f.
+ *
+ * This function yields undefined results for input values less than -1 or greater than 1.
+ *
+ * See also asinpi().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_asinpi(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_asinpi(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_asinpi(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_asinpi(float4 v);
+#endif
+
+/*
+ * native_atan: Approximate inverse tangent
+ *
+ * Returns the approximate inverse tangent, in radians.
+ *
+ * See also atan().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_atan(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_atan(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_atan(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_atan(float4 v);
+#endif
+
+/*
+ * native_atan2: Approximate inverse tangent of a ratio
+ *
+ * Returns the approximate inverse tangent of (numerator / denominator), in radians.
+ *
+ * See also atan2().
+ *
+ * Parameters:
+ *   numerator: Numerator.
+ *   denominator: Denominator.  Can be 0.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_atan2(float numerator, float denominator);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_atan2(float2 numerator, float2 denominator);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_atan2(float3 numerator, float3 denominator);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_atan2(float4 numerator, float4 denominator);
+#endif
+
+/*
+ * native_atan2pi: Approximate inverse tangent of a ratio, divided by pi
+ *
+ * Returns the approximate inverse tangent of (numerator / denominator),
+ * in radians, divided by pi.
+ *
+ * To get an inverse tangent measured in degrees, use atan2pi(n, d) * 180.f.
+ *
+ * See also atan2pi().
+ *
+ * Parameters:
+ *   numerator: Numerator.
+ *   denominator: Denominator.  Can be 0.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_atan2pi(float numerator, float denominator);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_atan2pi(float2 numerator, float2 denominator);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_atan2pi(float3 numerator, float3 denominator);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_atan2pi(float4 numerator, float4 denominator);
+#endif
+
+/*
+ * native_atanh: Approximate inverse hyperbolic tangent
+ *
+ * Returns the approximate inverse hyperbolic tangent, in radians.
+ *
+ * See also atanh().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_atanh(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_atanh(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_atanh(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_atanh(float4 v);
+#endif
+
+/*
+ * native_atanpi: Approximate inverse tangent divided by pi
+ *
+ * Returns the approximate inverse tangent in radians, divided by pi.
+ *
+ * To get an inverse tangent measured in degrees, use atanpi(a) * 180.f.
+ *
+ * See also atanpi().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_atanpi(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_atanpi(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_atanpi(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_atanpi(float4 v);
+#endif
+
+/*
+ * native_cbrt: Approximate cube root
+ *
+ * Returns the approximate cube root.
+ *
+ * See also cbrt().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_cbrt(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_cbrt(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_cbrt(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_cbrt(float4 v);
+#endif
+
+/*
+ * native_cos: Approximate cosine
+ *
+ * Returns the approximate cosine of an angle measured in radians.
+ *
+ * See also cos().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_cos(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_cos(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_cos(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_cos(float4 v);
+#endif
+
+/*
+ * native_cosh: Approximate hyperbolic cosine
+ *
+ * Returns the approximate hyperbolic cosine.
+ *
+ * See also cosh().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_cosh(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_cosh(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_cosh(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_cosh(float4 v);
+#endif
+
+/*
+ * native_cospi: Approximate cosine of a number multiplied by pi
+ *
+ * Returns the approximate cosine of (v * pi), where (v * pi) is measured in radians.
+ *
+ * To get the cosine of a value measured in degrees, call cospi(v / 180.f).
+ *
+ * See also cospi().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_cospi(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_cospi(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_cospi(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_cospi(float4 v);
+#endif
+
+/*
+ * native_divide: Approximate division
+ *
+ * Computes the approximate division of two values.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_divide(float left_vector, float right_vector);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_divide(float2 left_vector, float2 right_vector);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_divide(float3 left_vector, float3 right_vector);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_divide(float4 left_vector, float4 right_vector);
+#endif
+
+/*
+ * native_exp: Approximate e raised to a number
+ *
+ * Fast approximate exp.
+ *
+ * It is valid for inputs from -86.f to 86.f.  The precision is no worse than what would be
+ * expected from using 16 bit floating point values.
+ *
+ * See also exp().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float __attribute__((const, overloadable))
+    native_exp(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float2 __attribute__((const, overloadable))
+    native_exp(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float3 __attribute__((const, overloadable))
+    native_exp(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float4 __attribute__((const, overloadable))
+    native_exp(float4 v);
+#endif
+
+/*
+ * native_exp10: Approximate 10 raised to a number
+ *
+ * Fast approximate exp10.
+ *
+ * It is valid for inputs from -37.f to 37.f.  The precision is no worse than what would be
+ * expected from using 16 bit floating point values.
+ *
+ * See also exp10().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float __attribute__((const, overloadable))
+    native_exp10(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float2 __attribute__((const, overloadable))
+    native_exp10(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float3 __attribute__((const, overloadable))
+    native_exp10(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float4 __attribute__((const, overloadable))
+    native_exp10(float4 v);
+#endif
+
+/*
+ * native_exp2: Approximate 2 raised to a number
+ *
+ * Fast approximate exp2.
+ *
+ * It is valid for inputs from -125.f to 125.f.  The precision is no worse than what would be
+ * expected from using 16 bit floating point values.
+ *
+ * See also exp2().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float __attribute__((const, overloadable))
+    native_exp2(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float2 __attribute__((const, overloadable))
+    native_exp2(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float3 __attribute__((const, overloadable))
+    native_exp2(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float4 __attribute__((const, overloadable))
+    native_exp2(float4 v);
+#endif
+
+/*
+ * native_expm1: Approximate e raised to a number minus one
+ *
+ * Returns the approximate (e ^ v) - 1.
+ *
+ * See also expm1().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_expm1(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_expm1(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_expm1(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_expm1(float4 v);
+#endif
+
+/*
+ * native_hypot: Approximate hypotenuse
+ *
+ * Returns the approximate native_sqrt(a * a + b * b).
+ *
+ * See also hypot().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_hypot(float a, float b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_hypot(float2 a, float2 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_hypot(float3 a, float3 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_hypot(float4 a, float4 b);
+#endif
+
+/*
+ * native_log: Approximate natural logarithm
+ *
+ * Fast approximate log.
+ *
+ * It is not accurate for values very close to zero.
+ *
+ * See also log().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float __attribute__((const, overloadable))
+    native_log(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float2 __attribute__((const, overloadable))
+    native_log(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float3 __attribute__((const, overloadable))
+    native_log(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float4 __attribute__((const, overloadable))
+    native_log(float4 v);
+#endif
+
+/*
+ * native_log10: Approximate base 10 logarithm
+ *
+ * Fast approximate log10.
+ *
+ * It is not accurate for values very close to zero.
+ *
+ * See also log10().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float __attribute__((const, overloadable))
+    native_log10(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float2 __attribute__((const, overloadable))
+    native_log10(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float3 __attribute__((const, overloadable))
+    native_log10(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float4 __attribute__((const, overloadable))
+    native_log10(float4 v);
+#endif
+
+/*
+ * native_log1p: Approximate natural logarithm of a value plus 1
+ *
+ * Returns the approximate natural logarithm of (v + 1.0f).
+ *
+ * See also log1p().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_log1p(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_log1p(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_log1p(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_log1p(float4 v);
+#endif
+
+/*
+ * native_log2: Approximate base 2 logarithm
+ *
+ * Fast approximate log2.
+ *
+ * It is not accurate for values very close to zero.
+ *
+ * See also log2().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float __attribute__((const, overloadable))
+    native_log2(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float2 __attribute__((const, overloadable))
+    native_log2(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float3 __attribute__((const, overloadable))
+    native_log2(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float4 __attribute__((const, overloadable))
+    native_log2(float4 v);
+#endif
+
+/*
+ * native_powr: Approximate positive base raised to an exponent
+ *
+ * Fast approximate (base ^ exponent).
+ *
+ * See also powr().
+ *
+ * Parameters:
+ *   base: Must be between 0.f and 256.f.  The function is not accurate for values very close to zero.
+ *   exponent: Must be between -15.f and 15.f.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float __attribute__((const, overloadable))
+    native_powr(float base, float exponent);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float2 __attribute__((const, overloadable))
+    native_powr(float2 base, float2 exponent);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float3 __attribute__((const, overloadable))
+    native_powr(float3 base, float3 exponent);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float4 __attribute__((const, overloadable))
+    native_powr(float4 base, float4 exponent);
+#endif
+
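+/*
+ * Illustrative sketch, not part of the upstream header: native_powr() only
+ * accepts a limited input range, so one defensive pattern is to clamp the
+ * arguments into the documented bounds first.  The helper name and the choice
+ * to clamp (rather than reject) are assumptions.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+static inline float example_safe_native_powr_(float base, float exponent) {
+    base     = fmin(fmax(base, 0.f), 256.f);      // documented base range
+    exponent = fmin(fmax(exponent, -15.f), 15.f); // documented exponent range
+    return native_powr(base, exponent);
+}
+#endif
+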
+/*
+ * native_recip: Approximate reciprocal
+ *
+ * Returns the approximate reciprocal of a value.
+ *
+ * See also half_recip().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_recip(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_recip(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_recip(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_recip(float4 v);
+#endif
+
+/*
+ * native_rootn: Approximate nth root
+ *
+ * Compute the approximate Nth root of a value.
+ *
+ * See also rootn().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_rootn(float v, int n);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_rootn(float2 v, int2 n);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_rootn(float3 v, int3 n);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_rootn(float4 v, int4 n);
+#endif
+
+/*
+ * native_rsqrt: Approximate reciprocal of a square root
+ *
+ * Returns approximate (1 / sqrt(v)).
+ *
+ * See also rsqrt(), half_rsqrt().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_rsqrt(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_rsqrt(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_rsqrt(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_rsqrt(float4 v);
+#endif
+
+/*
+ * native_sin: Approximate sine
+ *
+ * Returns the approximate sine of an angle measured in radians.
+ *
+ * See also sin().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_sin(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_sin(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_sin(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_sin(float4 v);
+#endif
+
+/*
+ * native_sincos: Approximate sine and cosine
+ *
+ * Returns the approximate sine and cosine of a value.
+ *
+ * See also sincos().
+ *
+ * Parameters:
+ *   v: Incoming value in radians.
+ *   cos: *cos will be set to the cosine value.
+ *
+ * Returns: Sine.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((overloadable))
+    native_sincos(float v, float* cos);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((overloadable))
+    native_sincos(float2 v, float2* cos);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((overloadable))
+    native_sincos(float3 v, float3* cos);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((overloadable))
+    native_sincos(float4 v, float4* cos);
+#endif
+
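+/*
+ * Illustrative sketch of the out-parameter convention used by native_sincos()
+ * (and sincos()): the sine is returned and the cosine is written through the
+ * pointer.  The helper name fast_sin_cos is hypothetical.
+ *
+ *   static float2 fast_sin_cos(float angle) {  // angle in radians
+ *       float cosine;
+ *       float sine = native_sincos(angle, &cosine);
+ *       float2 result = {sine, cosine};
+ *       return result;
+ *   }
+ */
+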
+/*
+ * native_sinh: Approximate hyperbolic sine
+ *
+ * Returns the approximate hyperbolic sine of a value specified in radians.
+ *
+ * See also sinh().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_sinh(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_sinh(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_sinh(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_sinh(float4 v);
+#endif
+
+/*
+ * native_sinpi: Approximate sine of a number multiplied by pi
+ *
+ * Returns the approximate sine of (v * pi), where (v * pi) is measured in radians.
+ *
+ * To get the sine of a value measured in degrees, call sinpi(v / 180.f).
+ *
+ * See also sinpi().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_sinpi(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_sinpi(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_sinpi(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_sinpi(float4 v);
+#endif
+
+/*
+ * native_sqrt: Approximate square root
+ *
+ * Returns the approximate sqrt(v).
+ *
+ * See also sqrt(), half_sqrt().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_sqrt(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_sqrt(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_sqrt(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_sqrt(float4 v);
+#endif
+
+/*
+ * native_tan: Approximate tangent
+ *
+ * Returns the approximate tangent of an angle measured in radians.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_tan(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_tan(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_tan(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_tan(float4 v);
+#endif
+
+/*
+ * native_tanh: Approximate hyperbolic tangent
+ *
+ * Returns the approximate hyperbolic tangent of a value.
+ *
+ * See also tanh().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_tanh(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_tanh(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_tanh(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_tanh(float4 v);
+#endif
+
+/*
+ * native_tanpi: Approximate tangent of a number multiplied by pi
+ *
+ * Returns the approximate tangent of (v * pi), where (v * pi) is measured in radians.
+ *
+ * To get the tangent of a value measured in degrees, call tanpi(v / 180.f).
+ *
+ * See also tanpi().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_tanpi(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_tanpi(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_tanpi(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_tanpi(float4 v);
+#endif
+
+/*
+ * nextafter: Next floating point number
+ *
+ * Returns the next representable floating point number from v towards target.
+ *
+ * In rs_fp_relaxed mode, a denormalized input value may not yield the next denormalized
+ * value, as support of denormalized values is optional in relaxed mode.
+ */
+extern float __attribute__((const, overloadable))
+    nextafter(float v, float target);
+
+extern float2 __attribute__((const, overloadable))
+    nextafter(float2 v, float2 target);
+
+extern float3 __attribute__((const, overloadable))
+    nextafter(float3 v, float3 target);
+
+extern float4 __attribute__((const, overloadable))
+    nextafter(float4 v, float4 target);
+
+/*
+ * pow: Base raised to an exponent
+ *
+ * Returns base raised to the power exponent, i.e. base ^ exponent.
+ *
+ * pown() and powr() are similar.  pown() takes an integer exponent. powr() assumes the
+ * base to be non-negative.
+ */
+extern float __attribute__((const, overloadable))
+    pow(float base, float exponent);
+
+extern float2 __attribute__((const, overloadable))
+    pow(float2 base, float2 exponent);
+
+extern float3 __attribute__((const, overloadable))
+    pow(float3 base, float3 exponent);
+
+extern float4 __attribute__((const, overloadable))
+    pow(float4 base, float4 exponent);
+
+/*
+ * pown: Base raised to an integer exponent
+ *
+ * Returns base raised to the power exponent, i.e. base ^ exponent.
+ *
+ * pow() and powr() are similar.  They both take a float exponent. powr() also assumes the
+ * base to be non-negative.
+ */
+extern float __attribute__((const, overloadable))
+    pown(float base, int exponent);
+
+extern float2 __attribute__((const, overloadable))
+    pown(float2 base, int2 exponent);
+
+extern float3 __attribute__((const, overloadable))
+    pown(float3 base, int3 exponent);
+
+extern float4 __attribute__((const, overloadable))
+    pown(float4 base, int4 exponent);
+
+/*
+ * powr: Positive base raised to an exponent
+ *
+ * Returns base raised to the power exponent, i.e. base ^ exponent.  base must be >= 0.
+ *
+ * pow() and pown() are similar.  They both make no assumptions about the base.
+ * pow() takes a float exponent while pown() takes an integer.
+ *
+ * See also native_powr().
+ */
+extern float __attribute__((const, overloadable))
+    powr(float base, float exponent);
+
+extern float2 __attribute__((const, overloadable))
+    powr(float2 base, float2 exponent);
+
+extern float3 __attribute__((const, overloadable))
+    powr(float3 base, float3 exponent);
+
+extern float4 __attribute__((const, overloadable))
+    powr(float4 base, float4 exponent);
+
+/*
+ * radians: Converts degrees into radians
+ *
+ * Converts from degrees to radians.
+ */
+extern float __attribute__((const, overloadable))
+    radians(float v);
+
+extern float2 __attribute__((const, overloadable))
+    radians(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    radians(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    radians(float4 v);
+
+/*
+ * remainder: Remainder of a division
+ *
+ * Returns the remainder of (numerator / denominator), where the quotient is rounded towards
+ * the nearest integer.
+ *
+ * The function fmod() is similar but rounds the quotient toward zero (truncation).
+ * For example, fmod(-3.8f, 2.f) returns -1.8f (-3.8f - -1.f * 2.f)
+ * while remainder(-3.8f, 2.f) returns 0.2f (-3.8f - -2.f * 2.f).
+ */
+extern float __attribute__((const, overloadable))
+    remainder(float numerator, float denominator);
+
+extern float2 __attribute__((const, overloadable))
+    remainder(float2 numerator, float2 denominator);
+
+extern float3 __attribute__((const, overloadable))
+    remainder(float3 numerator, float3 denominator);
+
+extern float4 __attribute__((const, overloadable))
+    remainder(float4 numerator, float4 denominator);
+
+/*
+ * remquo: Remainder and quotient of a division
+ *
+ * Returns the quotient and the remainder of (numerator / denominator).
+ *
+ * Only the sign and lowest three bits of the quotient are guaranteed to be accurate.
+ *
+ * This function is useful for implementing periodic functions.  The low three bits of the
+ * quotient give the quadrant and the remainder gives the distance within the quadrant.
+ * For example, an implementation of sin(x) could call remquo(x, PI / 2.f, &quadrant)
+ * to reduce very large values of x to something within a limited range.
+ *
+ * Example: remquo(-23.5f, 8.f, &quot) sets the lowest three bits of quot to 3
+ * and the sign negative.  It returns 0.5f.
+ *
+ * Parameters:
+ *   numerator: Numerator.
+ *   denominator: Denominator.
+ *   quotient: *quotient will be set to the integer quotient.
+ *
+ * Returns: Remainder, precise only for the low three bits.
+ */
+extern float __attribute__((overloadable))
+    remquo(float numerator, float denominator, int* quotient);
+
+extern float2 __attribute__((overloadable))
+    remquo(float2 numerator, float2 denominator, int2* quotient);
+
+extern float3 __attribute__((overloadable))
+    remquo(float3 numerator, float3 denominator, int3* quotient);
+
+extern float4 __attribute__((overloadable))
+    remquo(float4 numerator, float4 denominator, int4* quotient);
+
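+/*
+ * Illustrative sketch of the range reduction described above: the low bits of
+ * the quotient select the quadrant while the returned remainder stays small.
+ * The helper name reduce_to_quadrant is hypothetical; M_PI_2 is expected to be
+ * defined earlier in rs_math.rsh.
+ *
+ *   static float reduce_to_quadrant(float x, int* quadrant) {
+ *       float r = remquo(x, M_PI_2, quadrant);  // remainder of x / (pi / 2)
+ *       *quadrant &= 3;                         // only the low bits are meaningful
+ *       return r;
+ *   }
+ */
+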
+/*
+ * rint: Round to even
+ *
+ * Rounds to the nearest integral value.
+ *
+ * rint() rounds half values to even.  For example, rint(0.5f) returns 0.f and
+ * rint(1.5f) returns 2.f.  Similarly, rint(-0.5f) returns -0.f and
+ * rint(-1.5f) returns -2.f.
+ *
+ * round() is similar but rounds away from zero.  trunc() truncates the decimal fraction.
+ */
+extern float __attribute__((const, overloadable))
+    rint(float v);
+
+extern float2 __attribute__((const, overloadable))
+    rint(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    rint(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    rint(float4 v);
+
+/*
+ * rootn: Nth root
+ *
+ * Compute the Nth root of a value.
+ *
+ * See also native_rootn().
+ */
+extern float __attribute__((const, overloadable))
+    rootn(float v, int n);
+
+extern float2 __attribute__((const, overloadable))
+    rootn(float2 v, int2 n);
+
+extern float3 __attribute__((const, overloadable))
+    rootn(float3 v, int3 n);
+
+extern float4 __attribute__((const, overloadable))
+    rootn(float4 v, int4 n);
+
+/*
+ * round: Round away from zero
+ *
+ * Round to the nearest integral value.
+ *
+ * round() rounds half values away from zero.  For example, round(0.5f) returns 1.f
+ * and round(1.5f) returns 2.f.  Similarly, round(-0.5f) returns -1.f
+ * and round(-1.5f) returns -2.f.
+ *
+ * rint() is similar but rounds half values toward even.  trunc() truncates the decimal fraction.
+ */
+extern float __attribute__((const, overloadable))
+    round(float v);
+
+extern float2 __attribute__((const, overloadable))
+    round(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    round(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    round(float4 v);
+
+/*
+ * rsqrt: Reciprocal of a square root
+ *
+ * Returns (1 / sqrt(v)).
+ *
+ * See also half_rsqrt(), native_rsqrt().
+ */
+extern float __attribute__((const, overloadable))
+    rsqrt(float v);
+
+extern float2 __attribute__((const, overloadable))
+    rsqrt(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    rsqrt(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    rsqrt(float4 v);
+
+/*
+ * sign: Sign of a value
+ *
+ * Returns the sign of a value.
+ *
+ * if (v < 0) return -1.f;
+ * else if (v > 0) return 1.f;
+ * else return 0.f;
+ */
+extern float __attribute__((const, overloadable))
+    sign(float v);
+
+extern float2 __attribute__((const, overloadable))
+    sign(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    sign(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    sign(float4 v);
+
+/*
+ * sin: Sine
+ *
+ * Returns the sine of an angle measured in radians.
+ *
+ * See also native_sin().
+ */
+extern float __attribute__((const, overloadable))
+    sin(float v);
+
+extern float2 __attribute__((const, overloadable))
+    sin(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    sin(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    sin(float4 v);
+
+/*
+ * sincos: Sine and cosine
+ *
+ * Returns the sine and cosine of a value.
+ *
+ * See also native_sincos().
+ *
+ * Parameters:
+ *   v: Incoming value in radians.
+ *   cos: *cos will be set to the cosine value.
+ *
+ * Returns: Sine of v.
+ */
+extern float __attribute__((overloadable))
+    sincos(float v, float* cos);
+
+extern float2 __attribute__((overloadable))
+    sincos(float2 v, float2* cos);
+
+extern float3 __attribute__((overloadable))
+    sincos(float3 v, float3* cos);
+
+extern float4 __attribute__((overloadable))
+    sincos(float4 v, float4* cos);
+
+/*
+ * sinh: Hyperbolic sine
+ *
+ * Returns the hyperbolic sine of v, where v is measured in radians.
+ *
+ * See also native_sinh().
+ */
+extern float __attribute__((const, overloadable))
+    sinh(float v);
+
+extern float2 __attribute__((const, overloadable))
+    sinh(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    sinh(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    sinh(float4 v);
+
+/*
+ * sinpi: Sine of a number multiplied by pi
+ *
+ * Returns the sine of (v * pi), where (v * pi) is measured in radians.
+ *
+ * To get the sine of a value measured in degrees, call sinpi(v / 180.f).
+ *
+ * See also native_sinpi().
+ */
+extern float __attribute__((const, overloadable))
+    sinpi(float v);
+
+extern float2 __attribute__((const, overloadable))
+    sinpi(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    sinpi(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    sinpi(float4 v);
+
+/*
+ * sqrt: Square root
+ *
+ * Returns the square root of a value.
+ *
+ * See also half_sqrt(), native_sqrt().
+ */
+extern float __attribute__((const, overloadable))
+    sqrt(float v);
+
+extern float2 __attribute__((const, overloadable))
+    sqrt(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    sqrt(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    sqrt(float4 v);
+
+/*
+ * step: 0 if less than a value, 1 otherwise
+ *
+ * Returns 0.f if v < edge, 1.f otherwise.
+ *
+ * This can be useful to create conditional computations without using loops and branching
+ * instructions.  For example, instead of computing (a[i] < b[i]) ? 0.f : atan2(a[i], b[i])
+ * for the corresponding elements of a vector, you could instead use step(a, b) * atan2(a, b).
+ */
+extern float __attribute__((const, overloadable))
+    step(float edge, float v);
+
+extern float2 __attribute__((const, overloadable))
+    step(float2 edge, float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    step(float3 edge, float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    step(float4 edge, float4 v);
+
+extern float2 __attribute__((const, overloadable))
+    step(float2 edge, float v);
+
+extern float3 __attribute__((const, overloadable))
+    step(float3 edge, float v);
+
+extern float4 __attribute__((const, overloadable))
+    step(float4 edge, float v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    step(float edge, float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    step(float edge, float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    step(float edge, float4 v);
+#endif
+
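+/*
+ * Illustrative branch-free use of step(): zero out vector lanes that fall
+ * below a per-lane edge.  The variable names edges and values are
+ * hypothetical float4 inputs.
+ *
+ *   float4 mask = step(edges, values);  // per lane: 0.f if values < edges, else 1.f
+ *   float4 gated = mask * values;       // lanes below their edge become 0.f
+ */
+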
+/*
+ * tan: Tangent
+ *
+ * Returns the tangent of an angle measured in radians.
+ *
+ * See also native_tan().
+ */
+extern float __attribute__((const, overloadable))
+    tan(float v);
+
+extern float2 __attribute__((const, overloadable))
+    tan(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    tan(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    tan(float4 v);
+
+/*
+ * tanh: Hyperbolic tangent
+ *
+ * Returns the hyperbolic tangent of a value.
+ *
+ * See also native_tanh().
+ */
+extern float __attribute__((const, overloadable))
+    tanh(float v);
+
+extern float2 __attribute__((const, overloadable))
+    tanh(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    tanh(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    tanh(float4 v);
+
+/*
+ * tanpi: Tangent of a number multiplied by pi
+ *
+ * Returns the tangent of (v * pi), where (v * pi) is measured in radians.
+ *
+ * To get the tangent of a value measured in degrees, call tanpi(v / 180.f).
+ *
+ * See also native_tanpi().
+ */
+extern float __attribute__((const, overloadable))
+    tanpi(float v);
+
+extern float2 __attribute__((const, overloadable))
+    tanpi(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    tanpi(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    tanpi(float4 v);
+
+/*
+ * tgamma: Gamma function
+ *
+ * Returns the gamma function of a value.
+ *
+ * See also lgamma().
+ */
+extern float __attribute__((const, overloadable))
+    tgamma(float v);
+
+extern float2 __attribute__((const, overloadable))
+    tgamma(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    tgamma(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    tgamma(float4 v);
+
+/*
+ * trunc: Truncates a floating point value
+ *
+ * Rounds to an integral value using truncation.
+ *
+ * For example, trunc(1.7f) returns 1.f and trunc(-1.7f) returns -1.f.
+ *
+ * See rint() and round() for other rounding options.
+ */
+extern float __attribute__((const, overloadable))
+    trunc(float v);
+
+extern float2 __attribute__((const, overloadable))
+    trunc(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    trunc(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    trunc(float4 v);
+
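+/*
+ * Illustrative comparison of the three rounding functions above:
+ *
+ *   float a = rint(2.5f);   // 2.f, halves round toward even
+ *   float b = round(2.5f);  // 3.f, halves round away from zero
+ *   float c = trunc(2.7f);  // 2.f, the fraction is simply dropped
+ */
+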
+/*
+ * rsClamp: Restrain a value to a range
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Clamp a value between low and high.
+ *
+ * Parameters:
+ *   amount: Value to clamp.
+ *   low: Lower bound.
+ *   high: Upper bound.
+ */
+extern char __attribute__((const, always_inline, overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated("Use clamp() instead.")
+#endif
+))
+    rsClamp(char amount, char low, char high);
+
+extern uchar __attribute__((const, always_inline, overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated("Use clamp() instead.")
+#endif
+))
+    rsClamp(uchar amount, uchar low, uchar high);
+
+extern short __attribute__((const, always_inline, overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated("Use clamp() instead.")
+#endif
+))
+    rsClamp(short amount, short low, short high);
+
+extern ushort __attribute__((const, always_inline, overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated("Use clamp() instead.")
+#endif
+))
+    rsClamp(ushort amount, ushort low, ushort high);
+
+extern int __attribute__((const, always_inline, overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated("Use clamp() instead.")
+#endif
+))
+    rsClamp(int amount, int low, int high);
+
+extern uint __attribute__((const, always_inline, overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated("Use clamp() instead.")
+#endif
+))
+    rsClamp(uint amount, uint low, uint high);
+
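+/*
+ * Illustrative migration away from the deprecated rsClamp().  The variable v
+ * is hypothetical; the integer overloads of clamp() require API 19 or higher.
+ *
+ *   int legacy = rsClamp(v, 0, 255);  // deprecated from API 22 onward
+ *   int current = clamp(v, 0, 255);   // preferred replacement
+ */
+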
+/*
+ * rsFrac: Returns the fractional part of a float
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Returns the fractional part of a float
+ */
+extern float __attribute__((const, overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated("Use fract() instead.")
+#endif
+))
+    rsFrac(float v);
+
+/*
+ * rsRand: Pseudo-random number
+ *
+ * Returns a random value between 0 (or min_value) and max_value.
+ */
+extern int __attribute__((overloadable))
+    rsRand(int max_value);
+
+extern int __attribute__((overloadable))
+    rsRand(int min_value, int max_value);
+
+extern float __attribute__((overloadable))
+    rsRand(float max_value);
+
+extern float __attribute__((overloadable))
+    rsRand(float min_value, float max_value);
+
+#endif // RENDERSCRIPT_RS_MATH_RSH
diff --git a/23.0.3/include/rs_matrix.rsh b/23.0.3/include/rs_matrix.rsh
new file mode 100644
index 0000000..aafc864
--- /dev/null
+++ b/23.0.3/include/rs_matrix.rsh
@@ -0,0 +1,596 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Don't edit this file!  It is auto-generated by frameworks/rs/api/generate.sh.
+
+/*
+ * rs_matrix.rsh: Matrix Functions
+ *
+ * These functions let you manipulate square matrices of size 2x2, 3x3, and 4x4.
+ * They are particularly useful for graphical transformations and are compatible
+ * with OpenGL.
+ *
+ * We use a zero-based index for rows and columns.  E.g. the last element of a
+ * rs_matrix4x4 is found at (3, 3).
+ *
+ * RenderScript uses column-major matrices and column-based vectors.  Transforming
+ * a vector is done by postmultiplying the vector, e.g. (matrix * vector),
+ * as provided by rsMatrixMultiply().
+ *
+ * To create a transformation matrix that performs two transformations at once,
+ * multiply the two source matrices, with the first transformation as the right
+ * argument.  E.g. to create a transformation matrix that applies the
+ * transformation s1 followed by s2, call rsMatrixLoadMultiply(&combined, &s2, &s1).
+ * This derives from s2 * (s1 * v), which is (s2 * s1) * v.
+ *
+ * We have two styles of functions to create transformation matrices:
+ * rsMatrixLoadTransformation and rsMatrixTransformation.  The former
+ * style simply stores the transformation matrix in the first argument.  The latter
+ * modifies a pre-existing transformation matrix so that the new transformation
+ * happens first.  E.g. if you call rsMatrixTranslate() on a matrix that already
+ * does a scaling, the resulting matrix when applied to a vector will first do the
+ * translation then the scaling.
+ */
+
+#ifndef RENDERSCRIPT_RS_MATRIX_RSH
+#define RENDERSCRIPT_RS_MATRIX_RSH
+
+#include "rs_vector_math.rsh"
+
+/*
+ * rsExtractFrustumPlanes: Compute frustum planes
+ *
+ * Computes 6 frustum planes from the view projection matrix
+ *
+ * Parameters:
+ *   viewProj: Matrix to extract planes from.
+ *   left: Left plane.
+ *   right: Right plane.
+ *   top: Top plane.
+ *   bottom: Bottom plane.
+ *   near: Near plane.
+ *   far: Far plane.
+ */
+static inline void __attribute__((always_inline, overloadable))
+    rsExtractFrustumPlanes(const rs_matrix4x4* viewProj, float4* left, float4* right, float4* top,
+                           float4* bottom, float4* near, float4* far) {
+    // x y z w = a b c d in the plane equation
+    left->x = viewProj->m[3] + viewProj->m[0];
+    left->y = viewProj->m[7] + viewProj->m[4];
+    left->z = viewProj->m[11] + viewProj->m[8];
+    left->w = viewProj->m[15] + viewProj->m[12];
+
+    right->x = viewProj->m[3] - viewProj->m[0];
+    right->y = viewProj->m[7] - viewProj->m[4];
+    right->z = viewProj->m[11] - viewProj->m[8];
+    right->w = viewProj->m[15] - viewProj->m[12];
+
+    top->x = viewProj->m[3] - viewProj->m[1];
+    top->y = viewProj->m[7] - viewProj->m[5];
+    top->z = viewProj->m[11] - viewProj->m[9];
+    top->w = viewProj->m[15] - viewProj->m[13];
+
+    bottom->x = viewProj->m[3] + viewProj->m[1];
+    bottom->y = viewProj->m[7] + viewProj->m[5];
+    bottom->z = viewProj->m[11] + viewProj->m[9];
+    bottom->w = viewProj->m[15] + viewProj->m[13];
+
+    near->x = viewProj->m[3] + viewProj->m[2];
+    near->y = viewProj->m[7] + viewProj->m[6];
+    near->z = viewProj->m[11] + viewProj->m[10];
+    near->w = viewProj->m[15] + viewProj->m[14];
+
+    far->x = viewProj->m[3] - viewProj->m[2];
+    far->y = viewProj->m[7] - viewProj->m[6];
+    far->z = viewProj->m[11] - viewProj->m[10];
+    far->w = viewProj->m[15] - viewProj->m[14];
+
+    float len = length(left->xyz);
+    *left /= len;
+    len = length(right->xyz);
+    *right /= len;
+    len = length(top->xyz);
+    *top /= len;
+    len = length(bottom->xyz);
+    *bottom /= len;
+    len = length(near->xyz);
+    *near /= len;
+    len = length(far->xyz);
+    *far /= len;
+}
+
+/*
+ * rsIsSphereInFrustum: Checks if a sphere is within the frustum planes
+ *
+ * Returns true if the sphere is within the 6 frustum planes.
+ *
+ * Parameters:
+ *   sphere: float4 representing the sphere.
+ *   left: Left plane.
+ *   right: Right plane.
+ *   top: Top plane.
+ *   bottom: Bottom plane.
+ *   near: Near plane.
+ *   far: Far plane.
+ */
+static inline bool __attribute__((always_inline, overloadable))
+    rsIsSphereInFrustum(float4* sphere, float4* left, float4* right, float4* top, float4* bottom,
+                        float4* near, float4* far) {
+    float distToCenter = dot(left->xyz, sphere->xyz) + left->w;
+    if (distToCenter < -sphere->w) {
+        return false;
+    }
+    distToCenter = dot(right->xyz, sphere->xyz) + right->w;
+    if (distToCenter < -sphere->w) {
+        return false;
+    }
+    distToCenter = dot(top->xyz, sphere->xyz) + top->w;
+    if (distToCenter < -sphere->w) {
+        return false;
+    }
+    distToCenter = dot(bottom->xyz, sphere->xyz) + bottom->w;
+    if (distToCenter < -sphere->w) {
+        return false;
+    }
+    distToCenter = dot(near->xyz, sphere->xyz) + near->w;
+    if (distToCenter < -sphere->w) {
+        return false;
+    }
+    distToCenter = dot(far->xyz, sphere->xyz) + far->w;
+    if (distToCenter < -sphere->w) {
+        return false;
+    }
+    return true;
+}
+
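+/*
+ * Illustrative sketch combining the two helpers above for sphere culling.
+ * The names viewProj and boundingSphere are hypothetical.
+ *
+ *   float4 left, right, top, bottom, near, far;
+ *   rsExtractFrustumPlanes(&viewProj, &left, &right, &top, &bottom, &near, &far);
+ *   bool visible = rsIsSphereInFrustum(&boundingSphere, &left, &right, &top,
+ *                                      &bottom, &near, &far);
+ */
+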
+/*
+ * rsMatrixGet: Get one element
+ *
+ * Returns one element of a matrix.
+ *
+ * Warning: The order of the column and row parameters may be unexpected.
+ *
+ * Parameters:
+ *   m: Matrix to extract the element from.
+ *   col: Zero-based column of the element to be extracted.
+ *   row: Zero-based row of the element to be extracted.
+ */
+extern float __attribute__((overloadable))
+    rsMatrixGet(const rs_matrix4x4* m, uint32_t col, uint32_t row);
+
+extern float __attribute__((overloadable))
+    rsMatrixGet(const rs_matrix3x3* m, uint32_t col, uint32_t row);
+
+extern float __attribute__((overloadable))
+    rsMatrixGet(const rs_matrix2x2* m, uint32_t col, uint32_t row);
+
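+/*
+ * Illustrative reminder of the argument order warned about above: the column
+ * index comes before the row index.
+ *
+ *   rs_matrix4x4 m;
+ *   rsMatrixLoadIdentity(&m);
+ *   float f = rsMatrixGet(&m, 2, 3);  // column 2, row 3; 0.f for the identity
+ */
+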
+/*
+ * rsMatrixInverse: Inverts a matrix in place
+ *
+ * Returns true if the matrix was successfully inverted.
+ *
+ * Parameters:
+ *   m: Matrix to invert.
+ */
+extern bool __attribute__((overloadable))
+    rsMatrixInverse(rs_matrix4x4* m);
+
+/*
+ * rsMatrixInverseTranspose: Inverts and transposes a matrix in place
+ *
+ * The matrix is first inverted then transposed. Returns true if the matrix was
+ * successfully inverted.
+ *
+ * Parameters:
+ *   m: Matrix to modify.
+ */
+extern bool __attribute__((overloadable))
+    rsMatrixInverseTranspose(rs_matrix4x4* m);
+
+/*
+ * rsMatrixLoad: Load or copy a matrix
+ *
+ * Set the elements of a matrix from an array of floats or from another matrix.
+ *
+ * If loading from an array, the floats should be in row-major order, i.e. the element a
+ * row 0, column 0 should be first, followed by the element at
+ * row 0, column 1, etc.
+ *
+ * If loading from a matrix and the source is smaller than the destination, the rest
+ * of the destination is filled with elements of the identity matrix.  E.g.
+ * loading a rs_matrix2x2 into a rs_matrix4x4 will give:
+ *
+ * m00 m01 0.0 0.0
+ * m10 m11 0.0 0.0
+ * 0.0 0.0 1.0 0.0
+ * 0.0 0.0 0.0 1.0
+ *
+ *
+ * Parameters:
+ *   destination: Matrix to set.
+ *   array: Array of values to set the matrix to. These arrays should be 4, 9, or 16 floats long, depending on the matrix size.
+ *   source: Source matrix.
+ */
+extern void __attribute__((overloadable))
+    rsMatrixLoad(rs_matrix4x4* destination, const float* array);
+
+extern void __attribute__((overloadable))
+    rsMatrixLoad(rs_matrix3x3* destination, const float* array);
+
+extern void __attribute__((overloadable))
+    rsMatrixLoad(rs_matrix2x2* destination, const float* array);
+
+extern void __attribute__((overloadable))
+    rsMatrixLoad(rs_matrix4x4* destination, const rs_matrix4x4* source);
+
+extern void __attribute__((overloadable))
+    rsMatrixLoad(rs_matrix3x3* destination, const rs_matrix3x3* source);
+
+extern void __attribute__((overloadable))
+    rsMatrixLoad(rs_matrix2x2* destination, const rs_matrix2x2* source);
+
+extern void __attribute__((overloadable))
+    rsMatrixLoad(rs_matrix4x4* destination, const rs_matrix3x3* source);
+
+extern void __attribute__((overloadable))
+    rsMatrixLoad(rs_matrix4x4* destination, const rs_matrix2x2* source);
+
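+/*
+ * Illustrative sketch of loading a matrix from a row-major float array, as
+ * described above.  The array name values is hypothetical.
+ *
+ *   const float values[9] = {1.f, 2.f, 3.f,   // row 0
+ *                            4.f, 5.f, 6.f,   // row 1
+ *                            7.f, 8.f, 9.f};  // row 2
+ *   rs_matrix3x3 m;
+ *   rsMatrixLoad(&m, values);
+ */
+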
+/*
+ * rsMatrixLoadFrustum: Load a frustum projection matrix
+ *
+ * Constructs a frustum projection matrix, transforming the box identified by
+ * the six clipping planes left, right, bottom, top, near, far.
+ *
+ * To apply this projection to a vector, multiply the vector by the created
+ * matrix using rsMatrixMultiply().
+ *
+ * Parameters:
+ *   m: Matrix to set.
+ */
+extern void __attribute__((overloadable))
+    rsMatrixLoadFrustum(rs_matrix4x4* m, float left, float right, float bottom, float top,
+                        float near, float far);
+
+/*
+ * rsMatrixLoadIdentity: Load identity matrix
+ *
+ * Set the elements of a matrix to the identity matrix.
+ *
+ * Parameters:
+ *   m: Matrix to set.
+ */
+extern void __attribute__((overloadable))
+    rsMatrixLoadIdentity(rs_matrix4x4* m);
+
+extern void __attribute__((overloadable))
+    rsMatrixLoadIdentity(rs_matrix3x3* m);
+
+extern void __attribute__((overloadable))
+    rsMatrixLoadIdentity(rs_matrix2x2* m);
+
+/*
+ * rsMatrixLoadMultiply: Multiply two matrices
+ *
+ * Sets m to the matrix product of lhs * rhs.
+ *
+ * To combine two 4x4 transformation matrices, multiply the second transformation matrix
+ * by the first transformation matrix.  E.g. to create a transformation matrix that applies
+ * the transformation s1 followed by s2, call rsMatrixLoadMultiply(&combined, &s2, &s1).
+ *
+ * Warning: Prior to version 21, storing the result back into the right matrix is not supported
+ * and will result in undefined behavior.  Use rsMatrixMultiply() instead.  E.g. instead of doing
+ * rsMatrixLoadMultiply(&m2r, &m2r, &m2l), use rsMatrixMultiply(&m2r, &m2l).
+ * rsMatrixLoadMultiply(&m2l, &m2r, &m2l) works as expected.
+ *
+ * Parameters:
+ *   m: Matrix to set.
+ *   lhs: Left matrix of the product.
+ *   rhs: Right matrix of the product.
+ */
+extern void __attribute__((overloadable))
+    rsMatrixLoadMultiply(rs_matrix4x4* m, const rs_matrix4x4* lhs, const rs_matrix4x4* rhs);
+
+extern void __attribute__((overloadable))
+    rsMatrixLoadMultiply(rs_matrix3x3* m, const rs_matrix3x3* lhs, const rs_matrix3x3* rhs);
+
+extern void __attribute__((overloadable))
+    rsMatrixLoadMultiply(rs_matrix2x2* m, const rs_matrix2x2* lhs, const rs_matrix2x2* rhs);
+
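+/*
+ * Illustrative sketch of combining two transformations as described above:
+ * s1 is applied first, then s2.  The matrix names are hypothetical.
+ *
+ *   rs_matrix4x4 s1, s2, combined;
+ *   rsMatrixLoadScale(&s1, 2.f, 2.f, 2.f);
+ *   rsMatrixLoadTranslate(&s2, 0.f, 10.f, 0.f);
+ *   rsMatrixLoadMultiply(&combined, &s2, &s1);  // combined * v == s2 * (s1 * v)
+ */
+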
+/*
+ * rsMatrixLoadOrtho: Load an orthographic projection matrix
+ *
+ * Constructs an orthographic projection matrix, transforming the box identified by the
+ * six clipping planes left, right, bottom, top, near, far into a unit cube
+ * with a corner at (-1, -1, -1) and the opposite at (1, 1, 1).
+ *
+ * To apply this projection to a vector, multiply the vector by the created matrix
+ * using rsMatrixMultiply().
+ *
+ * See https://en.wikipedia.org/wiki/Orthographic_projection .
+ *
+ * Parameters:
+ *   m: Matrix to set.
+ */
+extern void __attribute__((overloadable))
+    rsMatrixLoadOrtho(rs_matrix4x4* m, float left, float right, float bottom, float top, float near,
+                      float far);
+
+/*
+ * rsMatrixLoadPerspective: Load a perspective projection matrix
+ *
+ * Constructs a perspective projection matrix, assuming a symmetrical field of view.
+ *
+ * To apply this projection to a vector, multiply the vector by the created matrix
+ * using rsMatrixMultiply().
+ *
+ * Parameters:
+ *   m: Matrix to set.
+ *   fovy: Field of view, in degrees along the Y axis.
+ *   aspect: Ratio of x / y.
+ *   near: Near clipping plane.
+ *   far: Far clipping plane.
+ */
+extern void __attribute__((overloadable))
+    rsMatrixLoadPerspective(rs_matrix4x4* m, float fovy, float aspect, float near, float far);
+
+/*
+ * rsMatrixLoadRotate: Load a rotation matrix
+ *
+ * This function creates a rotation matrix.  The axis of rotation is the (x, y, z) vector.
+ *
+ * To rotate a vector, multiply the vector by the created matrix using rsMatrixMultiply().
+ *
+ * See http://en.wikipedia.org/wiki/Rotation_matrix .
+ *
+ * Parameters:
+ *   m: Matrix to set.
+ *   rot: How much rotation to do, in degrees.
+ *   x: X component of the vector that is the axis of rotation.
+ *   y: Y component of the vector that is the axis of rotation.
+ *   z: Z component of the vector that is the axis of rotation.
+ */
+extern void __attribute__((overloadable))
+    rsMatrixLoadRotate(rs_matrix4x4* m, float rot, float x, float y, float z);
+
+/*
+ * rsMatrixLoadScale: Load a scaling matrix
+ *
+ * This function creates a scaling matrix, where each component of a vector is multiplied
+ * by a number.  This number can be negative.
+ *
+ * To scale a vector, multiply the vector by the created matrix using rsMatrixMultiply().
+ *
+ * Parameters:
+ *   m: Matrix to set.
+ *   x: Multiple to scale the x components by.
+ *   y: Multiple to scale the y components by.
+ *   z: Multiple to scale the z components by.
+ */
+extern void __attribute__((overloadable))
+    rsMatrixLoadScale(rs_matrix4x4* m, float x, float y, float z);
+
+/*
+ * rsMatrixLoadTranslate: Load a translation matrix
+ *
+ * This function creates a translation matrix, where a number is added to each element of
+ * a vector.
+ *
+ * To translate a vector, multiply the vector by the created matrix using
+ * rsMatrixMultiply().
+ *
+ * Parameters:
+ *   m: Matrix to set.
+ *   x: Number to add to each x component.
+ *   y: Number to add to each y component.
+ *   z: Number to add to each z component.
+ */
+extern void __attribute__((overloadable))
+    rsMatrixLoadTranslate(rs_matrix4x4* m, float x, float y, float z);
+
+/*
+ * rsMatrixMultiply: Multiply a matrix by a vector or another matrix
+ *
+ * For the matrix by matrix variant, sets m to the matrix product m * rhs.
+ *
+ * When combining two 4x4 transformation matrices using this function, the resulting
+ * matrix will correspond to performing the rhs transformation first followed by
+ * the original m transformation.
+ *
+ * For the matrix by vector variant, returns the post-multiplication of the vector
+ * by the matrix, ie. m * in.
+ *
+ * When multiplying a float3 to a rs_matrix4x4, the vector is expanded with (1).
+ *
+ * When multiplying a float2 to a rs_matrix4x4, the vector is expanded with (0, 1).
+ *
+ * When multiplying a float2 to a rs_matrix3x3, the vector is expanded with (0).
+ *
+ * Starting with API 14, this function takes a const matrix as the first argument.
+ *
+ * Parameters:
+ *   m: Left matrix of the product and the matrix to be set.
+ *   rhs: Right matrix of the product.
+ */
+extern void __attribute__((overloadable))
+    rsMatrixMultiply(rs_matrix4x4* m, const rs_matrix4x4* rhs);
+
+extern void __attribute__((overloadable))
+    rsMatrixMultiply(rs_matrix3x3* m, const rs_matrix3x3* rhs);
+
+extern void __attribute__((overloadable))
+    rsMatrixMultiply(rs_matrix2x2* m, const rs_matrix2x2* rhs);
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 13)
+extern float4 __attribute__((overloadable))
+    rsMatrixMultiply(rs_matrix4x4* m, float4 in);
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 13)
+extern float4 __attribute__((overloadable))
+    rsMatrixMultiply(rs_matrix4x4* m, float3 in);
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 13)
+extern float4 __attribute__((overloadable))
+    rsMatrixMultiply(rs_matrix4x4* m, float2 in);
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 13)
+extern float3 __attribute__((overloadable))
+    rsMatrixMultiply(rs_matrix3x3* m, float3 in);
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 13)
+extern float3 __attribute__((overloadable))
+    rsMatrixMultiply(rs_matrix3x3* m, float2 in);
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 13)
+extern float2 __attribute__((overloadable))
+    rsMatrixMultiply(rs_matrix2x2* m, float2 in);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+extern float4 __attribute__((overloadable))
+    rsMatrixMultiply(const rs_matrix4x4* m, float4 in);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+extern float4 __attribute__((overloadable))
+    rsMatrixMultiply(const rs_matrix4x4* m, float3 in);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+extern float4 __attribute__((overloadable))
+    rsMatrixMultiply(const rs_matrix4x4* m, float2 in);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+extern float3 __attribute__((overloadable))
+    rsMatrixMultiply(const rs_matrix3x3* m, float3 in);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+extern float3 __attribute__((overloadable))
+    rsMatrixMultiply(const rs_matrix3x3* m, float2 in);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+extern float2 __attribute__((overloadable))
+    rsMatrixMultiply(const rs_matrix2x2* m, float2 in);
+#endif
+
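+/*
+ * Illustrative sketch of the matrix-by-vector variant (API 14 and later),
+ * transforming a 3D point.  The helper name transform_point is hypothetical.
+ *
+ *   static float3 transform_point(const rs_matrix4x4* m, float3 p) {
+ *       float4 r = rsMatrixMultiply(m, p);  // p is expanded with w = 1
+ *       return r.xyz;
+ *   }
+ */
+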
+/*
+ * rsMatrixRotate: Apply a rotation to a transformation matrix
+ *
+ * Multiply the matrix m with a rotation matrix.
+ *
+ * This function modifies a transformation matrix to first do a rotation.  The axis of
+ * rotation is the (x, y, z) vector.
+ *
+ * To apply this combined transformation to a vector, multiply the vector by the created
+ * matrix using rsMatrixMultiply().
+ *
+ * Parameters:
+ *   m: Matrix to modify.
+ *   rot: How much rotation to do, in degrees.
+ *   x: X component of the vector that is the axis of rotation.
+ *   y: Y component of the vector that is the axis of rotation.
+ *   z: Z component of the vector that is the axis of rotation.
+ */
+extern void __attribute__((overloadable))
+    rsMatrixRotate(rs_matrix4x4* m, float rot, float x, float y, float z);
+
+/*
+ * rsMatrixScale: Apply a scaling to a transformation matrix
+ *
+ * Multiply the matrix m with a scaling matrix.
+ *
+ * This function modifies a transformation matrix to first do a scaling.   When scaling,
+ * each component of a vector is multiplied by a number.  This number can be negative.
+ *
+ * To apply this combined transformation to a vector, multiply the vector by the created
+ * matrix using rsMatrixMultiply().
+ *
+ * Parameters:
+ *   m: Matrix to modify.
+ *   x: Multiple to scale the x components by.
+ *   y: Multiple to scale the y components by.
+ *   z: Multiple to scale the z components by.
+ */
+extern void __attribute__((overloadable))
+    rsMatrixScale(rs_matrix4x4* m, float x, float y, float z);
+
+/*
+ * rsMatrixSet: Set one element
+ *
+ * Set an element of a matrix.
+ *
+ * Warning: The order of the column and row parameters may be unexpected.
+ *
+ * Parameters:
+ *   m: Matrix that will be modified.
+ *   col: Zero-based column of the element to be set.
+ *   row: Zero-based row of the element to be set.
+ *   v: Value to set.
+ */
+extern void __attribute__((overloadable))
+    rsMatrixSet(rs_matrix4x4* m, uint32_t col, uint32_t row, float v);
+
+extern void __attribute__((overloadable))
+    rsMatrixSet(rs_matrix3x3* m, uint32_t col, uint32_t row, float v);
+
+extern void __attribute__((overloadable))
+    rsMatrixSet(rs_matrix2x2* m, uint32_t col, uint32_t row, float v);
+
+/*
+ * rsMatrixTranslate: Apply a translation to a transformation matrix
+ *
+ * Multiply the matrix m with a translation matrix.
+ *
+ * This function modifies a transformation matrix to first do a translation.  When
+ * translating, a number is added to each component of a vector.
+ *
+ * To apply this combined transformation to a vector, multiply the vector by the
+ * created matrix using rsMatrixMultiply().
+ *
+ * Parameters:
+ *   m: Matrix to modify.
+ *   x: Number to add to each x component.
+ *   y: Number to add to each y component.
+ *   z: Number to add to each z component.
+ */
+extern void __attribute__((overloadable))
+    rsMatrixTranslate(rs_matrix4x4* m, float x, float y, float z);
+
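+/*
+ * Illustrative sketch of the "new transformation happens first" rule from the
+ * file comment: after the calls below, applying m to a vector translates it
+ * first and scales the result afterwards.
+ *
+ *   rs_matrix4x4 m;
+ *   rsMatrixLoadScale(&m, 2.f, 2.f, 2.f);
+ *   rsMatrixTranslate(&m, 1.f, 0.f, 0.f);  // m is now scale * translate
+ */
+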
+/*
+ * rsMatrixTranspose: Transpose a matrix in place
+ *
+ * Transpose the matrix m in place.
+ *
+ * Parameters:
+ *   m: Matrix to transpose.
+ */
+extern void __attribute__((overloadable))
+    rsMatrixTranspose(rs_matrix4x4* m);
+
+extern void __attribute__((overloadable))
+    rsMatrixTranspose(rs_matrix3x3* m);
+
+extern void __attribute__((overloadable))
+    rsMatrixTranspose(rs_matrix2x2* m);
+
+#endif // RENDERSCRIPT_RS_MATRIX_RSH
diff --git a/23.0.3/include/rs_object_info.rsh b/23.0.3/include/rs_object_info.rsh
new file mode 100644
index 0000000..d4c40fe
--- /dev/null
+++ b/23.0.3/include/rs_object_info.rsh
@@ -0,0 +1,462 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Don't edit this file!  It is auto-generated by frameworks/rs/api/generate.sh.
+
+/*
+ * rs_object_info.rsh: Object Characteristics Functions
+ *
+ * The functions below can be used to query the characteristics of an Allocation, Element,
+ * or Sampler object.  These objects are created from Java.  You can't create them from a
+ * script.
+ *
+ * Allocations:
+ *
+ * Allocations are the primary method used to pass data to and from RenderScript kernels.
+ *
+ * They are a structured collection of cells that can be used to store bitmaps, textures,
+ * arbitrary data points, etc.
+ *
+ * This collection of cells may have many dimensions (X, Y, Z, Array0, Array1, Array2, Array3),
+ * faces (for cubemaps), and levels of detail (for mipmapping).
+ *
+ * See android.renderscript.Allocation for details on how to create Allocations.
+ *
+ * Elements:
+ *
+ * The term "element" is used a bit ambiguously in RenderScript, as both type information
+ * for the cells of an Allocation and the instantiation of that type.  For example:
+ * - rs_element is a handle to a type specification, and
+ * - In functions like rsGetElementAt(), "element" means the instantiation of the type,
+ *     i.e. a cell of an Allocation.
+ *
+ * The functions below let you query the characteristics of the type specification.
+ *
+ * An Element can specify simple data types as found in C, e.g. an integer, float, or
+ * boolean.  It can also specify a handle to a RenderScript object.  See rs_data_type for
+ * a list of basic types.
+ *
+ * Elements can specify fixed size vector (of size 2, 3, or 4) versions of the basic types.
+ * Elements can be grouped together into complex Elements, creating the equivalent of
+ * C structure definitions.
+ *
+ * Elements can also have a kind, which is semantic information used to interpret pixel
+ * data.  See rs_data_kind.
+ *
+ * When creating Allocations of common elements, you can simply use one of the many predefined
+ * Elements like F32_2.
+ *
+ * To create complex Elements, use the Element.Builder Java class.
+ *
+ * Samplers:
+ *
+ * Sampler objects define how Allocations can be read as textures within a kernel.
+ * See android.renderscript.S.
+ */
+
+#ifndef RENDERSCRIPT_RS_OBJECT_INFO_RSH
+#define RENDERSCRIPT_RS_OBJECT_INFO_RSH
+
+/*
+ * rsAllocationGetDimFaces: Presence of more than one face
+ *
+ * If the Allocation is a cubemap, this function returns 1 if there's more than one face
+ * present.  In all other cases, it returns 0.
+ *
+ * Use rsGetDimHasFaces() to get the dimension of a currently running kernel.
+ *
+ * Returns: Returns 1 if more than one face is present, 0 otherwise.
+ */
+extern uint32_t __attribute__((overloadable))
+    rsAllocationGetDimFaces(rs_allocation a);
+
+/*
+ * rsAllocationGetDimLOD: Presence of levels of detail
+ *
+ * Query an Allocation for the presence of more than one Level Of Detail.  This is useful
+ * for mipmaps.
+ *
+ * Use rsGetDimLod() to get the dimension of a currently running kernel.
+ *
+ * Returns: Returns 1 if more than one LOD is present, 0 otherwise.
+ */
+extern uint32_t __attribute__((overloadable))
+    rsAllocationGetDimLOD(rs_allocation a);
+
+/*
+ * rsAllocationGetDimX: Size of the X dimension
+ *
+ * Returns the size of the X dimension of the Allocation.
+ *
+ * Use rsGetDimX() to get the dimension of a currently running kernel.
+ *
+ * Returns: X dimension of the Allocation.
+ */
+extern uint32_t __attribute__((overloadable))
+    rsAllocationGetDimX(rs_allocation a);
+
+/*
+ * rsAllocationGetDimY: Size of the Y dimension
+ *
+ * Returns the size of the Y dimension of the Allocation.  If the Allocation has less
+ * than two dimensions, returns 0.
+ *
+ * Use rsGetDimY() to get the dimension of a currently running kernel.
+ *
+ * Returns: Y dimension of the Allocation.
+ */
+extern uint32_t __attribute__((overloadable))
+    rsAllocationGetDimY(rs_allocation a);
+
+/*
+ * rsAllocationGetDimZ: Size of the Z dimension
+ *
+ * Returns the size of the Z dimension of the Allocation.  If the Allocation has less
+ * than three dimensions, returns 0.
+ *
+ * Use rsGetDimZ() to get the dimension of a currently running kernel.
+ *
+ * Returns: Z dimension of the Allocation.
+ */
+extern uint32_t __attribute__((overloadable))
+    rsAllocationGetDimZ(rs_allocation a);
+
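+/*
+ * Illustrative sketch of walking every cell of a 2D Allocation using the
+ * dimension getters above.  The names alloc and process_cell are hypothetical.
+ *
+ *   uint32_t dimX = rsAllocationGetDimX(alloc);
+ *   uint32_t dimY = rsAllocationGetDimY(alloc);  // 0 for a 1D Allocation
+ *   for (uint32_t y = 0; y < dimY; y++) {
+ *       for (uint32_t x = 0; x < dimX; x++) {
+ *           process_cell(x, y);
+ *       }
+ *   }
+ */
+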
+/*
+ * rsAllocationGetElement: Get the object that describes the cell of an Allocation
+ *
+ * Get the Element object describing the type, kind, and other characteristics of a cell
+ * of an Allocation.  See the rsElement* functions below.
+ *
+ * Parameters:
+ *   a: Allocation to get data from.
+ *
+ * Returns: Element describing Allocation layout.
+ */
+extern rs_element __attribute__((overloadable))
+    rsAllocationGetElement(rs_allocation a);
+
+/*
+ * rsClearObject: Release an object
+ *
+ * Tells the runtime that this handle will no longer be used to access the related
+ * object.  If this was the last handle to that object, resource recovery may happen.
+ *
+ * After calling this function, *dst will be set to an empty handle.  See rsIsObject().
+ */
+extern void __attribute__((overloadable))
+    rsClearObject(rs_element* dst);
+
+extern void __attribute__((overloadable))
+    rsClearObject(rs_type* dst);
+
+extern void __attribute__((overloadable))
+    rsClearObject(rs_allocation* dst);
+
+extern void __attribute__((overloadable))
+    rsClearObject(rs_sampler* dst);
+
+extern void __attribute__((overloadable))
+    rsClearObject(rs_script* dst);
+
+/*
+ * rsIsObject: Check for an empty handle
+ *
+ * Returns true if the handle contains a non-null reference.
+ *
+ * This function does not validate that the internal pointer used in the handle
+ * points to an actual valid object; it only checks for null.
+ *
+ * This function can be used to check the Element returned by rsElementGetSubElement()
+ * or see if rsClearObject() has been called on a handle.
+ */
+extern bool __attribute__((overloadable))
+    rsIsObject(rs_element v);
+
+extern bool __attribute__((overloadable))
+    rsIsObject(rs_type v);
+
+extern bool __attribute__((overloadable))
+    rsIsObject(rs_allocation v);
+
+extern bool __attribute__((overloadable))
+    rsIsObject(rs_sampler v);
+
+extern bool __attribute__((overloadable))
+    rsIsObject(rs_script v);
+
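+/*
+ * Illustrative sketch of the handle lifecycle described above.  The global
+ * handle gAlloc is hypothetical.
+ *
+ *   if (rsIsObject(gAlloc)) {    // the handle refers to something
+ *       rsClearObject(&gAlloc);  // release it; gAlloc becomes an empty handle
+ *   }
+ */
+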
+/*
+ * rsElementGetBytesSize: Size of an Element
+ *
+ * Returns the size in bytes that an instantiation of this Element will occupy.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+extern uint32_t __attribute__((overloadable))
+    rsElementGetBytesSize(rs_element e);
+#endif
+
+/*
+ * rsElementGetDataKind: Kind of an Element
+ *
+ * Returns the Element's data kind.  This is used to interpret pixel data.
+ *
+ * See rs_data_kind.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+extern rs_data_kind __attribute__((overloadable))
+    rsElementGetDataKind(rs_element e);
+#endif
+
+/*
+ * rsElementGetDataType: Data type of an Element
+ *
+ * Returns the Element's base data type.  This can be a type similar to C/C++ (e.g.
+ * RS_TYPE_UNSIGNED_8), a handle (e.g. RS_TYPE_ALLOCATION and RS_TYPE_ELEMENT), or a
+ * more complex numerical type (e.g. RS_TYPE_UNSIGNED_5_6_5 and RS_TYPE_MATRIX_4X4).
+ * See rs_data_type.
+ *
+ * If the Element describes a vector, this function returns the data type of one of its items.
+ * Use rsElementGetVectorSize to get the size of the vector.
+ *
+ * If the Element describes a structure, RS_TYPE_NONE is returned.  Use the rsElementGetSub*
+ * functions to explore this complex Element.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+extern rs_data_type __attribute__((overloadable))
+    rsElementGetDataType(rs_element e);
+#endif
+
+/*
+ * rsElementGetSubElement: Sub-element of a complex Element
+ *
+ * For Elements that represent a structure, this function returns the sub-element at the
+ * specified index.
+ *
+ * If the Element is not a structure or the index is greater or equal to the number of
+ * sub-elements, an invalid handle is returned.
+ *
+ * Parameters:
+ *   e: Element to query.
+ *   index: Index of the sub-element to return.
+ *
+ * Returns: Sub-element at the given index.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+extern rs_element __attribute__((overloadable))
+    rsElementGetSubElement(rs_element e, uint32_t index);
+#endif
+
+/*
+ * rsElementGetSubElementArraySize: Array size of a sub-element of a complex Element
+ *
+ * For complex Elements, sub-elements can be statically sized arrays.  This function
+ * returns the array size of the sub-element at the index.  This sub-element repetition
+ * is different from fixed size vectors.
+ *
+ * Parameters:
+ *   e: Element to query.
+ *   index: Index of the sub-element.
+ *
+ * Returns: Array size of the sub-element.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+extern uint32_t __attribute__((overloadable))
+    rsElementGetSubElementArraySize(rs_element e, uint32_t index);
+#endif
+
+/*
+ * rsElementGetSubElementCount: Number of sub-elements
+ *
+ * Elements can be simple, such as an int or a float, or a structure with multiple
+ * sub-elements.  This function returns zero for simple Elements and the number of
+ * sub-elements for complex Elements.
+ *
+ * Parameters:
+ *   e: Element to get data from.
+ *
+ * Returns: Number of sub-elements.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+extern uint32_t __attribute__((overloadable))
+    rsElementGetSubElementCount(rs_element e);
+#endif
+
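+/*
+ * Illustrative sketch of walking a structured Element with the functions
+ * above.  The names e and inspect are hypothetical.
+ *
+ *   uint32_t count = rsElementGetSubElementCount(e);  // 0 for simple Elements
+ *   for (uint32_t i = 0; i < count; i++) {
+ *       rs_element sub = rsElementGetSubElement(e, i);
+ *       inspect(sub);
+ *   }
+ */
+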
+/*
+ * rsElementGetSubElementName: Name of a sub-element
+ *
+ * For complex Elements, this function returns the name of the sub-element at the
+ * specified index.
+ *
+ * Parameters:
+ *   e: Element to get data from.
+ *   index: Index of the sub-element.
+ *   name: Address of the array to store the name into.
+ *   nameLength: Length of the provided name array.
+ *
+ * Returns: Number of characters copied, excluding the null terminator.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+extern uint32_t __attribute__((overloadable))
+    rsElementGetSubElementName(rs_element e, uint32_t index, char* name, uint32_t nameLength);
+#endif
+
+/*
+ * rsElementGetSubElementNameLength: Length of the name of a sub-element
+ *
+ * For complex Elements, this function returns the length of the name of the sub-element
+ * at the specified index.
+ *
+ * Parameters:
+ *   e: Element to get data from.
+ *   index: Index of the sub-element.
+ *
+ * Returns: Length of the sub-element name including the null terminator.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+extern uint32_t __attribute__((overloadable))
+    rsElementGetSubElementNameLength(rs_element e, uint32_t index);
+#endif
+
+/*
+ * rsElementGetSubElementOffsetBytes: Offset of the instantiated sub-element
+ *
+ * This function returns the relative position of the instantiation of the specified
+ * sub-element within the instantiation of the Element.
+ *
+ * For example, if the Element describes a 32 bit float followed by a 32 bit integer,
+ * the offset returned for the first will be 0 and for the second, 4.
+ *
+ * Parameters:
+ *   e: Element to get data from.
+ *   index: Index of the sub-element.
+ *
+ * Returns: Offset in bytes.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+extern uint32_t __attribute__((overloadable))
+    rsElementGetSubElementOffsetBytes(rs_element e, uint32_t index);
+#endif
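+
+/* Editor's sketch, not part of the generated API documentation: walking the
+ * sub-elements of a struct Element and logging each field's name and byte
+ * offset.  Assumes the allocation's Element actually describes a struct. */
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+static void logSubElements(rs_allocation a) {
+    rs_element e = rsAllocationGetElement(a);
+    uint32_t count = rsElementGetSubElementCount(e);  // 0 for simple Elements.
+    for (uint32_t i = 0; i < count; i++) {
+        char name[64];
+        rsElementGetSubElementName(e, i, name, sizeof(name));
+        rsDebug(name, (int)rsElementGetSubElementOffsetBytes(e, i));
+    }
+}
+#endif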
+
+/*
+ * rsElementGetVectorSize: Vector size of the Element
+ *
+ * Returns the Element's vector size.  If the Element does not represent a vector,
+ * 1 is returned.
+ *
+ * Parameters:
+ *   e: Element to get data from.
+ *
+ * Returns: Length of the element vector.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+extern uint32_t __attribute__((overloadable))
+    rsElementGetVectorSize(rs_element e);
+#endif
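+
+/* Editor's sketch, not part of the generated API documentation: a hypothetical
+ * helper that checks whether an Element describes float4 data by combining
+ * rsElementGetDataType() and rsElementGetVectorSize(). */
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+static bool isFloat4Element(rs_element e) {
+    // float4 means a base data type of 32 bit float with a vector size of 4.
+    return (rsElementGetDataType(e) == RS_TYPE_FLOAT_32) &&
+           (rsElementGetVectorSize(e) == 4);
+}
+#endif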
+
+/*
+ * rsGetAllocation: Return the Allocation for a given pointer
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Returns the Allocation for a given pointer.  The pointer should point within a valid
+ * allocation.  The results are undefined if the pointer is not from a valid Allocation.
+ */
+extern rs_allocation __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated("This function is deprecated and will be removed from the SDK in a future release.")
+#endif
+))
+    rsGetAllocation(const void* p);
+
+/*
+ * rsSamplerGetAnisotropy: Anisotropy of the Sampler
+ *
+ * Get the Sampler's anisotropy.
+ *
+ * See android.renderscript.S.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+extern float __attribute__((overloadable))
+    rsSamplerGetAnisotropy(rs_sampler s);
+#endif
+
+/*
+ * rsSamplerGetMagnification: Sampler magnification value
+ *
+ * Get the Sampler's magnification value.
+ *
+ * See android.renderscript.S.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+extern rs_sampler_value __attribute__((overloadable))
+    rsSamplerGetMagnification(rs_sampler s);
+#endif
+
+/*
+ * rsSamplerGetMinification: Sampler minification value
+ *
+ * Get the Sampler's minification value.
+ *
+ * See android.renderscript.S.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+extern rs_sampler_value __attribute__((overloadable))
+    rsSamplerGetMinification(rs_sampler s);
+#endif
+
+/*
+ * rsSamplerGetWrapS: Sampler wrap S value
+ *
+ * Get the Sampler's wrap S value.
+ *
+ * See android.renderscript.S.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+extern rs_sampler_value __attribute__((overloadable))
+    rsSamplerGetWrapS(rs_sampler s);
+#endif
+
+/*
+ * rsSamplerGetWrapT: Sampler wrap T value
+ *
+ * Get the sampler's wrap T value.
+ *
+ * See android.renderscript.S.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+extern rs_sampler_value __attribute__((overloadable))
+    rsSamplerGetWrapT(rs_sampler s);
+#endif
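+
+/* Editor's sketch, not part of the generated API documentation: checking that
+ * a sampler clamps in both directions before using it for edge-sensitive
+ * filtering. */
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+static bool samplerClampsBothAxes(rs_sampler s) {
+    return (rsSamplerGetWrapS(s) == RS_SAMPLER_CLAMP) &&
+           (rsSamplerGetWrapT(s) == RS_SAMPLER_CLAMP);
+}
+#endif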
+
+/*
+ * rsSetObject: For internal use.
+ *
+ */
+extern void __attribute__((overloadable))
+    rsSetObject(rs_element* dst, rs_element src);
+
+extern void __attribute__((overloadable))
+    rsSetObject(rs_type* dst, rs_type src);
+
+extern void __attribute__((overloadable))
+    rsSetObject(rs_allocation* dst, rs_allocation src);
+
+extern void __attribute__((overloadable))
+    rsSetObject(rs_sampler* dst, rs_sampler src);
+
+extern void __attribute__((overloadable))
+    rsSetObject(rs_script* dst, rs_script src);
+
+#endif // RENDERSCRIPT_RS_OBJECT_INFO_RSH
diff --git a/23.0.3/include/rs_object_types.rsh b/23.0.3/include/rs_object_types.rsh
new file mode 100644
index 0000000..9b11b32
--- /dev/null
+++ b/23.0.3/include/rs_object_types.rsh
@@ -0,0 +1,215 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Don't edit this file!  It is auto-generated by frameworks/rs/api/generate.sh.
+
+/*
+ * rs_object_types.rsh: Object Types
+ *
+ * The types below are used to manipulate RenderScript objects like allocations, samplers,
+ * elements, and scripts.  Most of these objects are created using the Java RenderScript APIs.
+ */
+
+#ifndef RENDERSCRIPT_RS_OBJECT_TYPES_RSH
+#define RENDERSCRIPT_RS_OBJECT_TYPES_RSH
+
+#define NULL ((void *)0)
+
+// Opaque handle to a RenderScript object. Do not use this directly.
+#ifndef __LP64__
+#define _RS_HANDLE \
+struct {\
+  const int* const p;\
+} __attribute__((packed, aligned(4)))
+#else
+#define _RS_HANDLE \
+struct {\
+  const long* const p;\
+  const long* const r;\
+  const long* const v1;\
+  const long* const v2;\
+}
+#endif
+
+/*
+ * rs_element: Handle to an element
+ *
+ * An opaque handle to a RenderScript element.
+ *
+ * See android.renderscript.Element.
+ */
+typedef _RS_HANDLE rs_element;
+
+/*
+ * rs_type: Handle to a Type
+ *
+ * An opaque handle to a RenderScript type.
+ *
+ * See android.renderscript.Type.
+ */
+typedef _RS_HANDLE rs_type;
+
+/*
+ * rs_allocation: Handle to an allocation
+ *
+ * An opaque handle to a RenderScript allocation.
+ *
+ * See android.renderscript.Allocation.
+ */
+typedef _RS_HANDLE rs_allocation;
+
+/*
+ * rs_sampler: Handle to a Sampler
+ *
+ * An opaque handle to a RenderScript sampler object.
+ *
+ * See android.renderscript.Sampler.
+ */
+typedef _RS_HANDLE rs_sampler;
+
+/*
+ * rs_script: Handle to a Script
+ *
+ * An opaque handle to a RenderScript script object.
+ *
+ * See android.renderscript.ScriptC.
+ */
+typedef _RS_HANDLE rs_script;
+
+/*
+ * rs_allocation_cubemap_face: Enum for selecting cube map faces
+ *
+ * An enum used to specify one of the six faces of a cubemap.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+typedef enum {
+    RS_ALLOCATION_CUBEMAP_FACE_POSITIVE_X = 0,
+    RS_ALLOCATION_CUBEMAP_FACE_NEGATIVE_X = 1,
+    RS_ALLOCATION_CUBEMAP_FACE_POSITIVE_Y = 2,
+    RS_ALLOCATION_CUBEMAP_FACE_NEGATIVE_Y = 3,
+    RS_ALLOCATION_CUBEMAP_FACE_POSITIVE_Z = 4,
+    RS_ALLOCATION_CUBEMAP_FACE_NEGATIVE_Z = 5
+} rs_allocation_cubemap_face;
+#endif
+
+/*
+ * rs_allocation_usage_type: Bitfield to specify how an allocation is used
+ *
+ * These values are ORed together to specify which usages or memory spaces are
+ * relevant to an allocation or an operation on an allocation.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+typedef enum {
+    RS_ALLOCATION_USAGE_SCRIPT = 0x0001, // Allocation is bound to and accessed by scripts.
+    RS_ALLOCATION_USAGE_GRAPHICS_TEXTURE = 0x0002, // Deprecated.
+    RS_ALLOCATION_USAGE_GRAPHICS_VERTEX = 0x0004, // Deprecated.
+    RS_ALLOCATION_USAGE_GRAPHICS_CONSTANTS = 0x0008, // Deprecated.
+    RS_ALLOCATION_USAGE_GRAPHICS_RENDER_TARGET = 0x0010, // Deprecated.
+    RS_ALLOCATION_USAGE_IO_INPUT = 0x0020, // Allocation is used as a Surface consumer.
+    RS_ALLOCATION_USAGE_IO_OUTPUT = 0x0040, // Allocation is used as a Surface producer.
+    RS_ALLOCATION_USAGE_SHARED = 0x0080 // Allocation's backing store is shared with another object (usually a Bitmap).  Copying to or from the original source Bitmap will cause a synchronization rather than a full copy.
+} rs_allocation_usage_type;
+#endif
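+
+/* Editor's note, not part of the generated documentation: usage flags are
+ * combined with bitwise OR.  A hypothetical allocation that scripts access and
+ * that also feeds a Surface would use: */
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+static const int kExampleUsageFlags = RS_ALLOCATION_USAGE_SCRIPT | RS_ALLOCATION_USAGE_IO_OUTPUT;
+#endif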
+
+/*
+ * rs_data_type: Element basic data type
+ *
+ * rs_data_type is used to encode the type information of a basic element.
+ *
+ * RS_TYPE_UNSIGNED_5_6_5, RS_TYPE_UNSIGNED_5_5_5_1, RS_TYPE_UNSIGNED_4_4_4_4 are for packed
+ * graphical data formats and represent vectors with per vector member sizes which are treated
+ * as a single unit for packing and alignment purposes.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+typedef enum {
+    RS_TYPE_NONE = 0, // Element is a complex type, i.e. a struct.
+    RS_TYPE_FLOAT_32 = 2, // A 32 bit floating point value.
+    RS_TYPE_FLOAT_64 = 3, // A 64 bit floating point value.
+    RS_TYPE_SIGNED_8 = 4, // An 8 bit signed integer.
+    RS_TYPE_SIGNED_16 = 5, // A 16 bit signed integer.
+    RS_TYPE_SIGNED_32 = 6, // A 32 bit signed integer.
+    RS_TYPE_SIGNED_64 = 7, // A 64 bit signed integer.
+    RS_TYPE_UNSIGNED_8 = 8, // An 8 bit unsigned integer.
+    RS_TYPE_UNSIGNED_16 = 9, // A 16 bit unsigned integer.
+    RS_TYPE_UNSIGNED_32 = 10, // A 32 bit unsigned integer.
+    RS_TYPE_UNSIGNED_64 = 11, // A 64 bit unsigned integer.
+    RS_TYPE_BOOLEAN = 12, // 0 or 1 (false or true) stored in an 8 bit container.
+    RS_TYPE_UNSIGNED_5_6_5 = 13, // A 16 bit unsigned integer packing graphical data in 5, 6, and 5 bit sections.
+    RS_TYPE_UNSIGNED_5_5_5_1 = 14, // A 16 bit unsigned integer packing graphical data in 5, 5, 5, and 1 bit sections.
+    RS_TYPE_UNSIGNED_4_4_4_4 = 15, // A 16 bit unsigned integer packing graphical data in 4, 4, 4, and 4 bit sections.
+    RS_TYPE_MATRIX_4X4 = 16, // A 4x4 matrix of 32 bit floats, aligned on a 32 bit boundary.
+    RS_TYPE_MATRIX_3X3 = 17, // A 3x3 matrix of 32 bit floats, aligned on a 32 bit boundary.
+    RS_TYPE_MATRIX_2X2 = 18, // A 2x2 matrix of 32 bit floats, aligned on a 32 bit boundary.
+    RS_TYPE_ELEMENT = 1000, // A handle to an Element.
+    RS_TYPE_TYPE = 1001, // A handle to a Type.
+    RS_TYPE_ALLOCATION = 1002, // A handle to an Allocation.
+    RS_TYPE_SAMPLER = 1003, // A handle to a Sampler.
+    RS_TYPE_SCRIPT = 1004, // A handle to a Script.
+    RS_TYPE_MESH = 1005, // Deprecated.
+    RS_TYPE_PROGRAM_FRAGMENT = 1006, // Deprecated.
+    RS_TYPE_PROGRAM_VERTEX = 1007, // Deprecated.
+    RS_TYPE_PROGRAM_RASTER = 1008, // Deprecated.
+    RS_TYPE_PROGRAM_STORE = 1009, // Deprecated.
+    RS_TYPE_FONT = 1010, // Deprecated.
+    RS_TYPE_INVALID = 10000
+} rs_data_type;
+#endif
+
+/*
+ * rs_data_kind: Element data kind
+ *
+ * This enumeration is primarily useful for graphical data.  It provides additional information to
+ * help interpret the rs_data_type.
+ *
+ * RS_KIND_USER indicates no special interpretation is expected.
+ *
+ * The RS_KIND_PIXEL_* values are used in conjunction with the standard data types for representing
+ * texture formats.
+ *
+ * See the Element.createPixel() method.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+typedef enum {
+    RS_KIND_USER         = 0, // No special interpretation.
+    RS_KIND_PIXEL_L      = 7, // Luminance.
+    RS_KIND_PIXEL_A      = 8, // Alpha.
+    RS_KIND_PIXEL_LA     = 9, // Luminance and Alpha.
+    RS_KIND_PIXEL_RGB    = 10, // Red, Green, Blue.
+    RS_KIND_PIXEL_RGBA   = 11, // Red, Green, Blue, and Alpha.
+    RS_KIND_PIXEL_DEPTH  = 12, // Depth for a depth texture.
+    RS_KIND_PIXEL_YUV    = 13, // Luminance and chrominance.
+    RS_KIND_INVALID      = 100
+} rs_data_kind;
+#endif
+
+/*
+ * rs_sampler_value: Sampler values
+ *
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+typedef enum {
+    RS_SAMPLER_NEAREST = 0,
+    RS_SAMPLER_LINEAR = 1,
+    RS_SAMPLER_LINEAR_MIP_LINEAR = 2,
+    RS_SAMPLER_WRAP = 3,
+    RS_SAMPLER_CLAMP = 4,
+    RS_SAMPLER_LINEAR_MIP_NEAREST = 5,
+    RS_SAMPLER_MIRRORED_REPEAT = 6,
+    RS_SAMPLER_INVALID = 100
+} rs_sampler_value;
+#endif
+
+#endif // RENDERSCRIPT_RS_OBJECT_TYPES_RSH
diff --git a/23.0.3/include/rs_quaternion.rsh b/23.0.3/include/rs_quaternion.rsh
new file mode 100644
index 0000000..524eeb0
--- /dev/null
+++ b/23.0.3/include/rs_quaternion.rsh
@@ -0,0 +1,290 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Don't edit this file!  It is auto-generated by frameworks/rs/api/generate.sh.
+
+/*
+ * rs_quaternion.rsh: Quaternion Functions
+ *
+ * The following functions manipulate quaternions.
+ */
+
+#ifndef RENDERSCRIPT_RS_QUATERNION_RSH
+#define RENDERSCRIPT_RS_QUATERNION_RSH
+
+/*
+ * rsQuaternionAdd: Add two quaternions
+ *
+ * Adds two quaternions, i.e. *q += *rhs;
+ *
+ * Parameters:
+ *   q: Destination quaternion to add to.
+ *   rhs: Quaternion to add.
+ */
+static inline void __attribute__((overloadable))
+    rsQuaternionAdd(rs_quaternion* q, const rs_quaternion* rhs) {
+    q->w += rhs->w;
+    q->x += rhs->x;
+    q->y += rhs->y;
+    q->z += rhs->z;
+}
+
+/*
+ * rsQuaternionConjugate: Conjugate a quaternion
+ *
+ * Conjugates the quaternion.
+ *
+ * Parameters:
+ *   q: Quaternion to modify.
+ */
+static inline void __attribute__((overloadable))
+    rsQuaternionConjugate(rs_quaternion* q) {
+    q->x = -q->x;
+    q->y = -q->y;
+    q->z = -q->z;
+}
+
+/*
+ * rsQuaternionDot: Dot product of two quaternions
+ *
+ * Returns the dot product of two quaternions.
+ *
+ * Parameters:
+ *   q0: First quaternion.
+ *   q1: Second quaternion.
+ */
+static inline float __attribute__((overloadable))
+    rsQuaternionDot(const rs_quaternion* q0, const rs_quaternion* q1) {
+    return q0->w*q1->w + q0->x*q1->x + q0->y*q1->y + q0->z*q1->z;
+}
+
+/*
+ * rsQuaternionGetMatrixUnit: Get a rotation matrix from a quaternion
+ *
+ * Computes a rotation matrix from the normalized quaternion.
+ *
+ * Parameters:
+ *   m: Resulting matrix.
+ *   q: Normalized quaternion.
+ */
+static inline void __attribute__((overloadable))
+    rsQuaternionGetMatrixUnit(rs_matrix4x4* m, const rs_quaternion* q) {
+    float xx = q->x * q->x;
+    float xy = q->x * q->y;
+    float xz = q->x * q->z;
+    float xw = q->x * q->w;
+    float yy = q->y * q->y;
+    float yz = q->y * q->z;
+    float yw = q->y * q->w;
+    float zz = q->z * q->z;
+    float zw = q->z * q->w;
+
+    m->m[0]  = 1.0f - 2.0f * ( yy + zz );
+    m->m[4]  =        2.0f * ( xy - zw );
+    m->m[8]  =        2.0f * ( xz + yw );
+    m->m[1]  =        2.0f * ( xy + zw );
+    m->m[5]  = 1.0f - 2.0f * ( xx + zz );
+    m->m[9]  =        2.0f * ( yz - xw );
+    m->m[2]  =        2.0f * ( xz - yw );
+    m->m[6]  =        2.0f * ( yz + xw );
+    m->m[10] = 1.0f - 2.0f * ( xx + yy );
+    m->m[3]  = m->m[7] = m->m[11] = m->m[12] = m->m[13] = m->m[14] = 0.0f;
+    m->m[15] = 1.0f;
+}
+
+/*
+ * rsQuaternionLoadRotateUnit: Quaternion that represents a rotation about an arbitrary unit vector
+ *
+ * Loads a quaternion that represents a rotation about an arbitrary unit vector.
+ *
+ * Parameters:
+ *   q: Destination quaternion.
+ *   rot: Angle to rotate by, in degrees (converted to radians internally).
+ *   x: X component of the vector.
+ *   y: Y component of the vector.
+ *   z: Z component of the vector.
+ */
+static inline void __attribute__((overloadable))
+    rsQuaternionLoadRotateUnit(rs_quaternion* q, float rot, float x, float y, float z) {
+    rot *= (float)(M_PI / 180.0f) * 0.5f;
+    float c = cos(rot);
+    float s = sin(rot);
+
+    q->w = c;
+    q->x = x * s;
+    q->y = y * s;
+    q->z = z * s;
+}
+
+/*
+ * rsQuaternionSet: Create a quaternion
+ *
+ * Creates a quaternion from its four components or from another quaternion.
+ *
+ * Parameters:
+ *   q: Destination quaternion.
+ *   w: W component.
+ *   x: X component.
+ *   y: Y component.
+ *   z: Z component.
+ *   rhs: Source quaternion.
+ */
+static inline void __attribute__((overloadable))
+    rsQuaternionSet(rs_quaternion* q, float w, float x, float y, float z) {
+    q->w = w;
+    q->x = x;
+    q->y = y;
+    q->z = z;
+}
+
+static inline void __attribute__((overloadable))
+    rsQuaternionSet(rs_quaternion* q, const rs_quaternion* rhs) {
+    q->w = rhs->w;
+    q->x = rhs->x;
+    q->y = rhs->y;
+    q->z = rhs->z;
+}
+
+/*
+ * rsQuaternionLoadRotate: Create a rotation quaternion
+ *
+ * Loads a quaternion that represents a rotation about an arbitrary vector
+ * (it does not have to be a unit vector).
+ *
+ * Parameters:
+ *   q: Destination quaternion.
+ *   rot: Angle to rotate by.
+ *   x: X component of a vector.
+ *   y: Y component of a vector.
+ *   z: Z component of a vector.
+ */
+static inline void __attribute__((overloadable))
+    rsQuaternionLoadRotate(rs_quaternion* q, float rot, float x, float y, float z) {
+    const float len = x*x + y*y + z*z;
+    if (len != 1) {
+        const float recipLen = 1.f / sqrt(len);
+        x *= recipLen;
+        y *= recipLen;
+        z *= recipLen;
+    }
+    rsQuaternionLoadRotateUnit(q, rot, x, y, z);
+}
+
+/*
+ * rsQuaternionNormalize: Normalize a quaternion
+ *
+ * Normalizes the quaternion.
+ *
+ * Parameters:
+ *   q: Quaternion to normalize.
+ */
+static inline void __attribute__((overloadable))
+    rsQuaternionNormalize(rs_quaternion* q) {
+    const float len = rsQuaternionDot(q, q);
+    if (len != 1) {
+        const float recipLen = 1.f / sqrt(len);
+        q->w *= recipLen;
+        q->x *= recipLen;
+        q->y *= recipLen;
+        q->z *= recipLen;
+    }
+}
+
+/*
+ * rsQuaternionMultiply: Multiply a quaternion by a scalar or another quaternion
+ *
+ * Multiplies a quaternion by a scalar or by another quaternion, e.g.
+ * *q = *q * scalar; or *q = *q * *rhs;.
+ *
+ * Parameters:
+ *   q: Destination quaternion.
+ *   scalar: Scalar to multiply the quaternion by.
+ *   rhs: Quaternion to multiply the destination quaternion by.
+ */
+static inline void __attribute__((overloadable))
+    rsQuaternionMultiply(rs_quaternion* q, float scalar) {
+    q->w *= scalar;
+    q->x *= scalar;
+    q->y *= scalar;
+    q->z *= scalar;
+}
+
+static inline void __attribute__((overloadable))
+    rsQuaternionMultiply(rs_quaternion* q, const rs_quaternion* rhs) {
+    rs_quaternion qtmp;
+    rsQuaternionSet(&qtmp, q);
+
+    q->w = qtmp.w*rhs->w - qtmp.x*rhs->x - qtmp.y*rhs->y - qtmp.z*rhs->z;
+    q->x = qtmp.w*rhs->x + qtmp.x*rhs->w + qtmp.y*rhs->z - qtmp.z*rhs->y;
+    q->y = qtmp.w*rhs->y + qtmp.y*rhs->w + qtmp.z*rhs->x - qtmp.x*rhs->z;
+    q->z = qtmp.w*rhs->z + qtmp.z*rhs->w + qtmp.x*rhs->y - qtmp.y*rhs->x;
+    rsQuaternionNormalize(q);
+}
+
+/*
+ * rsQuaternionSlerp: Spherical linear interpolation between two quaternions
+ *
+ * Performs spherical linear interpolation between two quaternions.
+ *
+ * Parameters:
+ *   q: Result quaternion from the interpolation.
+ *   q0: First input quaternion.
+ *   q1: Second input quaternion.
+ *   t: How much to interpolate by.
+ */
+static inline void __attribute__((overloadable))
+    rsQuaternionSlerp(rs_quaternion* q, const rs_quaternion* q0, const rs_quaternion* q1, float t) {
+    if (t <= 0.0f) {
+        rsQuaternionSet(q, q0);
+        return;
+    }
+    if (t >= 1.0f) {
+        rsQuaternionSet(q, q1);
+        return;
+    }
+
+    rs_quaternion tempq0, tempq1;
+    rsQuaternionSet(&tempq0, q0);
+    rsQuaternionSet(&tempq1, q1);
+
+    float angle = rsQuaternionDot(q0, q1);
+    if (angle < 0) {
+        rsQuaternionMultiply(&tempq0, -1.0f);
+        angle *= -1.0f;
+    }
+
+    float scale, invScale;
+    if (angle + 1.0f > 0.05f) {
+        if (1.0f - angle >= 0.05f) {
+            float theta = acos(angle);
+            float invSinTheta = 1.0f / sin(theta);
+            scale = sin(theta * (1.0f - t)) * invSinTheta;
+            invScale = sin(theta * t) * invSinTheta;
+        } else {
+            scale = 1.0f - t;
+            invScale = t;
+        }
+    } else {
+        rsQuaternionSet(&tempq1, tempq0.z, -tempq0.y, tempq0.x, -tempq0.w);
+        scale = sin(M_PI * (0.5f - t));
+        invScale = sin(M_PI * t);
+    }
+
+    rsQuaternionSet(q, tempq0.w*scale + tempq1.w*invScale, tempq0.x*scale + tempq1.x*invScale,
+                        tempq0.y*scale + tempq1.y*invScale, tempq0.z*scale + tempq1.z*invScale);
+}
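+
+/* Editor's sketch, not part of the generated documentation: interpolate half
+ * way between two rotations about the Y axis and convert the result to a
+ * matrix.  Angles are in the units expected by rsQuaternionLoadRotate. */
+static inline void exampleSlerpToMatrix(rs_matrix4x4* out) {
+    rs_quaternion qa, qb, qr;
+    rsQuaternionLoadRotate(&qa, 0.0f, 0.0f, 1.0f, 0.0f);   // Identity rotation about the Y axis.
+    rsQuaternionLoadRotate(&qb, 90.0f, 0.0f, 1.0f, 0.0f);  // 90 degree rotation about the Y axis.
+    rsQuaternionSlerp(&qr, &qa, &qb, 0.5f);                // Half way between the two.
+    rsQuaternionNormalize(&qr);                            // Ensure unit length before the matrix call.
+    rsQuaternionGetMatrixUnit(out, &qr);
+}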
+
+#endif // RENDERSCRIPT_RS_QUATERNION_RSH
diff --git a/23.0.3/include/rs_time.rsh b/23.0.3/include/rs_time.rsh
new file mode 100644
index 0000000..05987f8
--- /dev/null
+++ b/23.0.3/include/rs_time.rsh
@@ -0,0 +1,126 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Don't edit this file!  It is auto-generated by frameworks/rs/api/generate.sh.
+
+/*
+ * rs_time.rsh: Time Functions and Types
+ *
+ * The functions below can be used to tell the current clock time and the current
+ * system uptime.  It is not recommended to call these functions inside a kernel.
+ */
+
+#ifndef RENDERSCRIPT_RS_TIME_RSH
+#define RENDERSCRIPT_RS_TIME_RSH
+
+/*
+ * rs_time_t: Seconds since January 1, 1970
+ *
+ * Calendar time interpreted as seconds elapsed since the Epoch (00:00:00 on
+ * January 1, 1970, Coordinated Universal Time (UTC)).
+ */
+#ifndef __LP64__
+typedef int rs_time_t;
+#endif
+
+#ifdef __LP64__
+typedef long rs_time_t;
+#endif
+
+/*
+ * rs_tm: Date and time structure
+ *
+ * Data structure for broken-down time components.
+ */
+typedef struct {
+    int tm_sec; // Seconds after the minute. This ranges from 0 to 59, but possibly up to 60 for leap seconds.
+    int tm_min; // Minutes after the hour. This ranges from 0 to 59.
+    int tm_hour; // Hours past midnight. This ranges from 0 to 23.
+    int tm_mday; // Day of the month. This ranges from 1 to 31.
+    int tm_mon; // Months since January. This ranges from 0 to 11.
+    int tm_year; // Years since 1900.
+    int tm_wday; // Days since Sunday. This ranges from 0 to 6.
+    int tm_yday; // Days since January 1. This ranges from 0 to 365.
+    int tm_isdst; // Flag to indicate whether daylight saving time is in effect. The value is positive if it is in effect, zero if it is not, and negative if the information is not available.
+} rs_tm;
+
+/*
+ * rsGetDt: Elapsed time since last call
+ *
+ * Returns the time in seconds since this function was last called in this script.
+ *
+ * Returns: Time in seconds.
+ */
+extern float __attribute__((overloadable))
+    rsGetDt(void);
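+
+/* Editor's sketch, not part of the generated documentation: a typical use of
+ * rsGetDt() is advancing an animation by the time elapsed since the previous
+ * frame.  gAngle and kDegreesPerSecond are hypothetical script globals. */
+static float gAngle = 0.0f;
+static const float kDegreesPerSecond = 45.0f;
+static void stepAnimation(void) {
+    gAngle += kDegreesPerSecond * rsGetDt();  // Scale the per-second rate by the elapsed seconds.
+}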
+
+/*
+ * rsLocaltime: Convert to local time
+ *
+ * Converts the time specified by timer into a rs_tm structure that provides year, month,
+ * hour, etc.  This value is stored at *local.
+ *
+ * This function returns the same pointer that is passed as the first argument.  If the
+ * local parameter is NULL, this function does nothing and returns NULL.
+ *
+ * Parameters:
+ *   local: Pointer to time structure where the local time will be stored.
+ *   timer: Input time as a number of seconds since January 1, 1970.
+ *
+ * Returns: Pointer to the output local time, i.e. the same value as the parameter local.
+ */
+extern rs_tm* __attribute__((overloadable))
+    rsLocaltime(rs_tm* local, const rs_time_t* timer);
+
+/*
+ * rsTime: Seconds since January 1, 1970
+ *
+ * Returns the number of seconds since the Epoch (00:00:00 UTC, January 1, 1970).
+ *
+ * If timer is non-NULL, the result is also stored in the memory pointed to by
+ * this variable.
+ *
+ * Parameters:
+ *   timer: Location to also store the returned calendar time.
+ *
+ * Returns: Seconds since the Epoch, -1 if there's an error.
+ */
+extern rs_time_t __attribute__((overloadable))
+    rsTime(rs_time_t* timer);
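+
+/* Editor's sketch, not part of the generated documentation: fetch the current
+ * calendar time and log the local hour.  As noted above, calling these
+ * functions from inside a kernel is discouraged. */
+static void logLocalHour(void) {
+    rs_time_t now = rsTime(NULL);
+    rs_tm local;
+    rsLocaltime(&local, &now);
+    rsDebug("local hour", local.tm_hour);  // tm_hour ranges from 0 to 23.
+}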
+
+/*
+ * rsUptimeMillis: System uptime in milliseconds
+ *
+ * Returns the current system clock (uptime) in milliseconds.
+ *
+ * Returns: Uptime in milliseconds.
+ */
+extern int64_t __attribute__((overloadable))
+    rsUptimeMillis(void);
+
+/*
+ * rsUptimeNanos: System uptime in nanoseconds
+ *
+ * Returns the current system clock (uptime) in nanoseconds.
+ *
+ * The granularity of the values returned by this call may be much larger than a nanosecond.
+ *
+ * Returns: Uptime in nanoseconds.
+ */
+extern int64_t __attribute__((overloadable))
+    rsUptimeNanos(void);
+
+#endif // RENDERSCRIPT_RS_TIME_RSH
diff --git a/23.0.3/include/rs_value_types.rsh b/23.0.3/include/rs_value_types.rsh
new file mode 100644
index 0000000..faf397c
--- /dev/null
+++ b/23.0.3/include/rs_value_types.rsh
@@ -0,0 +1,543 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Don't edit this file!  It is auto-generated by frameworks/rs/api/generate.sh.
+
+/*
+ * rs_value_types.rsh: Numerical Types
+ *
+ * Scalars:
+ *
+ * RenderScript supports the following scalar numerical types:
+ *
+ *                    8 bits           16 bits            32 bits          64 bits
+ * Integer:           char, int8_t     short, int16_t     int32_t          long, long long, int64_t
+ * Unsigned integer:  uchar, uint8_t   ushort, uint16_t   uint, uint32_t   ulong, uint64_t
+ * Floating point:                     half               float            double
+ *
+ *
+ * Vectors:
+ *
+ * RenderScript supports fixed size vectors of length 2, 3, and 4.
+ * Vectors are declared using the common type name followed by a 2, 3, or 4.
+ * E.g. float4, int3, double2, ulong4.
+ *
+ * To create vector literals, use the vector type followed by the values enclosed
+ * between parentheses, e.g. (float3)(1.0f, 2.0f, 3.0f).
+ *
+ * Entries of a vector can be accessed using different naming styles.
+ *
+ * Single entries can be accessed by following the variable name with a dot and:
+ * - The letters x, y, z, and w,
+ * - The letters r, g, b, and a,
+ * - The letter s or S, followed by a zero based index.
+ *
+ * For example, with int4 myVar; the following are equivalent:
+ *   myVar.x == myVar.r == myVar.s0 == myVar.S0
+ *   myVar.y == myVar.g == myVar.s1 == myVar.S1
+ *   myVar.z == myVar.b == myVar.s2 == myVar.S2
+ *   myVar.w == myVar.a == myVar.s3 == myVar.S3
+ *
+ * Multiple entries of a vector can be accessed at once by using an identifier that is
+ * the concatenation of multiple letters or indices.  The resulting vector has a size
+ * equal to the number of entries named.
+ *
+ * With the example above, the middle two entries can be accessed using
+ * myVar.yz, myVar.gb, myVar.s12, and myVar.S12.
+ *
+ * The entries don't have to be contiguous or in increasing order.  Entries can even be
+ * repeated, as long as we're not trying to assign to them.  You also can't mix the naming
+ * styles.
+ *
+ * Here are examples of what can or can't be done:
+ * float4 v4;
+ * float3 v3;
+ * float2 v2;
+ * v2 = v4.xx; // Valid
+ * v3 = v4.zxw; // Valid
+ * v3 = v4.bba; // Valid
+ * v3 = v4.s032; // Valid
+ * v3.s120 = v4.S233; // Valid
+ * v4.yz = v3.rg; // Valid
+ * v4.yzx = v3.rg; // Invalid: mismatched sizes
+ * v4.yzz = v3; // Invalid: z appears twice in an assignment
+ * v3 = v3.xas0; // Invalid: can't mix xyzw with rgba nor s0...
+ * v3 = v4.s034; // Invalid: the digit can only be 0, 1, 2, or 3
+ *
+ *
+ * Matrices and Quaternions:
+ *
+ * RenderScript supports fixed size square matrices of floats of size 2x2, 3x3, and 4x4.
+ * The types are named rs_matrix2x2, rs_matrix3x3, and rs_matrix4x4.  See
+ * Matrix Functions for the list of operations.
+ *
+ * Quaternions are also supported via rs_quaternion.  See Quaternion Functions for the list
+ * of operations.
+ */
+
+#ifndef RENDERSCRIPT_RS_VALUE_TYPES_RSH
+#define RENDERSCRIPT_RS_VALUE_TYPES_RSH
+
+/*
+ * half: 16 bit floating point value
+ *
+ * A 16 bit floating point value.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+typedef __fp16 half;
+#endif
+
+/*
+ * half2: Two 16 bit floats
+ *
+ * Vector version of the half float type. Provides two half fields packed
+ * into a single 32 bit field with 32 bit alignment.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+typedef half __attribute__((ext_vector_type(2))) half2;
+#endif
+
+/*
+ * half3: Three 16 bit floats
+ *
+ * Vector version of the half float type. Provides three half fields packed
+ * into a single 64 bit field with 64 bit alignment.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+typedef half __attribute__((ext_vector_type(3))) half3;
+#endif
+
+/*
+ * half4: Four 16 bit floats
+ *
+ * Vector version of the half float type. Provides four half fields packed
+ * into a single 64 bit field with 64 bit alignment.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+typedef half __attribute__((ext_vector_type(4))) half4;
+#endif
+
+/*
+ * int8_t: 8 bit signed integer
+ *
+ * 8 bit signed integer type.
+ */
+typedef char int8_t;
+
+/*
+ * int16_t: 16 bit signed integer
+ *
+ * A 16 bit signed integer type.
+ */
+typedef short int16_t;
+
+/*
+ * int32_t: 32 bit signed integer
+ *
+ * A 32 bit signed integer type.
+ */
+typedef int int32_t;
+
+/*
+ * int64_t: 64 bit signed integer
+ *
+ * A 64 bit signed integer type.
+ */
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+typedef long long int64_t;
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+typedef long int64_t;
+#endif
+
+/*
+ * uint8_t: 8 bit unsigned integer
+ *
+ * 8 bit unsigned integer type.
+ */
+typedef unsigned char uint8_t;
+
+/*
+ * uint16_t: 16 bit unsigned integer
+ *
+ * A 16 bit unsigned integer type.
+ */
+typedef unsigned short uint16_t;
+
+/*
+ * uint32_t: 32 bit unsigned integer
+ *
+ * A 32 bit unsigned integer type.
+ */
+typedef unsigned int uint32_t;
+
+/*
+ * uint64_t: 64 bit unsigned integer
+ *
+ * A 64 bit unsigned integer type.
+ */
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+typedef unsigned long long uint64_t;
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+typedef unsigned long uint64_t;
+#endif
+
+/*
+ * uchar: 8 bit unsigned integer
+ *
+ * 8 bit unsigned integer type.
+ */
+typedef uint8_t uchar;
+
+/*
+ * ushort: 16 bit unsigned integer
+ *
+ * A 16 bit unsigned integer type.
+ */
+typedef uint16_t ushort;
+
+/*
+ * uint: 32 bit unsigned integer
+ *
+ * A 32 bit unsigned integer type.
+ */
+typedef uint32_t uint;
+
+/*
+ * ulong: 64 bit unsigned integer
+ *
+ * A 64 bit unsigned integer type.
+ */
+typedef uint64_t ulong;
+
+/*
+ * size_t: Unsigned size type
+ *
+ * Unsigned size type.  The number of bits depends on the compilation flags.
+ */
+#ifdef __LP64__
+typedef uint64_t size_t;
+#endif
+
+#ifndef __LP64__
+typedef uint32_t size_t;
+#endif
+
+/*
+ * ssize_t: Signed size type
+ *
+ * Signed size type.  The number of bits depends on the compilation flags.
+ */
+#ifdef __LP64__
+typedef int64_t ssize_t;
+#endif
+
+#ifndef __LP64__
+typedef int32_t ssize_t;
+#endif
+
+/*
+ * float2: Two 32 bit floats
+ *
+ * A vector of two floats.  These two floats are packed into a single 64 bit field
+ * with a 64 bit alignment.
+ */
+typedef float __attribute__((ext_vector_type(2))) float2;
+
+/*
+ * float3: Three 32 bit floats
+ *
+ * A vector of three floats.  These three floats are packed into a single 128 bit field
+ * with a 128 bit alignment.
+ */
+typedef float __attribute__((ext_vector_type(3))) float3;
+
+/*
+ * float4: Four 32 bit floats
+ *
+ * A vector of four floats type.  These four floats are packed into a single 128 bit field
+ * with a 128 bit alignment.
+ */
+typedef float __attribute__((ext_vector_type(4))) float4;
+
+/*
+ * double2: Two 64 bit floats
+ *
+ * A vector of two doubles.  These two doubles are packed into a single 128 bit field
+ * with a 128 bit alignment.
+ */
+typedef double __attribute__((ext_vector_type(2))) double2;
+
+/*
+ * double3: Three 64 bit floats
+ *
+ * A vector of three doubles.  These three doubles are packed into a single 256 bit field
+ * with a 256 bit alignment.
+ */
+typedef double __attribute__((ext_vector_type(3))) double3;
+
+/*
+ * double4: Four 64 bit floats
+ *
+ * A vector of four doubles.  These four doubles are packed into a single 256 bit field
+ * with a 256 bit alignment.
+ */
+typedef double __attribute__((ext_vector_type(4))) double4;
+
+/*
+ * uchar2: Two 8 bit unsigned integers
+ *
+ * A vector of two uchars.  These two uchars are packed into a single 16 bit field
+ * with a 16 bit alignment.
+ */
+typedef uchar __attribute__((ext_vector_type(2))) uchar2;
+
+/*
+ * uchar3: Three 8 bit unsigned integers
+ *
+ * A vector of three uchars.  These three uchars are packed into a single 32 bit field
+ * with a 32 bit alignment.
+ */
+typedef uchar __attribute__((ext_vector_type(3))) uchar3;
+
+/*
+ * uchar4: Four 8 bit unsigned integers
+ *
+ * A vector of four uchars.  These four uchars are packed into a single 32 bit field
+ * with a 32 bit alignment.
+ */
+typedef uchar __attribute__((ext_vector_type(4))) uchar4;
+
+/*
+ * ushort2: Two 16 bit unsigned integers
+ *
+ * A vector of two ushorts.  These two ushorts are packed into a single 32 bit field
+ * with a 32 bit alignment.
+ */
+typedef ushort __attribute__((ext_vector_type(2))) ushort2;
+
+/*
+ * ushort3: Three 16 bit unsigned integers
+ *
+ * A vector of three ushorts.  These three ushorts are packed into a single 64 bit field
+ * with a 64 bit alignment.
+ */
+typedef ushort __attribute__((ext_vector_type(3))) ushort3;
+
+/*
+ * ushort4: Four 16 bit unsigned integers
+ *
+ * A vector of four ushorts.  These four ushorts are packed into a single 64 bit field
+ * with a 64 bit alignment.
+ */
+typedef ushort __attribute__((ext_vector_type(4))) ushort4;
+
+/*
+ * uint2: Two 32 bit unsigned integers
+ *
+ * A vector of two uints.  These two uints are packed into a single 64 bit field
+ * with a 64 bit alignment.
+ */
+typedef uint __attribute__((ext_vector_type(2))) uint2;
+
+/*
+ * uint3: Three 32 bit unsigned integers
+ *
+ * A vector of three uints.  These three uints are packed into a single 128 bit field
+ * with a 128 bit alignment.
+ */
+typedef uint __attribute__((ext_vector_type(3))) uint3;
+
+/*
+ * uint4: Four 32 bit unsigned integers
+ *
+ * A vector of four uints.  These four uints are packed into a single 128 bit field
+ * with a 128 bit alignment.
+ */
+typedef uint __attribute__((ext_vector_type(4))) uint4;
+
+/*
+ * ulong2: Two 64 bit unsigned integers
+ *
+ * A vector of two ulongs.  These two ulongs are packed into a single 128 bit field
+ * with a 128 bit alignment.
+ */
+typedef ulong __attribute__((ext_vector_type(2))) ulong2;
+
+/*
+ * ulong3: Three 64 bit unsigned integers
+ *
+ * A vector of three ulongs.  These three ulongs are packed into a single 256 bit field
+ * with a 256 bit alignment.
+ */
+typedef ulong __attribute__((ext_vector_type(3))) ulong3;
+
+/*
+ * ulong4: Four 64 bit unsigned integers
+ *
+ * A vector of four ulongs.  These four ulongs are packed into a single 256 bit field
+ * with a 256 bit alignment.
+ */
+typedef ulong __attribute__((ext_vector_type(4))) ulong4;
+
+/*
+ * char2: Two 8 bit signed integers
+ *
+ * A vector of two chars.  These two chars are packed into a single 16 bit field
+ * with a 16 bit alignment.
+ */
+typedef char __attribute__((ext_vector_type(2))) char2;
+
+/*
+ * char3: Three 8 bit signed integers
+ *
+ * A vector of three chars.  These three chars are packed into a single 32 bit field
+ * with a 32 bit alignment.
+ */
+typedef char __attribute__((ext_vector_type(3))) char3;
+
+/*
+ * char4: Four 8 bit signed integers
+ *
+ * A vector of four chars.  These four chars are packed into a single 32 bit field
+ * with a 32 bit alignment.
+ */
+typedef char __attribute__((ext_vector_type(4))) char4;
+
+/*
+ * short2: Two 16 bit signed integers
+ *
+ * A vector of two shorts.  These two shorts are packed into a single 32 bit field
+ * with a 32 bit alignment.
+ */
+typedef short __attribute__((ext_vector_type(2))) short2;
+
+/*
+ * short3: Three 16 bit signed integers
+ *
+ * A vector of three shorts.  These three shorts are packed into a single 64 bit field
+ * with a 64 bit alignment.
+ */
+typedef short __attribute__((ext_vector_type(3))) short3;
+
+/*
+ * short4: Four 16 bit signed integers
+ *
+ * A vector of four shorts.  These four shorts are packed into a single 64 bit field
+ * with a 64 bit alignment.
+ */
+typedef short __attribute__((ext_vector_type(4))) short4;
+
+/*
+ * int2: Two 32 bit signed integers
+ *
+ * A vector of two ints.  These two ints are packed into a single 64 bit field
+ * with a 64 bit alignment.
+ */
+typedef int __attribute__((ext_vector_type(2))) int2;
+
+/*
+ * int3: Three 32 bit signed integers
+ *
+ * A vector of three ints.  These three ints are packed into a single 128 bit field
+ * with a 128 bit alignment.
+ */
+typedef int __attribute__((ext_vector_type(3))) int3;
+
+/*
+ * int4: Four 32 bit signed integers
+ *
+ * A vector of four ints.  These four ints are packed into a single 128 bit field
+ * with a 128 bit alignment.
+ */
+typedef int __attribute__((ext_vector_type(4))) int4;
+
+/*
+ * long2: Two 64 bit signed integers
+ *
+ * A vector of two longs.  These two longs are packed into a single 128 bit field
+ * with a 128 bit alignment.
+ */
+typedef long __attribute__((ext_vector_type(2))) long2;
+
+/*
+ * long3: Three 64 bit signed integers
+ *
+ * A vector of three longs.  These three longs are packed into a single 256 bit field
+ * with a 256 bit alignment.
+ */
+typedef long __attribute__((ext_vector_type(3))) long3;
+
+/*
+ * long4: Four 64 bit signed integers
+ *
+ * A vector of four longs.  These four longs are packed into a single 256 bit field
+ * with a 256 bit alignment.
+ */
+typedef long __attribute__((ext_vector_type(4))) long4;
+
+/*
+ * rs_matrix2x2: 2x2 matrix of 32 bit floats
+ *
+ * A square 2x2 matrix of floats.  The entries are stored in the array at the
+ * location [row*2 + col].
+ *
+ * See Matrix Functions.
+ */
+typedef struct {
+    float m[4];
+} rs_matrix2x2;
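+
+/* Editor's sketch, not part of the generated documentation: reading entry
+ * (row, col) straight out of the backing array, following the
+ * [row*2 + col] layout described above. */
+static inline float exampleMatrix2x2Get(const rs_matrix2x2* m, uint32_t row, uint32_t col) {
+    return m->m[row * 2 + col];
+}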
+
+/*
+ * rs_matrix3x3: 3x3 matrix of 32 bit floats
+ *
+ * A square 3x3 matrix of floats.  The entries are stored in the array at the
+ * location [row*3 + col].
+ *
+ * See Matrix Functions.
+ */
+typedef struct {
+    float m[9];
+} rs_matrix3x3;
+
+/*
+ * rs_matrix4x4: 4x4 matrix of 32 bit floats
+ *
+ * A square 4x4 matrix of floats.  The entries are stored in the array at the
+ * location [row*4 + col].
+ *
+ * See Matrix Functions.
+ */
+typedef struct {
+    float m[16];
+} rs_matrix4x4;
+
+/*
+ * rs_quaternion: Quaternion
+ *
+ * A vector of four floats that represents a quaternion.
+ *
+ * See Quaternion Functions.
+ */
+typedef float4 rs_quaternion;
+
+#endif // RENDERSCRIPT_RS_VALUE_TYPES_RSH
diff --git a/23.0.3/include/rs_vector_math.rsh b/23.0.3/include/rs_vector_math.rsh
new file mode 100644
index 0000000..a2757e9
--- /dev/null
+++ b/23.0.3/include/rs_vector_math.rsh
@@ -0,0 +1,303 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Don't edit this file!  It is auto-generated by frameworks/rs/api/generate.sh.
+
+/*
+ * rs_vector_math.rsh: Vector Math Functions
+ *
+ * These functions interpret the input arguments as representation of vectors in
+ * n-dimensional space.
+ *
+ * The precision of the mathematical operations on 32 bit floats is affected by the pragmas
+ * rs_fp_relaxed and rs_fp_full.  See Mathematical Constants and Functions for details.
+ *
+ * Different precision/speed tradeoffs can be achieved by using variants of the common math
+ * functions.  Functions with a name starting with
+ * - native_: May have custom hardware implementations with weaker precision.  Additionally,
+ *   subnormal values may be flushed to zero, rounding towards zero may be used, and NaN and
+ *   infinity input may not be handled correctly.
+ * - fast_: May perform internal computations using 16 bit floats.  Additionally, subnormal
+ *   values may be flushed to zero, and rounding towards zero may be used.
+ *
+ */
+
+#ifndef RENDERSCRIPT_RS_VECTOR_MATH_RSH
+#define RENDERSCRIPT_RS_VECTOR_MATH_RSH
+
+/*
+ * cross: Cross product of two vectors
+ *
+ * Computes the cross product of two vectors.
+ */
+extern float3 __attribute__((const, overloadable))
+    cross(float3 left_vector, float3 right_vector);
+
+extern float4 __attribute__((const, overloadable))
+    cross(float4 left_vector, float4 right_vector);
+
+/*
+ * distance: Distance between two points
+ *
+ * Compute the distance between two points.
+ *
+ * See also fast_distance(), native_distance().
+ */
+extern float __attribute__((const, overloadable))
+    distance(float left_vector, float right_vector);
+
+extern float __attribute__((const, overloadable))
+    distance(float2 left_vector, float2 right_vector);
+
+extern float __attribute__((const, overloadable))
+    distance(float3 left_vector, float3 right_vector);
+
+extern float __attribute__((const, overloadable))
+    distance(float4 left_vector, float4 right_vector);
+
+/*
+ * dot: Dot product of two vectors
+ *
+ * Computes the dot product of two vectors.
+ */
+extern float __attribute__((const, overloadable))
+    dot(float left_vector, float right_vector);
+
+extern float __attribute__((const, overloadable))
+    dot(float2 left_vector, float2 right_vector);
+
+extern float __attribute__((const, overloadable))
+    dot(float3 left_vector, float3 right_vector);
+
+extern float __attribute__((const, overloadable))
+    dot(float4 left_vector, float4 right_vector);
+
+/*
+ * fast_distance: Approximate distance between two points
+ *
+ * Computes the approximate distance between two points.
+ *
+ * The precision is what would be expected from doing the computation using 16 bit floating
+ * point values.
+ *
+ * See also distance(), native_distance().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float __attribute__((const, overloadable))
+    fast_distance(float left_vector, float right_vector);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float __attribute__((const, overloadable))
+    fast_distance(float2 left_vector, float2 right_vector);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float __attribute__((const, overloadable))
+    fast_distance(float3 left_vector, float3 right_vector);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float __attribute__((const, overloadable))
+    fast_distance(float4 left_vector, float4 right_vector);
+#endif
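+
+/* Editor's sketch, not part of the generated documentation: a coarse radius
+ * test where the reduced (16 bit style) precision of fast_distance() is
+ * usually acceptable. */
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+static bool withinRadius(float3 point, float3 center, float radius) {
+    return fast_distance(point, center) <= radius;
+}
+#endif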
+
+/*
+ * fast_length: Approximate length of a vector
+ *
+ * Computes the approximate length of a vector.
+ *
+ * The precision is what would be expected from doing the computation using 16 bit floating
+ * point values.
+ *
+ * See also length(), native_length().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float __attribute__((const, overloadable))
+    fast_length(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float __attribute__((const, overloadable))
+    fast_length(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float __attribute__((const, overloadable))
+    fast_length(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float __attribute__((const, overloadable))
+    fast_length(float4 v);
+#endif
+
+/*
+ * fast_normalize: Approximate normalized vector
+ *
+ * Approximately normalizes a vector.
+ *
+ * For vectors of size 1, returns -1.f for negative values, 0.f for null values, and 1.f for
+ * positive values.
+ *
+ * The precision is what would be expected from doing the computation using 16 bit floating
+ * point values.
+ *
+ * See also normalize(), native_normalize().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float __attribute__((const, overloadable))
+    fast_normalize(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float2 __attribute__((const, overloadable))
+    fast_normalize(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float3 __attribute__((const, overloadable))
+    fast_normalize(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float4 __attribute__((const, overloadable))
+    fast_normalize(float4 v);
+#endif
+
+/*
+ * length: Length of a vector
+ *
+ * Computes the length of a vector.
+ *
+ * See also fast_length(), native_length().
+ */
+extern float __attribute__((const, overloadable))
+    length(float v);
+
+extern float __attribute__((const, overloadable))
+    length(float2 v);
+
+extern float __attribute__((const, overloadable))
+    length(float3 v);
+
+extern float __attribute__((const, overloadable))
+    length(float4 v);
+
+/*
+ * native_distance: Approximate distance between two points
+ *
+ * Computes the approximate distance between two points.
+ *
+ * See also distance(), fast_distance().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_distance(float left_vector, float right_vector);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_distance(float2 left_vector, float2 right_vector);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_distance(float3 left_vector, float3 right_vector);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_distance(float4 left_vector, float4 right_vector);
+#endif
+
+/*
+ * native_length: Approximate length of a vector
+ *
+ * Computes the approximate length of a vector.
+ *
+ * See also length(), fast_length().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_length(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_length(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_length(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_length(float4 v);
+#endif
+
+/*
+ * native_normalize: Approximately normalize a vector
+ *
+ * Approximately normalizes a vector.
+ *
+ * See also normalize(), fast_normalize().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_normalize(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_normalize(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_normalize(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_normalize(float4 v);
+#endif
+
+/*
+ * normalize: Normalize a vector
+ *
+ * Normalizes a vector.
+ *
+ * For vectors of size 1, returns -1.f for negative values, 0.f for null values, and 1.f for
+ * positive values.
+ *
+ * See also fast_normalize(), native_normalize().
+ */
+extern float __attribute__((const, overloadable))
+    normalize(float v);
+
+extern float2 __attribute__((const, overloadable))
+    normalize(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    normalize(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    normalize(float4 v);
+
+#endif // RENDERSCRIPT_RS_VECTOR_MATH_RSH
diff --git a/23.0.3/lib/libLLVM.so b/23.0.3/lib/libLLVM.so
new file mode 100755
index 0000000..c689caf
--- /dev/null
+++ b/23.0.3/lib/libLLVM.so
Binary files differ
diff --git a/23.0.3/lib/libbcc.so b/23.0.3/lib/libbcc.so
new file mode 100755
index 0000000..5474a58
--- /dev/null
+++ b/23.0.3/lib/libbcc.so
Binary files differ
diff --git a/23.0.3/lib/libbcinfo.so b/23.0.3/lib/libbcinfo.so
new file mode 100755
index 0000000..f99afcd
--- /dev/null
+++ b/23.0.3/lib/libbcinfo.so
Binary files differ
diff --git a/23.0.3/lib/libc++.so b/23.0.3/lib/libc++.so
new file mode 100755
index 0000000..565b4d9
--- /dev/null
+++ b/23.0.3/lib/libc++.so
Binary files differ
diff --git a/23.0.3/lib/libclang.so b/23.0.3/lib/libclang.so
new file mode 100755
index 0000000..a5990d1
--- /dev/null
+++ b/23.0.3/lib/libclang.so
Binary files differ
diff --git a/24.0.3/bin/llvm-rs-cc b/24.0.3/bin/llvm-rs-cc
new file mode 100755
index 0000000..ccd141a
--- /dev/null
+++ b/24.0.3/bin/llvm-rs-cc
Binary files differ
diff --git a/24.0.3/clang-include/CMakeLists.txt b/24.0.3/clang-include/CMakeLists.txt
new file mode 100644
index 0000000..9393f69
--- /dev/null
+++ b/24.0.3/clang-include/CMakeLists.txt
@@ -0,0 +1,115 @@
+set(files
+  adxintrin.h
+  altivec.h
+  ammintrin.h
+  arm_acle.h
+  avx2intrin.h
+  avx512bwintrin.h
+  avx512cdintrin.h
+  avx512erintrin.h
+  avx512fintrin.h
+  avx512vlbwintrin.h
+  avx512vlintrin.h
+  avx512dqintrin.h
+  avx512vldqintrin.h
+  avxintrin.h
+  bmi2intrin.h
+  bmiintrin.h
+  __clang_cuda_runtime_wrapper.h
+  cpuid.h
+  cuda_builtin_vars.h
+  emmintrin.h
+  f16cintrin.h
+  float.h
+  fma4intrin.h
+  fmaintrin.h
+  fxsrintrin.h
+  htmintrin.h
+  htmxlintrin.h
+  ia32intrin.h
+  immintrin.h
+  Intrin.h
+  inttypes.h
+  iso646.h
+  limits.h
+  lzcntintrin.h
+  mm3dnow.h
+  mmintrin.h
+  mm_malloc.h
+  module.modulemap
+  nmmintrin.h
+  pmmintrin.h
+  popcntintrin.h
+  prfchwintrin.h
+  rdseedintrin.h
+  rtmintrin.h
+  s390intrin.h
+  shaintrin.h
+  smmintrin.h
+  stdalign.h
+  stdarg.h
+  stdatomic.h
+  stdbool.h
+  stddef.h
+  __stddef_max_align_t.h
+  stdint.h
+  stdnoreturn.h
+  tbmintrin.h
+  tgmath.h
+  tmmintrin.h
+  unwind.h
+  vadefs.h
+  varargs.h
+  vecintrin.h
+  __wmmintrin_aes.h
+  wmmintrin.h
+  __wmmintrin_pclmul.h
+  x86intrin.h
+  xmmintrin.h
+  xopintrin.h
+  xsaveintrin.h
+  xsaveoptintrin.h
+  xsavecintrin.h
+  xsavesintrin.h
+  xtestintrin.h
+  )
+
+set(output_dir ${LLVM_LIBRARY_OUTPUT_INTDIR}/clang/${CLANG_VERSION}/include)
+
+# Generate arm_neon.h
+clang_tablegen(arm_neon.h -gen-arm-neon
+  SOURCE ${CLANG_SOURCE_DIR}/include/clang/Basic/arm_neon.td)
+
+set(out_files)
+foreach( f ${files} )
+  set( src ${CMAKE_CURRENT_SOURCE_DIR}/${f} )
+  set( dst ${output_dir}/${f} )
+  add_custom_command(OUTPUT ${dst}
+    DEPENDS ${src}
+    COMMAND ${CMAKE_COMMAND} -E copy_if_different ${src} ${dst}
+    COMMENT "Copying clang's ${f}...")
+  list(APPEND out_files ${dst})
+endforeach( f )
+
+add_custom_command(OUTPUT ${output_dir}/arm_neon.h 
+  DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/arm_neon.h
+  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_BINARY_DIR}/arm_neon.h ${output_dir}/arm_neon.h
+  COMMENT "Copying clang's arm_neon.h...")
+list(APPEND out_files ${output_dir}/arm_neon.h)
+
+add_custom_target(clang-headers ALL DEPENDS ${out_files})
+set_target_properties(clang-headers PROPERTIES FOLDER "Misc")
+
+install(
+  FILES ${files} ${CMAKE_CURRENT_BINARY_DIR}/arm_neon.h
+  COMPONENT clang-headers
+  PERMISSIONS OWNER_READ OWNER_WRITE GROUP_READ WORLD_READ
+  DESTINATION lib${LLVM_LIBDIR_SUFFIX}/clang/${CLANG_VERSION}/include)
+
+if (NOT CMAKE_CONFIGURATION_TYPES) # don't add this for IDE's.
+  add_custom_target(install-clang-headers
+    DEPENDS
+    COMMAND "${CMAKE_COMMAND}"
+            -DCMAKE_INSTALL_COMPONENT=clang-headers
+            -P "${CMAKE_BINARY_DIR}/cmake_install.cmake")
+endif()
diff --git a/24.0.3/clang-include/Intrin.h b/24.0.3/clang-include/Intrin.h
new file mode 100644
index 0000000..6c1d0d1
--- /dev/null
+++ b/24.0.3/clang-include/Intrin.h
@@ -0,0 +1,958 @@
+/* ===-------- Intrin.h ---------------------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/* Only include this if we're compiling for the windows platform. */
+#ifndef _MSC_VER
+#include_next <Intrin.h>
+#else
+
+#ifndef __INTRIN_H
+#define __INTRIN_H
+
+/* First include the standard intrinsics. */
+#if defined(__i386__) || defined(__x86_64__)
+#include <x86intrin.h>
+#endif
+
+/* For the definition of jmp_buf. */
+#if __STDC_HOSTED__
+#include <setjmp.h>
+#endif
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(__MMX__)
+/* And the random ones that aren't in those files. */
+__m64 _m_from_float(float);
+float _m_to_float(__m64);
+#endif
+
+/* Other assorted instruction intrinsics. */
+void __addfsbyte(unsigned long, unsigned char);
+void __addfsdword(unsigned long, unsigned long);
+void __addfsword(unsigned long, unsigned short);
+void __code_seg(const char *);
+static __inline__
+void __cpuid(int[4], int);
+static __inline__
+void __cpuidex(int[4], int, int);
+void __debugbreak(void);
+__int64 __emul(int, int);
+unsigned __int64 __emulu(unsigned int, unsigned int);
+void __cdecl __fastfail(unsigned int);
+unsigned int __getcallerseflags(void);
+static __inline__
+void __halt(void);
+unsigned char __inbyte(unsigned short);
+void __inbytestring(unsigned short, unsigned char *, unsigned long);
+void __incfsbyte(unsigned long);
+void __incfsdword(unsigned long);
+void __incfsword(unsigned long);
+unsigned long __indword(unsigned short);
+void __indwordstring(unsigned short, unsigned long *, unsigned long);
+void __int2c(void);
+void __invlpg(void *);
+unsigned short __inword(unsigned short);
+void __inwordstring(unsigned short, unsigned short *, unsigned long);
+void __lidt(void *);
+unsigned __int64 __ll_lshift(unsigned __int64, int);
+__int64 __ll_rshift(__int64, int);
+void __llwpcb(void *);
+unsigned char __lwpins32(unsigned int, unsigned int, unsigned int);
+void __lwpval32(unsigned int, unsigned int, unsigned int);
+unsigned int __lzcnt(unsigned int);
+unsigned short __lzcnt16(unsigned short);
+static __inline__
+void __movsb(unsigned char *, unsigned char const *, size_t);
+static __inline__
+void __movsd(unsigned long *, unsigned long const *, size_t);
+static __inline__
+void __movsw(unsigned short *, unsigned short const *, size_t);
+void __nop(void);
+void __nvreg_restore_fence(void);
+void __nvreg_save_fence(void);
+void __outbyte(unsigned short, unsigned char);
+void __outbytestring(unsigned short, unsigned char *, unsigned long);
+void __outdword(unsigned short, unsigned long);
+void __outdwordstring(unsigned short, unsigned long *, unsigned long);
+void __outword(unsigned short, unsigned short);
+void __outwordstring(unsigned short, unsigned short *, unsigned long);
+static __inline__
+unsigned int __popcnt(unsigned int);
+static __inline__
+unsigned short __popcnt16(unsigned short);
+unsigned long __readcr0(void);
+unsigned long __readcr2(void);
+static __inline__
+unsigned long __readcr3(void);
+unsigned long __readcr4(void);
+unsigned long __readcr8(void);
+unsigned int __readdr(unsigned int);
+#ifdef __i386__
+static __inline__
+unsigned char __readfsbyte(unsigned long);
+static __inline__
+unsigned long __readfsdword(unsigned long);
+static __inline__
+unsigned __int64 __readfsqword(unsigned long);
+static __inline__
+unsigned short __readfsword(unsigned long);
+#endif
+static __inline__
+unsigned __int64 __readmsr(unsigned long);
+unsigned __int64 __readpmc(unsigned long);
+unsigned long __segmentlimit(unsigned long);
+void __sidt(void *);
+void *__slwpcb(void);
+static __inline__
+void __stosb(unsigned char *, unsigned char, size_t);
+static __inline__
+void __stosd(unsigned long *, unsigned long, size_t);
+static __inline__
+void __stosw(unsigned short *, unsigned short, size_t);
+void __svm_clgi(void);
+void __svm_invlpga(void *, int);
+void __svm_skinit(int);
+void __svm_stgi(void);
+void __svm_vmload(size_t);
+void __svm_vmrun(size_t);
+void __svm_vmsave(size_t);
+void __ud2(void);
+unsigned __int64 __ull_rshift(unsigned __int64, int);
+void __vmx_off(void);
+void __vmx_vmptrst(unsigned __int64 *);
+void __wbinvd(void);
+void __writecr0(unsigned int);
+static __inline__
+void __writecr3(unsigned int);
+void __writecr4(unsigned int);
+void __writecr8(unsigned int);
+void __writedr(unsigned int, unsigned int);
+void __writefsbyte(unsigned long, unsigned char);
+void __writefsdword(unsigned long, unsigned long);
+void __writefsqword(unsigned long, unsigned __int64);
+void __writefsword(unsigned long, unsigned short);
+void __writemsr(unsigned long, unsigned __int64);
+static __inline__
+void *_AddressOfReturnAddress(void);
+static __inline__
+unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask);
+static __inline__
+unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask);
+static __inline__
+unsigned char _bittest(long const *, long);
+static __inline__
+unsigned char _bittestandcomplement(long *, long);
+static __inline__
+unsigned char _bittestandreset(long *, long);
+static __inline__
+unsigned char _bittestandset(long *, long);
+unsigned __int64 __cdecl _byteswap_uint64(unsigned __int64);
+unsigned long __cdecl _byteswap_ulong(unsigned long);
+unsigned short __cdecl _byteswap_ushort(unsigned short);
+void __cdecl _disable(void);
+void __cdecl _enable(void);
+long _InterlockedAddLargeStatistic(__int64 volatile *_Addend, long _Value);
+static __inline__
+long _InterlockedAnd(long volatile *_Value, long _Mask);
+static __inline__
+short _InterlockedAnd16(short volatile *_Value, short _Mask);
+static __inline__
+char _InterlockedAnd8(char volatile *_Value, char _Mask);
+unsigned char _interlockedbittestandreset(long volatile *, long);
+static __inline__
+unsigned char _interlockedbittestandset(long volatile *, long);
+static __inline__
+long __cdecl _InterlockedCompareExchange(long volatile *_Destination,
+                                         long _Exchange, long _Comparand);
+long _InterlockedCompareExchange_HLEAcquire(long volatile *, long, long);
+long _InterlockedCompareExchange_HLERelease(long volatile *, long, long);
+static __inline__
+short _InterlockedCompareExchange16(short volatile *_Destination,
+                                    short _Exchange, short _Comparand);
+static __inline__
+__int64 _InterlockedCompareExchange64(__int64 volatile *_Destination,
+                                      __int64 _Exchange, __int64 _Comparand);
+__int64 _InterlockedCompareExchange64_HLEAcquire(__int64 volatile *, __int64,
+                                                 __int64);
+__int64 _InterlockedCompareExchange64_HLERelease(__int64 volatile *, __int64,
+                                                 __int64);
+static __inline__
+char _InterlockedCompareExchange8(char volatile *_Destination, char _Exchange,
+                                  char _Comparand);
+void *_InterlockedCompareExchangePointer_HLEAcquire(void *volatile *, void *,
+                                                    void *);
+void *_InterlockedCompareExchangePointer_HLERelease(void *volatile *, void *,
+                                                    void *);
+static __inline__
+long __cdecl _InterlockedDecrement(long volatile *_Addend);
+static __inline__
+short _InterlockedDecrement16(short volatile *_Addend);
+long _InterlockedExchange(long volatile *_Target, long _Value);
+static __inline__
+short _InterlockedExchange16(short volatile *_Target, short _Value);
+static __inline__
+char _InterlockedExchange8(char volatile *_Target, char _Value);
+static __inline__
+long __cdecl _InterlockedExchangeAdd(long volatile *_Addend, long _Value);
+long _InterlockedExchangeAdd_HLEAcquire(long volatile *, long);
+long _InterlockedExchangeAdd_HLERelease(long volatile *, long);
+static __inline__
+short _InterlockedExchangeAdd16(short volatile *_Addend, short _Value);
+__int64 _InterlockedExchangeAdd64_HLEAcquire(__int64 volatile *, __int64);
+__int64 _InterlockedExchangeAdd64_HLERelease(__int64 volatile *, __int64);
+static __inline__
+char _InterlockedExchangeAdd8(char volatile *_Addend, char _Value);
+static __inline__
+long __cdecl _InterlockedIncrement(long volatile *_Addend);
+static __inline__
+short _InterlockedIncrement16(short volatile *_Addend);
+static __inline__
+long _InterlockedOr(long volatile *_Value, long _Mask);
+static __inline__
+short _InterlockedOr16(short volatile *_Value, short _Mask);
+static __inline__
+char _InterlockedOr8(char volatile *_Value, char _Mask);
+static __inline__
+long _InterlockedXor(long volatile *_Value, long _Mask);
+static __inline__
+short _InterlockedXor16(short volatile *_Value, short _Mask);
+static __inline__
+char _InterlockedXor8(char volatile *_Value, char _Mask);
+void __cdecl _invpcid(unsigned int, void *);
+static __inline__
+unsigned long __cdecl _lrotl(unsigned long, int);
+static __inline__
+unsigned long __cdecl _lrotr(unsigned long, int);
+static __inline__
+void _ReadBarrier(void);
+static __inline__
+void _ReadWriteBarrier(void);
+static __inline__
+void *_ReturnAddress(void);
+unsigned int _rorx_u32(unsigned int, const unsigned int);
+static __inline__
+unsigned int __cdecl _rotl(unsigned int _Value, int _Shift);
+static __inline__
+unsigned short _rotl16(unsigned short _Value, unsigned char _Shift);
+static __inline__
+unsigned __int64 __cdecl _rotl64(unsigned __int64 _Value, int _Shift);
+static __inline__
+unsigned char _rotl8(unsigned char _Value, unsigned char _Shift);
+static __inline__
+unsigned int __cdecl _rotr(unsigned int _Value, int _Shift);
+static __inline__
+unsigned short _rotr16(unsigned short _Value, unsigned char _Shift);
+static __inline__
+unsigned __int64 __cdecl _rotr64(unsigned __int64 _Value, int _Shift);
+static __inline__
+unsigned char _rotr8(unsigned char _Value, unsigned char _Shift);
+int _sarx_i32(int, unsigned int);
+#if __STDC_HOSTED__
+int __cdecl _setjmp(jmp_buf);
+#endif
+unsigned int _shlx_u32(unsigned int, unsigned int);
+unsigned int _shrx_u32(unsigned int, unsigned int);
+void _Store_HLERelease(long volatile *, long);
+void _Store64_HLERelease(__int64 volatile *, __int64);
+void _StorePointer_HLERelease(void *volatile *, void *);
+static __inline__
+void _WriteBarrier(void);
+unsigned __int32 _xbegin(void);
+void _xend(void);
+#define _XCR_XFEATURE_ENABLED_MASK 0
+static __inline__
+unsigned __int64 __cdecl _xgetbv(unsigned int);
+void __cdecl _xsetbv(unsigned int, unsigned __int64);
+
+/* These additional intrinsics are turned on in x64/amd64/x86_64 mode. */
+#ifdef __x86_64__
+void __addgsbyte(unsigned long, unsigned char);
+void __addgsdword(unsigned long, unsigned long);
+void __addgsqword(unsigned long, unsigned __int64);
+void __addgsword(unsigned long, unsigned short);
+static __inline__
+void __faststorefence(void);
+void __incgsbyte(unsigned long);
+void __incgsdword(unsigned long);
+void __incgsqword(unsigned long);
+void __incgsword(unsigned long);
+unsigned char __lwpins64(unsigned __int64, unsigned int, unsigned int);
+void __lwpval64(unsigned __int64, unsigned int, unsigned int);
+unsigned __int64 __lzcnt64(unsigned __int64);
+static __inline__
+void __movsq(unsigned long long *, unsigned long long const *, size_t);
+__int64 __mulh(__int64, __int64);
+static __inline__
+unsigned __int64 __popcnt64(unsigned __int64);
+static __inline__
+unsigned char __readgsbyte(unsigned long);
+static __inline__
+unsigned long __readgsdword(unsigned long);
+static __inline__
+unsigned __int64 __readgsqword(unsigned long);
+unsigned short __readgsword(unsigned long);
+unsigned __int64 __shiftleft128(unsigned __int64 _LowPart,
+                                unsigned __int64 _HighPart,
+                                unsigned char _Shift);
+unsigned __int64 __shiftright128(unsigned __int64 _LowPart,
+                                 unsigned __int64 _HighPart,
+                                 unsigned char _Shift);
+static __inline__
+void __stosq(unsigned __int64 *, unsigned __int64, size_t);
+unsigned char __vmx_on(unsigned __int64 *);
+unsigned char __vmx_vmclear(unsigned __int64 *);
+unsigned char __vmx_vmlaunch(void);
+unsigned char __vmx_vmptrld(unsigned __int64 *);
+unsigned char __vmx_vmread(size_t, size_t *);
+unsigned char __vmx_vmresume(void);
+unsigned char __vmx_vmwrite(size_t, size_t);
+void __writegsbyte(unsigned long, unsigned char);
+void __writegsdword(unsigned long, unsigned long);
+void __writegsqword(unsigned long, unsigned __int64);
+void __writegsword(unsigned long, unsigned short);
+static __inline__
+unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask);
+static __inline__
+unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask);
+static __inline__
+unsigned char _bittest64(__int64 const *, __int64);
+static __inline__
+unsigned char _bittestandcomplement64(__int64 *, __int64);
+static __inline__
+unsigned char _bittestandreset64(__int64 *, __int64);
+static __inline__
+unsigned char _bittestandset64(__int64 *, __int64);
+unsigned __int64 __cdecl _byteswap_uint64(unsigned __int64);
+long _InterlockedAnd_np(long volatile *_Value, long _Mask);
+short _InterlockedAnd16_np(short volatile *_Value, short _Mask);
+__int64 _InterlockedAnd64_np(__int64 volatile *_Value, __int64 _Mask);
+char _InterlockedAnd8_np(char volatile *_Value, char _Mask);
+unsigned char _interlockedbittestandreset64(__int64 volatile *, __int64);
+static __inline__
+unsigned char _interlockedbittestandset64(__int64 volatile *, __int64);
+long _InterlockedCompareExchange_np(long volatile *_Destination, long _Exchange,
+                                    long _Comparand);
+unsigned char _InterlockedCompareExchange128(__int64 volatile *_Destination,
+                                             __int64 _ExchangeHigh,
+                                             __int64 _ExchangeLow,
+                                             __int64 *_CompareandResult);
+unsigned char _InterlockedCompareExchange128_np(__int64 volatile *_Destination,
+                                                __int64 _ExchangeHigh,
+                                                __int64 _ExchangeLow,
+                                                __int64 *_ComparandResult);
+short _InterlockedCompareExchange16_np(short volatile *_Destination,
+                                       short _Exchange, short _Comparand);
+__int64 _InterlockedCompareExchange64_HLEAcquire(__int64 volatile *, __int64,
+                                                 __int64);
+__int64 _InterlockedCompareExchange64_HLERelease(__int64 volatile *, __int64,
+                                                 __int64);
+__int64 _InterlockedCompareExchange64_np(__int64 volatile *_Destination,
+                                         __int64 _Exchange, __int64 _Comparand);
+void *_InterlockedCompareExchangePointer(void *volatile *_Destination,
+                                         void *_Exchange, void *_Comparand);
+void *_InterlockedCompareExchangePointer_np(void *volatile *_Destination,
+                                            void *_Exchange, void *_Comparand);
+static __inline__
+__int64 _InterlockedDecrement64(__int64 volatile *_Addend);
+static __inline__
+__int64 _InterlockedExchange64(__int64 volatile *_Target, __int64 _Value);
+static __inline__
+__int64 _InterlockedExchangeAdd64(__int64 volatile *_Addend, __int64 _Value);
+void *_InterlockedExchangePointer(void *volatile *_Target, void *_Value);
+static __inline__
+__int64 _InterlockedIncrement64(__int64 volatile *_Addend);
+long _InterlockedOr_np(long volatile *_Value, long _Mask);
+short _InterlockedOr16_np(short volatile *_Value, short _Mask);
+static __inline__
+__int64 _InterlockedOr64(__int64 volatile *_Value, __int64 _Mask);
+__int64 _InterlockedOr64_np(__int64 volatile *_Value, __int64 _Mask);
+char _InterlockedOr8_np(char volatile *_Value, char _Mask);
+long _InterlockedXor_np(long volatile *_Value, long _Mask);
+short _InterlockedXor16_np(short volatile *_Value, short _Mask);
+static __inline__
+__int64 _InterlockedXor64(__int64 volatile *_Value, __int64 _Mask);
+__int64 _InterlockedXor64_np(__int64 volatile *_Value, __int64 _Mask);
+char _InterlockedXor8_np(char volatile *_Value, char _Mask);
+static __inline__
+__int64 _mul128(__int64 _Multiplier, __int64 _Multiplicand,
+                __int64 *_HighProduct);
+unsigned __int64 _rorx_u64(unsigned __int64, const unsigned int);
+__int64 _sarx_i64(__int64, unsigned int);
+#if __STDC_HOSTED__
+int __cdecl _setjmpex(jmp_buf);
+#endif
+unsigned __int64 _shlx_u64(unsigned __int64, unsigned int);
+unsigned __int64 _shrx_u64(unsigned __int64, unsigned int);
+/*
+ * Multiply two 64-bit integers and obtain a 64-bit result.
+ * The low-half is returned directly and the high half is in an out parameter.
+ */
+static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS
+_umul128(unsigned __int64 _Multiplier, unsigned __int64 _Multiplicand,
+         unsigned __int64 *_HighProduct) {
+  unsigned __int128 _FullProduct =
+      (unsigned __int128)_Multiplier * (unsigned __int128)_Multiplicand;
+  *_HighProduct = _FullProduct >> 64;
+  return _FullProduct;
+}
+static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS
+__umulh(unsigned __int64 _Multiplier, unsigned __int64 _Multiplicand) {
+  unsigned __int128 _FullProduct =
+      (unsigned __int128)_Multiplier * (unsigned __int128)_Multiplicand;
+  return _FullProduct >> 64;
+}
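+/* Illustrative example: the low half of the product is the return value and
+ * the high half comes back through the out parameter, e.g.
+ *   unsigned __int64 __hi;
+ *   unsigned __int64 __lo = _umul128(0xFFFFFFFFFFFFFFFFULL, 2, &__hi);
+ *   // __lo == 0xFFFFFFFFFFFFFFFEULL, __hi == 1
+ *   // __umulh(0xFFFFFFFFFFFFFFFFULL, 2) likewise yields 1
+ */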
+
+#endif /* __x86_64__ */
+
+/*----------------------------------------------------------------------------*\
+|* Multiplication
+\*----------------------------------------------------------------------------*/
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+__emul(int __in1, int __in2) {
+  return (__int64)__in1 * (__int64)__in2;
+}
+static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS
+__emulu(unsigned int __in1, unsigned int __in2) {
+  return (unsigned __int64)__in1 * (unsigned __int64)__in2;
+}
+/*----------------------------------------------------------------------------*\
+|* Bit Twiddling
+\*----------------------------------------------------------------------------*/
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_rotl8(unsigned char _Value, unsigned char _Shift) {
+  _Shift &= 0x7;
+  return _Shift ? (_Value << _Shift) | (_Value >> (8 - _Shift)) : _Value;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_rotr8(unsigned char _Value, unsigned char _Shift) {
+  _Shift &= 0x7;
+  return _Shift ? (_Value >> _Shift) | (_Value << (8 - _Shift)) : _Value;
+}
+static __inline__ unsigned short __DEFAULT_FN_ATTRS
+_rotl16(unsigned short _Value, unsigned char _Shift) {
+  _Shift &= 0xf;
+  return _Shift ? (_Value << _Shift) | (_Value >> (16 - _Shift)) : _Value;
+}
+static __inline__ unsigned short __DEFAULT_FN_ATTRS
+_rotr16(unsigned short _Value, unsigned char _Shift) {
+  _Shift &= 0xf;
+  return _Shift ? (_Value >> _Shift) | (_Value << (16 - _Shift)) : _Value;
+}
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_rotl(unsigned int _Value, int _Shift) {
+  _Shift &= 0x1f;
+  return _Shift ? (_Value << _Shift) | (_Value >> (32 - _Shift)) : _Value;
+}
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_rotr(unsigned int _Value, int _Shift) {
+  _Shift &= 0x1f;
+  return _Shift ? (_Value >> _Shift) | (_Value << (32 - _Shift)) : _Value;
+}
+static __inline__ unsigned long __DEFAULT_FN_ATTRS
+_lrotl(unsigned long _Value, int _Shift) {
+  _Shift &= 0x1f;
+  return _Shift ? (_Value << _Shift) | (_Value >> (32 - _Shift)) : _Value;
+}
+static __inline__ unsigned long __DEFAULT_FN_ATTRS
+_lrotr(unsigned long _Value, int _Shift) {
+  _Shift &= 0x1f;
+  return _Shift ? (_Value >> _Shift) | (_Value << (32 - _Shift)) : _Value;
+}
+static
+__inline__ unsigned __int64 __DEFAULT_FN_ATTRS
+_rotl64(unsigned __int64 _Value, int _Shift) {
+  _Shift &= 0x3f;
+  return _Shift ? (_Value << _Shift) | (_Value >> (64 - _Shift)) : _Value;
+}
+static
+__inline__ unsigned __int64 __DEFAULT_FN_ATTRS
+_rotr64(unsigned __int64 _Value, int _Shift) {
+  _Shift &= 0x3f;
+  return _Shift ? (_Value >> _Shift) | (_Value << (64 - _Shift)) : _Value;
+}
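+/* Illustrative examples: _rotl8(0x81, 1) == 0x03 and _rotr16(0x0001, 4) ==
+ * 0x1000; the shift count is masked to the operand width first, so a full
+ * rotation such as _rotl(__x, 32) returns __x unchanged. */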
+/*----------------------------------------------------------------------------*\
+|* Bit Counting and Testing
+\*----------------------------------------------------------------------------*/
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_BitScanForward(unsigned long *_Index, unsigned long _Mask) {
+  if (!_Mask)
+    return 0;
+  *_Index = __builtin_ctzl(_Mask);
+  return 1;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_BitScanReverse(unsigned long *_Index, unsigned long _Mask) {
+  if (!_Mask)
+    return 0;
+  *_Index = 31 - __builtin_clzl(_Mask);
+  return 1;
+}
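+/* Illustrative example: for _Mask == 0x18 (bits 3 and 4 set),
+ * _BitScanForward stores 3 and _BitScanReverse stores 4 into *_Index and
+ * both return 1; for _Mask == 0 both return 0 and *_Index is left untouched. */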
+static __inline__ unsigned short __DEFAULT_FN_ATTRS
+__popcnt16(unsigned short _Value) {
+  return __builtin_popcount((int)_Value);
+}
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__popcnt(unsigned int _Value) {
+  return __builtin_popcount(_Value);
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_bittest(long const *_BitBase, long _BitPos) {
+  return (*_BitBase >> _BitPos) & 1;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_bittestandcomplement(long *_BitBase, long _BitPos) {
+  unsigned char _Res = (*_BitBase >> _BitPos) & 1;
+  *_BitBase = *_BitBase ^ (1 << _BitPos);
+  return _Res;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_bittestandreset(long *_BitBase, long _BitPos) {
+  unsigned char _Res = (*_BitBase >> _BitPos) & 1;
+  *_BitBase = *_BitBase & ~(1 << _BitPos);
+  return _Res;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_bittestandset(long *_BitBase, long _BitPos) {
+  unsigned char _Res = (*_BitBase >> _BitPos) & 1;
+  *_BitBase = *_BitBase | (1 << _BitPos);
+  return _Res;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_interlockedbittestandset(long volatile *_BitBase, long _BitPos) {
+  long _PrevVal = __atomic_fetch_or(_BitBase, 1l << _BitPos, __ATOMIC_SEQ_CST);
+  return (_PrevVal >> _BitPos) & 1;
+}
+#ifdef __x86_64__
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask) {
+  if (!_Mask)
+    return 0;
+  *_Index = __builtin_ctzll(_Mask);
+  return 1;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask) {
+  if (!_Mask)
+    return 0;
+  *_Index = 63 - __builtin_clzll(_Mask);
+  return 1;
+}
+static __inline__
+unsigned __int64 __DEFAULT_FN_ATTRS
+__popcnt64(unsigned __int64 _Value) {
+  return __builtin_popcountll(_Value);
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_bittest64(__int64 const *_BitBase, __int64 _BitPos) {
+  return (*_BitBase >> _BitPos) & 1;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_bittestandcomplement64(__int64 *_BitBase, __int64 _BitPos) {
+  unsigned char _Res = (*_BitBase >> _BitPos) & 1;
+  *_BitBase = *_BitBase ^ (1ll << _BitPos);
+  return _Res;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_bittestandreset64(__int64 *_BitBase, __int64 _BitPos) {
+  unsigned char _Res = (*_BitBase >> _BitPos) & 1;
+  *_BitBase = *_BitBase & ~(1ll << _BitPos);
+  return _Res;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_bittestandset64(__int64 *_BitBase, __int64 _BitPos) {
+  unsigned char _Res = (*_BitBase >> _BitPos) & 1;
+  *_BitBase = *_BitBase | (1ll << _BitPos);
+  return _Res;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_interlockedbittestandset64(__int64 volatile *_BitBase, __int64 _BitPos) {
+  long long _PrevVal =
+      __atomic_fetch_or(_BitBase, 1ll << _BitPos, __ATOMIC_SEQ_CST);
+  return (_PrevVal >> _BitPos) & 1;
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Exchange Add
+\*----------------------------------------------------------------------------*/
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd8(char volatile *_Addend, char _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_SEQ_CST);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd16(short volatile *_Addend, short _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_SEQ_CST);
+}
+#ifdef __x86_64__
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd64(__int64 volatile *_Addend, __int64 _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_SEQ_CST);
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Exchange Sub
+\*----------------------------------------------------------------------------*/
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedExchangeSub8(char volatile *_Subend, char _Value) {
+  return __atomic_fetch_sub(_Subend, _Value, __ATOMIC_SEQ_CST);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedExchangeSub16(short volatile *_Subend, short _Value) {
+  return __atomic_fetch_sub(_Subend, _Value, __ATOMIC_SEQ_CST);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedExchangeSub(long volatile *_Subend, long _Value) {
+  return __atomic_fetch_sub(_Subend, _Value, __ATOMIC_SEQ_CST);
+}
+#ifdef __x86_64__
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedExchangeSub64(__int64 volatile *_Subend, __int64 _Value) {
+  return __atomic_fetch_sub(_Subend, _Value, __ATOMIC_SEQ_CST);
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Increment
+\*----------------------------------------------------------------------------*/
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedIncrement16(short volatile *_Value) {
+  return __atomic_add_fetch(_Value, 1, __ATOMIC_SEQ_CST);
+}
+#ifdef __x86_64__
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedIncrement64(__int64 volatile *_Value) {
+  return __atomic_add_fetch(_Value, 1, __ATOMIC_SEQ_CST);
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Decrement
+\*----------------------------------------------------------------------------*/
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedDecrement16(short volatile *_Value) {
+  return __atomic_sub_fetch(_Value, 1, __ATOMIC_SEQ_CST);
+}
+#ifdef __x86_64__
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedDecrement64(__int64 volatile *_Value) {
+  return __atomic_sub_fetch(_Value, 1, __ATOMIC_SEQ_CST);
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked And
+\*----------------------------------------------------------------------------*/
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedAnd8(char volatile *_Value, char _Mask) {
+  return __atomic_and_fetch(_Value, _Mask, __ATOMIC_SEQ_CST);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedAnd16(short volatile *_Value, short _Mask) {
+  return __atomic_and_fetch(_Value, _Mask, __ATOMIC_SEQ_CST);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedAnd(long volatile *_Value, long _Mask) {
+  return __atomic_and_fetch(_Value, _Mask, __ATOMIC_SEQ_CST);
+}
+#ifdef __x86_64__
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedAnd64(__int64 volatile *_Value, __int64 _Mask) {
+  return __atomic_and_fetch(_Value, _Mask, __ATOMIC_SEQ_CST);
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Or
+\*----------------------------------------------------------------------------*/
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedOr8(char volatile *_Value, char _Mask) {
+  return __atomic_or_fetch(_Value, _Mask, __ATOMIC_SEQ_CST);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedOr16(short volatile *_Value, short _Mask) {
+  return __atomic_or_fetch(_Value, _Mask, __ATOMIC_SEQ_CST);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedOr(long volatile *_Value, long _Mask) {
+  return __atomic_or_fetch(_Value, _Mask, __ATOMIC_SEQ_CST);
+}
+#ifdef __x86_64__
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedOr64(__int64 volatile *_Value, __int64 _Mask) {
+  return __atomic_or_fetch(_Value, _Mask, __ATOMIC_SEQ_CST);
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Xor
+\*----------------------------------------------------------------------------*/
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedXor8(char volatile *_Value, char _Mask) {
+  return __atomic_xor_fetch(_Value, _Mask, __ATOMIC_SEQ_CST);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedXor16(short volatile *_Value, short _Mask) {
+  return __atomic_xor_fetch(_Value, _Mask, __ATOMIC_SEQ_CST);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedXor(long volatile *_Value, long _Mask) {
+  return __atomic_xor_fetch(_Value, _Mask, __ATOMIC_SEQ_CST);
+}
+#ifdef __x86_64__
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedXor64(__int64 volatile *_Value, __int64 _Mask) {
+  return __atomic_xor_fetch(_Value, _Mask, __ATOMIC_SEQ_CST);
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Exchange
+\*----------------------------------------------------------------------------*/
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedExchange8(char volatile *_Target, char _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_SEQ_CST);
+  return _Value;
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedExchange16(short volatile *_Target, short _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_SEQ_CST);
+  return _Value;
+}
+#ifdef __x86_64__
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedExchange64(__int64 volatile *_Target, __int64 _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_SEQ_CST);
+  return _Value;
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Compare Exchange
+\*----------------------------------------------------------------------------*/
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange8(char volatile *_Destination,
+                             char _Exchange, char _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
+  return _Comparand;
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange16(short volatile *_Destination,
+                              short _Exchange, short _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
+  return _Comparand;
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange64(__int64 volatile *_Destination,
+                              __int64 _Exchange, __int64 _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
+  return _Comparand;
+}
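+/* Illustrative example: with *_Destination initially 5,
+ *   _InterlockedCompareExchange8(_Destination, 9, 5) stores 9 and returns 5;
+ *   a subsequent _InterlockedCompareExchange8(_Destination, 7, 5) returns 9
+ *   and leaves the value unchanged, since the comparand no longer matches. */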
+/*----------------------------------------------------------------------------*\
+|* Barriers
+\*----------------------------------------------------------------------------*/
+static __inline__ void __DEFAULT_FN_ATTRS
+__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead")))
+_ReadWriteBarrier(void) {
+  __atomic_signal_fence(__ATOMIC_SEQ_CST);
+}
+static __inline__ void __DEFAULT_FN_ATTRS
+__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead")))
+_ReadBarrier(void) {
+  __atomic_signal_fence(__ATOMIC_SEQ_CST);
+}
+static __inline__ void __DEFAULT_FN_ATTRS
+__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead")))
+_WriteBarrier(void) {
+  __atomic_signal_fence(__ATOMIC_SEQ_CST);
+}
+#ifdef __x86_64__
+static __inline__ void __DEFAULT_FN_ATTRS
+__faststorefence(void) {
+  __atomic_thread_fence(__ATOMIC_SEQ_CST);
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* readfs, readgs
+|* (Pointers in address space #256 and #257 are relative to the GS and FS
+|* segment registers, respectively.)
+\*----------------------------------------------------------------------------*/
+#define __ptr_to_addr_space(__addr_space_nbr, __type, __offset)              \
+    ((volatile __type __attribute__((__address_space__(__addr_space_nbr)))*) \
+    (__offset))
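+
+/* Illustrative expansion: __ptr_to_addr_space(257, unsigned long, 0x18)
+ * becomes ((volatile unsigned long
+ * __attribute__((__address_space__(257)))*)(0x18)), i.e. an FS-relative
+ * pointer to offset 0x18 when compiling for i386. */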
+
+#ifdef __i386__
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+__readfsbyte(unsigned long __offset) {
+  return *__ptr_to_addr_space(257, unsigned char, __offset);
+}
+static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS
+__readfsqword(unsigned long __offset) {
+  return *__ptr_to_addr_space(257, unsigned __int64, __offset);
+}
+static __inline__ unsigned short __DEFAULT_FN_ATTRS
+__readfsword(unsigned long __offset) {
+  return *__ptr_to_addr_space(257, unsigned short, __offset);
+}
+#endif
+#ifdef __x86_64__
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+__readgsbyte(unsigned long __offset) {
+  return *__ptr_to_addr_space(256, unsigned char, __offset);
+}
+static __inline__ unsigned long __DEFAULT_FN_ATTRS
+__readgsdword(unsigned long __offset) {
+  return *__ptr_to_addr_space(256, unsigned long, __offset);
+}
+static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS
+__readgsqword(unsigned long __offset) {
+  return *__ptr_to_addr_space(256, unsigned __int64, __offset);
+}
+static __inline__ unsigned short __DEFAULT_FN_ATTRS
+__readgsword(unsigned long __offset) {
+  return *__ptr_to_addr_space(256, unsigned short, __offset);
+}
+#endif
+#undef __ptr_to_addr_space
+/*----------------------------------------------------------------------------*\
+|* movs, stos
+\*----------------------------------------------------------------------------*/
+#if defined(__i386__) || defined(__x86_64__)
+static __inline__ void __DEFAULT_FN_ATTRS
+__movsb(unsigned char *__dst, unsigned char const *__src, size_t __n) {
+  __asm__("rep movsb" : : "D"(__dst), "S"(__src), "c"(__n)
+                        : "%edi", "%esi", "%ecx");
+}
+static __inline__ void __DEFAULT_FN_ATTRS
+__movsd(unsigned long *__dst, unsigned long const *__src, size_t __n) {
+  __asm__("rep movsl" : : "D"(__dst), "S"(__src), "c"(__n)
+                        : "%edi", "%esi", "%ecx");
+}
+static __inline__ void __DEFAULT_FN_ATTRS
+__movsw(unsigned short *__dst, unsigned short const *__src, size_t __n) {
+  __asm__("rep movsw" : : "D"(__dst), "S"(__src), "c"(__n)
+                        : "%edi", "%esi", "%ecx");
+}
+static __inline__ void __DEFAULT_FN_ATTRS
+__stosb(unsigned char *__dst, unsigned char __x, size_t __n) {
+  __asm__("rep stosb" : : "D"(__dst), "a"(__x), "c"(__n)
+                        : "%edi", "%ecx");
+}
+static __inline__ void __DEFAULT_FN_ATTRS
+__stosd(unsigned long *__dst, unsigned long __x, size_t __n) {
+  __asm__("rep stosl" : : "D"(__dst), "a"(__x), "c"(__n)
+                        : "%edi", "%ecx");
+}
+static __inline__ void __DEFAULT_FN_ATTRS
+__stosw(unsigned short *__dst, unsigned short __x, size_t __n) {
+  __asm__("rep stosw" : : "D"(__dst), "a"(__x), "c"(__n)
+                        : "%edi", "%ecx");
+}
+#endif
+#ifdef __x86_64__
+static __inline__ void __DEFAULT_FN_ATTRS
+__movsq(unsigned long long *__dst, unsigned long long const *__src, size_t __n) {
+  __asm__("rep movsq" : : "D"(__dst), "S"(__src), "c"(__n)
+                        : "%edi", "%esi", "%ecx");
+}
+static __inline__ void __DEFAULT_FN_ATTRS
+__stosq(unsigned __int64 *__dst, unsigned __int64 __x, size_t __n) {
+  __asm__("rep stosq" : : "D"(__dst), "a"(__x), "c"(__n)
+                        : "%edi", "%ecx");
+}
+#endif
+
+/*----------------------------------------------------------------------------*\
+|* Misc
+\*----------------------------------------------------------------------------*/
+static __inline__ void * __DEFAULT_FN_ATTRS
+_AddressOfReturnAddress(void) {
+  return (void*)((char*)__builtin_frame_address(0) + sizeof(void*));
+}
+static __inline__ void * __DEFAULT_FN_ATTRS
+_ReturnAddress(void) {
+  return __builtin_return_address(0);
+}
+#if defined(__i386__) || defined(__x86_64__)
+static __inline__ void __DEFAULT_FN_ATTRS
+__cpuid(int __info[4], int __level) {
+  __asm__ ("cpuid" : "=a"(__info[0]), "=b" (__info[1]), "=c"(__info[2]), "=d"(__info[3])
+                   : "a"(__level));
+}
+static __inline__ void __DEFAULT_FN_ATTRS
+__cpuidex(int __info[4], int __level, int __ecx) {
+  __asm__ ("cpuid" : "=a"(__info[0]), "=b" (__info[1]), "=c"(__info[2]), "=d"(__info[3])
+                   : "a"(__level), "c"(__ecx));
+}
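+/* Illustrative example:
+ *   int __info[4];
+ *   __cpuid(__info, 0);      // __info[0] = highest standard leaf;
+ *                            // __info[1], __info[3], __info[2] spell the
+ *                            // vendor identification string
+ *   __cpuidex(__info, 7, 0); // leaf 7, subleaf 0: extended feature flags
+ */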
+static __inline__ unsigned __int64 __cdecl __DEFAULT_FN_ATTRS
+_xgetbv(unsigned int __xcr_no) {
+  unsigned int __eax, __edx;
+  __asm__ ("xgetbv" : "=a" (__eax), "=d" (__edx) : "c" (__xcr_no));
+  return ((unsigned __int64)__edx << 32) | __eax;
+}
+static __inline__ void __DEFAULT_FN_ATTRS
+__halt(void) {
+  __asm__ volatile ("hlt");
+}
+#endif
+
+/*----------------------------------------------------------------------------*\
+|* Privileged intrinsics
+\*----------------------------------------------------------------------------*/
+#if defined(__i386__) || defined(__x86_64__)
+static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS
+__readmsr(unsigned long __register) {
+  // Loads the contents of a 64-bit model specific register (MSR) specified in
+  // the ECX register into registers EDX:EAX. The EDX register is loaded with
+  // the high-order 32 bits of the MSR and the EAX register is loaded with the
+  // low-order 32 bits. If less than 64 bits are implemented in the MSR being
+  // read, the values returned to EDX:EAX in unimplemented bit locations are
+  // undefined.
+  unsigned long __edx;
+  unsigned long __eax;
+  __asm__ ("rdmsr" : "=d"(__edx), "=a"(__eax) : "c"(__register));
+  return (((unsigned __int64)__edx) << 32) | (unsigned __int64)__eax;
+}
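+/* Illustrative example (requires ring 0):
+ *   unsigned __int64 __tsc = __readmsr(0x10);  // IA32_TIME_STAMP_COUNTER
+ */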
+
+static __inline__ unsigned long __DEFAULT_FN_ATTRS
+__readcr3(void) {
+  unsigned long __cr3_val;
+  __asm__ __volatile__ ("mov %%cr3, %0" : "=q"(__cr3_val) : : "memory");
+  return __cr3_val;
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+__writecr3(unsigned int __cr3_val) {
+  __asm__ ("mov %0, %%cr3" : : "q"(__cr3_val) : "memory");
+}
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __INTRIN_H */
+#endif /* _MSC_VER */
diff --git a/24.0.3/clang-include/LICENSE.TXT b/24.0.3/clang-include/LICENSE.TXT
new file mode 100644
index 0000000..fc4afae
--- /dev/null
+++ b/24.0.3/clang-include/LICENSE.TXT
@@ -0,0 +1,63 @@
+==============================================================================
+LLVM Release License
+==============================================================================
+University of Illinois/NCSA
+Open Source License
+
+Copyright (c) 2007-2015 University of Illinois at Urbana-Champaign.
+All rights reserved.
+
+Developed by:
+
+    LLVM Team
+
+    University of Illinois at Urbana-Champaign
+
+    http://llvm.org
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal with
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimers.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+      this list of conditions and the following disclaimers in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the names of the LLVM Team, University of Illinois at
+      Urbana-Champaign, nor the names of its contributors may be used to
+      endorse or promote products derived from this Software without specific
+      prior written permission.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+SOFTWARE.
+
+==============================================================================
+The LLVM software contains code written by third parties.  Such software will
+have its own individual LICENSE.TXT file in the directory in which it appears.
+This file will describe the copyrights, license, and restrictions which apply
+to that code.
+
+The disclaimer of warranty in the University of Illinois Open Source License
+applies to all code in the LLVM Distribution, and nothing in any of the
+other licenses gives permission to use the names of the LLVM Team or the
+University of Illinois to endorse or promote products derived from this
+Software.
+
+The following pieces of software have additional or alternate copyrights,
+licenses, and/or restrictions:
+
+Program             Directory
+-------             ---------
+<none yet>
+
diff --git a/24.0.3/clang-include/adxintrin.h b/24.0.3/clang-include/adxintrin.h
new file mode 100644
index 0000000..ee34728
--- /dev/null
+++ b/24.0.3/clang-include/adxintrin.h
@@ -0,0 +1,86 @@
+/*===---- adxintrin.h - ADX intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <adxintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __ADXINTRIN_H
+#define __ADXINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
+
+/* Intrinsics that are available only if __ADX__ is defined. */
+static __inline unsigned char __attribute__((__always_inline__, __nodebug__, __target__("adx")))
+_addcarryx_u32(unsigned char __cf, unsigned int __x, unsigned int __y,
+               unsigned int *__p)
+{
+  return __builtin_ia32_addcarryx_u32(__cf, __x, __y, __p);
+}
+
+#ifdef __x86_64__
+static __inline unsigned char __attribute__((__always_inline__, __nodebug__, __target__("adx")))
+_addcarryx_u64(unsigned char __cf, unsigned long long __x,
+               unsigned long long __y, unsigned long long  *__p)
+{
+  return __builtin_ia32_addcarryx_u64(__cf, __x, __y, __p);
+}
+#endif
+
+/* Intrinsics that are available even if __ADX__ is not defined. */
+static __inline unsigned char __DEFAULT_FN_ATTRS
+_addcarry_u32(unsigned char __cf, unsigned int __x, unsigned int __y,
+              unsigned int *__p)
+{
+  return __builtin_ia32_addcarry_u32(__cf, __x, __y, __p);
+}
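+
+/* Illustrative example: adding two 64-bit values held as 32-bit limbs, where
+ * __a_lo/__a_hi and __b_lo/__b_hi are hypothetical operands:
+ *   unsigned int __lo, __hi;
+ *   unsigned char __c = _addcarry_u32(0, __a_lo, __b_lo, &__lo);
+ *   (void)_addcarry_u32(__c, __a_hi, __b_hi, &__hi);
+ */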
+
+#ifdef __x86_64__
+static __inline unsigned char __DEFAULT_FN_ATTRS
+_addcarry_u64(unsigned char __cf, unsigned long long __x,
+              unsigned long long __y, unsigned long long  *__p)
+{
+  return __builtin_ia32_addcarry_u64(__cf, __x, __y, __p);
+}
+#endif
+
+static __inline unsigned char __DEFAULT_FN_ATTRS
+_subborrow_u32(unsigned char __cf, unsigned int __x, unsigned int __y,
+              unsigned int *__p)
+{
+  return __builtin_ia32_subborrow_u32(__cf, __x, __y, __p);
+}
+
+#ifdef __x86_64__
+static __inline unsigned char __DEFAULT_FN_ATTRS
+_subborrow_u64(unsigned char __cf, unsigned long long __x,
+               unsigned long long __y, unsigned long long  *__p)
+{
+  return __builtin_ia32_subborrow_u64(__cf, __x, __y, __p);
+}
+#endif
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __ADXINTRIN_H */
diff --git a/24.0.3/clang-include/altivec.h b/24.0.3/clang-include/altivec.h
new file mode 100644
index 0000000..c39156e
--- /dev/null
+++ b/24.0.3/clang-include/altivec.h
@@ -0,0 +1,13919 @@
+/*===---- altivec.h - Standard header for AltiVec/VSX vector intrinsics ---===*\
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+\*===----------------------------------------------------------------------===*/
+
+#ifndef __ALTIVEC_H
+#define __ALTIVEC_H
+
+#ifndef __ALTIVEC__
+#error "AltiVec support not enabled"
+#endif
+
+/* constants for mapping CR6 bits to predicate result. */
+
+#define __CR6_EQ 0
+#define __CR6_EQ_REV 1
+#define __CR6_LT 2
+#define __CR6_LT_REV 3
+
+#define __ATTRS_o_ai __attribute__((__overloadable__, __always_inline__))
+
+static vector signed char __ATTRS_o_ai vec_perm(vector signed char __a,
+                                                vector signed char __b,
+                                                vector unsigned char __c);
+
+static vector unsigned char __ATTRS_o_ai vec_perm(vector unsigned char __a,
+                                                  vector unsigned char __b,
+                                                  vector unsigned char __c);
+
+static vector bool char __ATTRS_o_ai vec_perm(vector bool char __a,
+                                              vector bool char __b,
+                                              vector unsigned char __c);
+
+static vector short __ATTRS_o_ai vec_perm(vector signed short __a,
+                                          vector signed short __b,
+                                          vector unsigned char __c);
+
+static vector unsigned short __ATTRS_o_ai vec_perm(vector unsigned short __a,
+                                                   vector unsigned short __b,
+                                                   vector unsigned char __c);
+
+static vector bool short __ATTRS_o_ai vec_perm(vector bool short __a,
+                                               vector bool short __b,
+                                               vector unsigned char __c);
+
+static vector pixel __ATTRS_o_ai vec_perm(vector pixel __a, vector pixel __b,
+                                          vector unsigned char __c);
+
+static vector int __ATTRS_o_ai vec_perm(vector signed int __a,
+                                        vector signed int __b,
+                                        vector unsigned char __c);
+
+static vector unsigned int __ATTRS_o_ai vec_perm(vector unsigned int __a,
+                                                 vector unsigned int __b,
+                                                 vector unsigned char __c);
+
+static vector bool int __ATTRS_o_ai vec_perm(vector bool int __a,
+                                             vector bool int __b,
+                                             vector unsigned char __c);
+
+static vector float __ATTRS_o_ai vec_perm(vector float __a, vector float __b,
+                                          vector unsigned char __c);
+
+#ifdef __VSX__
+static vector long long __ATTRS_o_ai vec_perm(vector signed long long __a,
+                                              vector signed long long __b,
+                                              vector unsigned char __c);
+
+static vector unsigned long long __ATTRS_o_ai
+vec_perm(vector unsigned long long __a, vector unsigned long long __b,
+         vector unsigned char __c);
+
+static vector bool long long __ATTRS_o_ai
+vec_perm(vector bool long long __a, vector bool long long __b,
+         vector unsigned char __c);
+
+static vector double __ATTRS_o_ai vec_perm(vector double __a, vector double __b,
+                                           vector unsigned char __c);
+#endif
+
+static vector unsigned char __ATTRS_o_ai vec_xor(vector unsigned char __a,
+                                                 vector unsigned char __b);
+
+/* vec_abs */
+
+#define __builtin_altivec_abs_v16qi vec_abs
+#define __builtin_altivec_abs_v8hi vec_abs
+#define __builtin_altivec_abs_v4si vec_abs
+
+static vector signed char __ATTRS_o_ai vec_abs(vector signed char __a) {
+  return __builtin_altivec_vmaxsb(__a, -__a);
+}
+
+static vector signed short __ATTRS_o_ai vec_abs(vector signed short __a) {
+  return __builtin_altivec_vmaxsh(__a, -__a);
+}
+
+static vector signed int __ATTRS_o_ai vec_abs(vector signed int __a) {
+  return __builtin_altivec_vmaxsw(__a, -__a);
+}
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static vector signed long long __ATTRS_o_ai
+vec_abs(vector signed long long __a) {
+  return __builtin_altivec_vmaxsd(__a, -__a);
+}
+#endif
+
+static vector float __ATTRS_o_ai vec_abs(vector float __a) {
+  vector unsigned int __res =
+      (vector unsigned int)__a & (vector unsigned int)(0x7FFFFFFF);
+  return (vector float)__res;
+}
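+/* Illustrative example: the mask clears only the IEEE sign bit, so
+ * vec_abs((vector float){-1.0f, 2.5f, -0.0f, 3.0f}) yields
+ * (vector float){1.0f, 2.5f, 0.0f, 3.0f}. */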
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static vector double __ATTRS_o_ai vec_abs(vector double __a) {
+  vector unsigned long long __res = { 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF };
+  __res &= (vector unsigned int)__a;
+  return (vector double)__res;
+}
+#endif
+
+/* vec_abss */
+#define __builtin_altivec_abss_v16qi vec_abss
+#define __builtin_altivec_abss_v8hi vec_abss
+#define __builtin_altivec_abss_v4si vec_abss
+
+static vector signed char __ATTRS_o_ai vec_abss(vector signed char __a) {
+  return __builtin_altivec_vmaxsb(
+      __a, __builtin_altivec_vsubsbs((vector signed char)(0), __a));
+}
+
+static vector signed short __ATTRS_o_ai vec_abss(vector signed short __a) {
+  return __builtin_altivec_vmaxsh(
+      __a, __builtin_altivec_vsubshs((vector signed short)(0), __a));
+}
+
+static vector signed int __ATTRS_o_ai vec_abss(vector signed int __a) {
+  return __builtin_altivec_vmaxsw(
+      __a, __builtin_altivec_vsubsws((vector signed int)(0), __a));
+}
+
+/* vec_add */
+
+static vector signed char __ATTRS_o_ai vec_add(vector signed char __a,
+                                               vector signed char __b) {
+  return __a + __b;
+}
+
+static vector signed char __ATTRS_o_ai vec_add(vector bool char __a,
+                                               vector signed char __b) {
+  return (vector signed char)__a + __b;
+}
+
+static vector signed char __ATTRS_o_ai vec_add(vector signed char __a,
+                                               vector bool char __b) {
+  return __a + (vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_add(vector unsigned char __a,
+                                                 vector unsigned char __b) {
+  return __a + __b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_add(vector bool char __a,
+                                                 vector unsigned char __b) {
+  return (vector unsigned char)__a + __b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_add(vector unsigned char __a,
+                                                 vector bool char __b) {
+  return __a + (vector unsigned char)__b;
+}
+
+static vector short __ATTRS_o_ai vec_add(vector short __a, vector short __b) {
+  return __a + __b;
+}
+
+static vector short __ATTRS_o_ai vec_add(vector bool short __a,
+                                         vector short __b) {
+  return (vector short)__a + __b;
+}
+
+static vector short __ATTRS_o_ai vec_add(vector short __a,
+                                         vector bool short __b) {
+  return __a + (vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_add(vector unsigned short __a,
+                                                  vector unsigned short __b) {
+  return __a + __b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_add(vector bool short __a,
+                                                  vector unsigned short __b) {
+  return (vector unsigned short)__a + __b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_add(vector unsigned short __a,
+                                                  vector bool short __b) {
+  return __a + (vector unsigned short)__b;
+}
+
+static vector int __ATTRS_o_ai vec_add(vector int __a, vector int __b) {
+  return __a + __b;
+}
+
+static vector int __ATTRS_o_ai vec_add(vector bool int __a, vector int __b) {
+  return (vector int)__a + __b;
+}
+
+static vector int __ATTRS_o_ai vec_add(vector int __a, vector bool int __b) {
+  return __a + (vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_add(vector unsigned int __a,
+                                                vector unsigned int __b) {
+  return __a + __b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_add(vector bool int __a,
+                                                vector unsigned int __b) {
+  return (vector unsigned int)__a + __b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_add(vector unsigned int __a,
+                                                vector bool int __b) {
+  return __a + (vector unsigned int)__b;
+}
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static vector signed long long __ATTRS_o_ai
+vec_add(vector signed long long __a, vector signed long long __b) {
+  return __a + __b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_add(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a + __b;
+}
+
+static vector signed __int128 __ATTRS_o_ai vec_add(vector signed __int128 __a,
+                                                   vector signed __int128 __b) {
+  return __a + __b;
+}
+
+static vector unsigned __int128 __ATTRS_o_ai
+vec_add(vector unsigned __int128 __a, vector unsigned __int128 __b) {
+  return __a + __b;
+}
+#endif // defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+
+static vector float __ATTRS_o_ai vec_add(vector float __a, vector float __b) {
+  return __a + __b;
+}
+
+#ifdef __VSX__
+static vector double __ATTRS_o_ai
+vec_add(vector double __a, vector double __b) {
+  return __a + __b;
+}
+#endif // __VSX__
+
+/* vec_adde */
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static vector signed __int128 __ATTRS_o_ai
+vec_adde(vector signed __int128 __a, vector signed __int128 __b,
+         vector signed __int128 __c) {
+  return __builtin_altivec_vaddeuqm(__a, __b, __c);
+}
+
+static vector unsigned __int128 __ATTRS_o_ai
+vec_adde(vector unsigned __int128 __a, vector unsigned __int128 __b,
+         vector unsigned __int128 __c) {
+  return __builtin_altivec_vaddeuqm(__a, __b, __c);
+}
+#endif
+
+/* vec_addec */
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static vector signed __int128 __ATTRS_o_ai
+vec_addec(vector signed __int128 __a, vector signed __int128 __b,
+          vector signed __int128 __c) {
+  return __builtin_altivec_vaddecuq(__a, __b, __c);
+}
+
+static vector unsigned __int128 __ATTRS_o_ai
+vec_addec(vector unsigned __int128 __a, vector unsigned __int128 __b,
+          vector unsigned __int128 __c) {
+  return __builtin_altivec_vaddecuq(__a, __b, __c);
+}
+#endif
+
+/* vec_vaddubm */
+
+#define __builtin_altivec_vaddubm vec_vaddubm
+
+static vector signed char __ATTRS_o_ai vec_vaddubm(vector signed char __a,
+                                                   vector signed char __b) {
+  return __a + __b;
+}
+
+static vector signed char __ATTRS_o_ai vec_vaddubm(vector bool char __a,
+                                                   vector signed char __b) {
+  return (vector signed char)__a + __b;
+}
+
+static vector signed char __ATTRS_o_ai vec_vaddubm(vector signed char __a,
+                                                   vector bool char __b) {
+  return __a + (vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vaddubm(vector unsigned char __a,
+                                                     vector unsigned char __b) {
+  return __a + __b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vaddubm(vector bool char __a,
+                                                     vector unsigned char __b) {
+  return (vector unsigned char)__a + __b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vaddubm(vector unsigned char __a,
+                                                     vector bool char __b) {
+  return __a + (vector unsigned char)__b;
+}
+
+/* vec_vadduhm */
+
+#define __builtin_altivec_vadduhm vec_vadduhm
+
+static vector short __ATTRS_o_ai vec_vadduhm(vector short __a,
+                                             vector short __b) {
+  return __a + __b;
+}
+
+static vector short __ATTRS_o_ai vec_vadduhm(vector bool short __a,
+                                             vector short __b) {
+  return (vector short)__a + __b;
+}
+
+static vector short __ATTRS_o_ai vec_vadduhm(vector short __a,
+                                             vector bool short __b) {
+  return __a + (vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vadduhm(vector unsigned short __a, vector unsigned short __b) {
+  return __a + __b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vadduhm(vector bool short __a, vector unsigned short __b) {
+  return (vector unsigned short)__a + __b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vadduhm(vector unsigned short __a,
+                                                      vector bool short __b) {
+  return __a + (vector unsigned short)__b;
+}
+
+/* vec_vadduwm */
+
+#define __builtin_altivec_vadduwm vec_vadduwm
+
+static vector int __ATTRS_o_ai vec_vadduwm(vector int __a, vector int __b) {
+  return __a + __b;
+}
+
+static vector int __ATTRS_o_ai vec_vadduwm(vector bool int __a,
+                                           vector int __b) {
+  return (vector int)__a + __b;
+}
+
+static vector int __ATTRS_o_ai vec_vadduwm(vector int __a,
+                                           vector bool int __b) {
+  return __a + (vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vadduwm(vector unsigned int __a,
+                                                    vector unsigned int __b) {
+  return __a + __b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vadduwm(vector bool int __a,
+                                                    vector unsigned int __b) {
+  return (vector unsigned int)__a + __b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vadduwm(vector unsigned int __a,
+                                                    vector bool int __b) {
+  return __a + (vector unsigned int)__b;
+}
+
+/* vec_vaddfp */
+
+#define __builtin_altivec_vaddfp vec_vaddfp
+
+static vector float __attribute__((__always_inline__))
+vec_vaddfp(vector float __a, vector float __b) {
+  return __a + __b;
+}
+
+/* vec_addc */
+
+static vector signed int __ATTRS_o_ai vec_addc(vector signed int __a,
+                                               vector signed int __b) {
+  return (vector signed int)__builtin_altivec_vaddcuw((vector unsigned int)__a,
+                                                      (vector unsigned int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_addc(vector unsigned int __a,
+                                                 vector unsigned int __b) {
+  return __builtin_altivec_vaddcuw(__a, __b);
+}
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static vector signed __int128 __ATTRS_o_ai
+vec_addc(vector signed __int128 __a, vector signed __int128 __b) {
+  return (vector signed __int128)__builtin_altivec_vaddcuq(
+    (vector unsigned __int128)__a,
+    (vector unsigned __int128)__b);
+}
+
+static vector unsigned __int128 __ATTRS_o_ai
+vec_addc(vector unsigned __int128 __a, vector unsigned __int128 __b) {
+  return __builtin_altivec_vaddcuq(__a, __b);
+}
+#endif // defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+
+/* vec_vaddcuw */
+
+static vector unsigned int __attribute__((__always_inline__))
+vec_vaddcuw(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_altivec_vaddcuw(__a, __b);
+}
+
+/* vec_adds */
+
+static vector signed char __ATTRS_o_ai vec_adds(vector signed char __a,
+                                                vector signed char __b) {
+  return __builtin_altivec_vaddsbs(__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai vec_adds(vector bool char __a,
+                                                vector signed char __b) {
+  return __builtin_altivec_vaddsbs((vector signed char)__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai vec_adds(vector signed char __a,
+                                                vector bool char __b) {
+  return __builtin_altivec_vaddsbs(__a, (vector signed char)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_adds(vector unsigned char __a,
+                                                  vector unsigned char __b) {
+  return __builtin_altivec_vaddubs(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_adds(vector bool char __a,
+                                                  vector unsigned char __b) {
+  return __builtin_altivec_vaddubs((vector unsigned char)__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_adds(vector unsigned char __a,
+                                                  vector bool char __b) {
+  return __builtin_altivec_vaddubs(__a, (vector unsigned char)__b);
+}
+
+static vector short __ATTRS_o_ai vec_adds(vector short __a, vector short __b) {
+  return __builtin_altivec_vaddshs(__a, __b);
+}
+
+static vector short __ATTRS_o_ai vec_adds(vector bool short __a,
+                                          vector short __b) {
+  return __builtin_altivec_vaddshs((vector short)__a, __b);
+}
+
+static vector short __ATTRS_o_ai vec_adds(vector short __a,
+                                          vector bool short __b) {
+  return __builtin_altivec_vaddshs(__a, (vector short)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_adds(vector unsigned short __a,
+                                                   vector unsigned short __b) {
+  return __builtin_altivec_vadduhs(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_adds(vector bool short __a,
+                                                   vector unsigned short __b) {
+  return __builtin_altivec_vadduhs((vector unsigned short)__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_adds(vector unsigned short __a,
+                                                   vector bool short __b) {
+  return __builtin_altivec_vadduhs(__a, (vector unsigned short)__b);
+}
+
+static vector int __ATTRS_o_ai vec_adds(vector int __a, vector int __b) {
+  return __builtin_altivec_vaddsws(__a, __b);
+}
+
+static vector int __ATTRS_o_ai vec_adds(vector bool int __a, vector int __b) {
+  return __builtin_altivec_vaddsws((vector int)__a, __b);
+}
+
+static vector int __ATTRS_o_ai vec_adds(vector int __a, vector bool int __b) {
+  return __builtin_altivec_vaddsws(__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_adds(vector unsigned int __a,
+                                                 vector unsigned int __b) {
+  return __builtin_altivec_vadduws(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_adds(vector bool int __a,
+                                                 vector unsigned int __b) {
+  return __builtin_altivec_vadduws((vector unsigned int)__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_adds(vector unsigned int __a,
+                                                 vector bool int __b) {
+  return __builtin_altivec_vadduws(__a, (vector unsigned int)__b);
+}
+
+/* vec_vaddsbs */
+
+static vector signed char __ATTRS_o_ai vec_vaddsbs(vector signed char __a,
+                                                   vector signed char __b) {
+  return __builtin_altivec_vaddsbs(__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai vec_vaddsbs(vector bool char __a,
+                                                   vector signed char __b) {
+  return __builtin_altivec_vaddsbs((vector signed char)__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai vec_vaddsbs(vector signed char __a,
+                                                   vector bool char __b) {
+  return __builtin_altivec_vaddsbs(__a, (vector signed char)__b);
+}
+
+/* vec_vaddubs */
+
+static vector unsigned char __ATTRS_o_ai vec_vaddubs(vector unsigned char __a,
+                                                     vector unsigned char __b) {
+  return __builtin_altivec_vaddubs(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vaddubs(vector bool char __a,
+                                                     vector unsigned char __b) {
+  return __builtin_altivec_vaddubs((vector unsigned char)__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vaddubs(vector unsigned char __a,
+                                                     vector bool char __b) {
+  return __builtin_altivec_vaddubs(__a, (vector unsigned char)__b);
+}
+
+/* vec_vaddshs */
+
+static vector short __ATTRS_o_ai vec_vaddshs(vector short __a,
+                                             vector short __b) {
+  return __builtin_altivec_vaddshs(__a, __b);
+}
+
+static vector short __ATTRS_o_ai vec_vaddshs(vector bool short __a,
+                                             vector short __b) {
+  return __builtin_altivec_vaddshs((vector short)__a, __b);
+}
+
+static vector short __ATTRS_o_ai vec_vaddshs(vector short __a,
+                                             vector bool short __b) {
+  return __builtin_altivec_vaddshs(__a, (vector short)__b);
+}
+
+/* vec_vadduhs */
+
+static vector unsigned short __ATTRS_o_ai
+vec_vadduhs(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_altivec_vadduhs(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vadduhs(vector bool short __a, vector unsigned short __b) {
+  return __builtin_altivec_vadduhs((vector unsigned short)__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vadduhs(vector unsigned short __a,
+                                                      vector bool short __b) {
+  return __builtin_altivec_vadduhs(__a, (vector unsigned short)__b);
+}
+
+/* vec_vaddsws */
+
+static vector int __ATTRS_o_ai vec_vaddsws(vector int __a, vector int __b) {
+  return __builtin_altivec_vaddsws(__a, __b);
+}
+
+static vector int __ATTRS_o_ai vec_vaddsws(vector bool int __a,
+                                           vector int __b) {
+  return __builtin_altivec_vaddsws((vector int)__a, __b);
+}
+
+static vector int __ATTRS_o_ai vec_vaddsws(vector int __a,
+                                           vector bool int __b) {
+  return __builtin_altivec_vaddsws(__a, (vector int)__b);
+}
+
+/* vec_vadduws */
+
+static vector unsigned int __ATTRS_o_ai vec_vadduws(vector unsigned int __a,
+                                                    vector unsigned int __b) {
+  return __builtin_altivec_vadduws(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vadduws(vector bool int __a,
+                                                    vector unsigned int __b) {
+  return __builtin_altivec_vadduws((vector unsigned int)__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vadduws(vector unsigned int __a,
+                                                    vector bool int __b) {
+  return __builtin_altivec_vadduws(__a, (vector unsigned int)__b);
+}
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+/* vec_vadduqm */
+
+static vector signed __int128 __ATTRS_o_ai
+vec_vadduqm(vector signed __int128 __a, vector signed __int128 __b) {
+  return __a + __b;
+}
+
+static vector unsigned __int128 __ATTRS_o_ai
+vec_vadduqm(vector unsigned __int128 __a, vector unsigned __int128 __b) {
+  return __a + __b;
+}
+
+/* vec_vaddeuqm */
+
+static vector signed __int128 __ATTRS_o_ai
+vec_vaddeuqm(vector signed __int128 __a, vector signed __int128 __b,
+             vector signed __int128 __c) {
+  return __builtin_altivec_vaddeuqm(__a, __b, __c);
+}
+
+static vector unsigned __int128 __ATTRS_o_ai
+vec_vaddeuqm(vector unsigned __int128 __a, vector unsigned __int128 __b,
+             vector unsigned __int128 __c) {
+  return __builtin_altivec_vaddeuqm(__a, __b, __c);
+}
+
+/* vec_vaddcuq */
+
+static vector signed __int128 __ATTRS_o_ai
+vec_vaddcuq(vector signed __int128 __a, vector signed __int128 __b) {
+  return __builtin_altivec_vaddcuq(__a, __b);
+}
+
+static vector unsigned __int128 __ATTRS_o_ai
+vec_vaddcuq(vector unsigned __int128 __a, vector unsigned __int128 __b) {
+  return __builtin_altivec_vaddcuq(__a, __b);
+}
+
+/* vec_vaddecuq */
+
+static vector signed __int128 __ATTRS_o_ai
+vec_vaddecuq(vector signed __int128 __a, vector signed __int128 __b,
+             vector signed __int128 __c) {
+  return __builtin_altivec_vaddecuq(__a, __b, __c);
+}
+
+static vector unsigned __int128 __ATTRS_o_ai
+vec_vaddecuq(vector unsigned __int128 __a, vector unsigned __int128 __b,
+             vector unsigned __int128 __c) {
+  return __builtin_altivec_vaddecuq(__a, __b, __c);
+}
+#endif // defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+
+/* vec_and */
+
+#define __builtin_altivec_vand vec_and
+
+static vector signed char __ATTRS_o_ai vec_and(vector signed char __a,
+                                               vector signed char __b) {
+  return __a & __b;
+}
+
+static vector signed char __ATTRS_o_ai vec_and(vector bool char __a,
+                                               vector signed char __b) {
+  return (vector signed char)__a & __b;
+}
+
+static vector signed char __ATTRS_o_ai vec_and(vector signed char __a,
+                                               vector bool char __b) {
+  return __a & (vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_and(vector unsigned char __a,
+                                                 vector unsigned char __b) {
+  return __a & __b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_and(vector bool char __a,
+                                                 vector unsigned char __b) {
+  return (vector unsigned char)__a & __b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_and(vector unsigned char __a,
+                                                 vector bool char __b) {
+  return __a & (vector unsigned char)__b;
+}
+
+static vector bool char __ATTRS_o_ai vec_and(vector bool char __a,
+                                             vector bool char __b) {
+  return __a & __b;
+}
+
+static vector short __ATTRS_o_ai vec_and(vector short __a, vector short __b) {
+  return __a & __b;
+}
+
+static vector short __ATTRS_o_ai vec_and(vector bool short __a,
+                                         vector short __b) {
+  return (vector short)__a & __b;
+}
+
+static vector short __ATTRS_o_ai vec_and(vector short __a,
+                                         vector bool short __b) {
+  return __a & (vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_and(vector unsigned short __a,
+                                                  vector unsigned short __b) {
+  return __a & __b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_and(vector bool short __a,
+                                                  vector unsigned short __b) {
+  return (vector unsigned short)__a & __b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_and(vector unsigned short __a,
+                                                  vector bool short __b) {
+  return __a & (vector unsigned short)__b;
+}
+
+static vector bool short __ATTRS_o_ai vec_and(vector bool short __a,
+                                              vector bool short __b) {
+  return __a & __b;
+}
+
+static vector int __ATTRS_o_ai vec_and(vector int __a, vector int __b) {
+  return __a & __b;
+}
+
+static vector int __ATTRS_o_ai vec_and(vector bool int __a, vector int __b) {
+  return (vector int)__a & __b;
+}
+
+static vector int __ATTRS_o_ai vec_and(vector int __a, vector bool int __b) {
+  return __a & (vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_and(vector unsigned int __a,
+                                                vector unsigned int __b) {
+  return __a & __b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_and(vector bool int __a,
+                                                vector unsigned int __b) {
+  return (vector unsigned int)__a & __b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_and(vector unsigned int __a,
+                                                vector bool int __b) {
+  return __a & (vector unsigned int)__b;
+}
+
+static vector bool int __ATTRS_o_ai vec_and(vector bool int __a,
+                                            vector bool int __b) {
+  return __a & __b;
+}
+
+static vector float __ATTRS_o_ai vec_and(vector float __a, vector float __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a & (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai vec_and(vector bool int __a,
+                                         vector float __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a & (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai vec_and(vector float __a,
+                                         vector bool int __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a & (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+#ifdef __VSX__
+static vector double __ATTRS_o_ai
+vec_and(vector bool long long __a, vector double __b) {
+  vector unsigned long long __res =
+      (vector unsigned long long)__a & (vector unsigned long long)__b;
+  return (vector double)__res;
+}
+
+static vector double __ATTRS_o_ai
+vec_and(vector double __a, vector bool long long __b) {
+  vector unsigned long long __res =
+      (vector unsigned long long)__a & (vector unsigned long long)__b;
+  return (vector double)__res;
+}
+
+static vector double __ATTRS_o_ai
+vec_and(vector double __a, vector double __b) {
+  vector unsigned long long __res =
+      (vector unsigned long long)__a & (vector unsigned long long)__b;
+  return (vector double)__res;
+}
+
+static vector signed long long __ATTRS_o_ai
+vec_and(vector signed long long __a, vector signed long long __b) {
+  return __a & __b;
+}
+
+static vector signed long long __ATTRS_o_ai
+vec_and(vector bool long long __a, vector signed long long __b) {
+  return (vector signed long long)__a & __b;
+}
+
+static vector signed long long __ATTRS_o_ai vec_and(vector signed long long __a,
+                                                    vector bool long long __b) {
+  return __a & (vector signed long long)__b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_and(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a & __b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_and(vector bool long long __a, vector unsigned long long __b) {
+  return (vector unsigned long long)__a & __b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_and(vector unsigned long long __a, vector bool long long __b) {
+  return __a & (vector unsigned long long)__b;
+}
+
+static vector bool long long __ATTRS_o_ai vec_and(vector bool long long __a,
+                                                  vector bool long long __b) {
+  return __a & __b;
+}
+#endif
+
+/* vec_vand */
+
+static vector signed char __ATTRS_o_ai vec_vand(vector signed char __a,
+                                                vector signed char __b) {
+  return __a & __b;
+}
+
+static vector signed char __ATTRS_o_ai vec_vand(vector bool char __a,
+                                                vector signed char __b) {
+  return (vector signed char)__a & __b;
+}
+
+static vector signed char __ATTRS_o_ai vec_vand(vector signed char __a,
+                                                vector bool char __b) {
+  return __a & (vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vand(vector unsigned char __a,
+                                                  vector unsigned char __b) {
+  return __a & __b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vand(vector bool char __a,
+                                                  vector unsigned char __b) {
+  return (vector unsigned char)__a & __b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vand(vector unsigned char __a,
+                                                  vector bool char __b) {
+  return __a & (vector unsigned char)__b;
+}
+
+static vector bool char __ATTRS_o_ai vec_vand(vector bool char __a,
+                                              vector bool char __b) {
+  return __a & __b;
+}
+
+static vector short __ATTRS_o_ai vec_vand(vector short __a, vector short __b) {
+  return __a & __b;
+}
+
+static vector short __ATTRS_o_ai vec_vand(vector bool short __a,
+                                          vector short __b) {
+  return (vector short)__a & __b;
+}
+
+static vector short __ATTRS_o_ai vec_vand(vector short __a,
+                                          vector bool short __b) {
+  return __a & (vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vand(vector unsigned short __a,
+                                                   vector unsigned short __b) {
+  return __a & __b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vand(vector bool short __a,
+                                                   vector unsigned short __b) {
+  return (vector unsigned short)__a & __b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vand(vector unsigned short __a,
+                                                   vector bool short __b) {
+  return __a & (vector unsigned short)__b;
+}
+
+static vector bool short __ATTRS_o_ai vec_vand(vector bool short __a,
+                                               vector bool short __b) {
+  return __a & __b;
+}
+
+static vector int __ATTRS_o_ai vec_vand(vector int __a, vector int __b) {
+  return __a & __b;
+}
+
+static vector int __ATTRS_o_ai vec_vand(vector bool int __a, vector int __b) {
+  return (vector int)__a & __b;
+}
+
+static vector int __ATTRS_o_ai vec_vand(vector int __a, vector bool int __b) {
+  return __a & (vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vand(vector unsigned int __a,
+                                                 vector unsigned int __b) {
+  return __a & __b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vand(vector bool int __a,
+                                                 vector unsigned int __b) {
+  return (vector unsigned int)__a & __b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vand(vector unsigned int __a,
+                                                 vector bool int __b) {
+  return __a & (vector unsigned int)__b;
+}
+
+static vector bool int __ATTRS_o_ai vec_vand(vector bool int __a,
+                                             vector bool int __b) {
+  return __a & __b;
+}
+
+static vector float __ATTRS_o_ai vec_vand(vector float __a, vector float __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a & (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai vec_vand(vector bool int __a,
+                                          vector float __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a & (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai vec_vand(vector float __a,
+                                          vector bool int __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a & (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+#ifdef __VSX__
+static vector signed long long __ATTRS_o_ai
+vec_vand(vector signed long long __a, vector signed long long __b) {
+  return __a & __b;
+}
+
+static vector signed long long __ATTRS_o_ai
+vec_vand(vector bool long long __a, vector signed long long __b) {
+  return (vector signed long long)__a & __b;
+}
+
+static vector signed long long __ATTRS_o_ai
+vec_vand(vector signed long long __a, vector bool long long __b) {
+  return __a & (vector signed long long)__b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_vand(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a & __b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_vand(vector bool long long __a, vector unsigned long long __b) {
+  return (vector unsigned long long)__a & __b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_vand(vector unsigned long long __a, vector bool long long __b) {
+  return __a & (vector unsigned long long)__b;
+}
+
+static vector bool long long __ATTRS_o_ai vec_vand(vector bool long long __a,
+                                                   vector bool long long __b) {
+  return __a & __b;
+}
+#endif
+
+/* vec_andc */
+
+#define __builtin_altivec_vandc vec_andc
+
+static vector signed char __ATTRS_o_ai vec_andc(vector signed char __a,
+                                                vector signed char __b) {
+  return __a & ~__b;
+}
+
+static vector signed char __ATTRS_o_ai vec_andc(vector bool char __a,
+                                                vector signed char __b) {
+  return (vector signed char)__a & ~__b;
+}
+
+static vector signed char __ATTRS_o_ai vec_andc(vector signed char __a,
+                                                vector bool char __b) {
+  return __a & ~(vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_andc(vector unsigned char __a,
+                                                  vector unsigned char __b) {
+  return __a & ~__b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_andc(vector bool char __a,
+                                                  vector unsigned char __b) {
+  return (vector unsigned char)__a & ~__b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_andc(vector unsigned char __a,
+                                                  vector bool char __b) {
+  return __a & ~(vector unsigned char)__b;
+}
+
+static vector bool char __ATTRS_o_ai vec_andc(vector bool char __a,
+                                              vector bool char __b) {
+  return __a & ~__b;
+}
+
+static vector short __ATTRS_o_ai vec_andc(vector short __a, vector short __b) {
+  return __a & ~__b;
+}
+
+static vector short __ATTRS_o_ai vec_andc(vector bool short __a,
+                                          vector short __b) {
+  return (vector short)__a & ~__b;
+}
+
+static vector short __ATTRS_o_ai vec_andc(vector short __a,
+                                          vector bool short __b) {
+  return __a & ~(vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_andc(vector unsigned short __a,
+                                                   vector unsigned short __b) {
+  return __a & ~__b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_andc(vector bool short __a,
+                                                   vector unsigned short __b) {
+  return (vector unsigned short)__a & ~__b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_andc(vector unsigned short __a,
+                                                   vector bool short __b) {
+  return __a & ~(vector unsigned short)__b;
+}
+
+static vector bool short __ATTRS_o_ai vec_andc(vector bool short __a,
+                                               vector bool short __b) {
+  return __a & ~__b;
+}
+
+static vector int __ATTRS_o_ai vec_andc(vector int __a, vector int __b) {
+  return __a & ~__b;
+}
+
+static vector int __ATTRS_o_ai vec_andc(vector bool int __a, vector int __b) {
+  return (vector int)__a & ~__b;
+}
+
+static vector int __ATTRS_o_ai vec_andc(vector int __a, vector bool int __b) {
+  return __a & ~(vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_andc(vector unsigned int __a,
+                                                 vector unsigned int __b) {
+  return __a & ~__b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_andc(vector bool int __a,
+                                                 vector unsigned int __b) {
+  return (vector unsigned int)__a & ~__b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_andc(vector unsigned int __a,
+                                                 vector bool int __b) {
+  return __a & ~(vector unsigned int)__b;
+}
+
+static vector bool int __ATTRS_o_ai vec_andc(vector bool int __a,
+                                             vector bool int __b) {
+  return __a & ~__b;
+}
+
+static vector float __ATTRS_o_ai vec_andc(vector float __a, vector float __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a & ~(vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai vec_andc(vector bool int __a,
+                                          vector float __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a & ~(vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai vec_andc(vector float __a,
+                                          vector bool int __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a & ~(vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+#ifdef __VSX__
+static vector double __ATTRS_o_ai
+vec_andc(vector bool long long __a, vector double __b) {
+  vector unsigned long long __res =
+      (vector unsigned long long)__a & ~(vector unsigned long long)__b;
+  return (vector double)__res;
+}
+
+static vector double __ATTRS_o_ai
+vec_andc(vector double __a, vector bool long long __b) {
+  vector unsigned long long __res =
+      (vector unsigned long long)__a & ~(vector unsigned long long)__b;
+  return (vector double)__res;
+}
+
+static vector double __ATTRS_o_ai
+vec_andc(vector double __a, vector double __b) {
+  vector unsigned long long __res =
+      (vector unsigned long long)__a & ~(vector unsigned long long)__b;
+  return (vector double)__res;
+}
+
+static vector signed long long __ATTRS_o_ai
+vec_andc(vector signed long long __a, vector signed long long __b) {
+  return __a & ~__b;
+}
+
+static vector signed long long __ATTRS_o_ai
+vec_andc(vector bool long long __a, vector signed long long __b) {
+  return (vector signed long long)__a & ~__b;
+}
+
+static vector signed long long __ATTRS_o_ai
+vec_andc(vector signed long long __a, vector bool long long __b) {
+  return __a & ~(vector signed long long)__b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_andc(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a & ~__b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_andc(vector bool long long __a, vector unsigned long long __b) {
+  return (vector unsigned long long)__a & ~__b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_andc(vector unsigned long long __a, vector bool long long __b) {
+  return __a & ~(vector unsigned long long)__b;
+}
+
+static vector bool long long __ATTRS_o_ai vec_andc(vector bool long long __a,
+                                                   vector bool long long __b) {
+  return __a & ~__b;
+}
+#endif
+
+/* vec_vandc */
+
+static vector signed char __ATTRS_o_ai vec_vandc(vector signed char __a,
+                                                 vector signed char __b) {
+  return __a & ~__b;
+}
+
+static vector signed char __ATTRS_o_ai vec_vandc(vector bool char __a,
+                                                 vector signed char __b) {
+  return (vector signed char)__a & ~__b;
+}
+
+static vector signed char __ATTRS_o_ai vec_vandc(vector signed char __a,
+                                                 vector bool char __b) {
+  return __a & ~(vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vandc(vector unsigned char __a,
+                                                   vector unsigned char __b) {
+  return __a & ~__b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vandc(vector bool char __a,
+                                                   vector unsigned char __b) {
+  return (vector unsigned char)__a & ~__b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vandc(vector unsigned char __a,
+                                                   vector bool char __b) {
+  return __a & ~(vector unsigned char)__b;
+}
+
+static vector bool char __ATTRS_o_ai vec_vandc(vector bool char __a,
+                                               vector bool char __b) {
+  return __a & ~__b;
+}
+
+static vector short __ATTRS_o_ai vec_vandc(vector short __a, vector short __b) {
+  return __a & ~__b;
+}
+
+static vector short __ATTRS_o_ai vec_vandc(vector bool short __a,
+                                           vector short __b) {
+  return (vector short)__a & ~__b;
+}
+
+static vector short __ATTRS_o_ai vec_vandc(vector short __a,
+                                           vector bool short __b) {
+  return __a & ~(vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vandc(vector unsigned short __a,
+                                                    vector unsigned short __b) {
+  return __a & ~__b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vandc(vector bool short __a,
+                                                    vector unsigned short __b) {
+  return (vector unsigned short)__a & ~__b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vandc(vector unsigned short __a,
+                                                    vector bool short __b) {
+  return __a & ~(vector unsigned short)__b;
+}
+
+static vector bool short __ATTRS_o_ai vec_vandc(vector bool short __a,
+                                                vector bool short __b) {
+  return __a & ~__b;
+}
+
+static vector int __ATTRS_o_ai vec_vandc(vector int __a, vector int __b) {
+  return __a & ~__b;
+}
+
+static vector int __ATTRS_o_ai vec_vandc(vector bool int __a, vector int __b) {
+  return (vector int)__a & ~__b;
+}
+
+static vector int __ATTRS_o_ai vec_vandc(vector int __a, vector bool int __b) {
+  return __a & ~(vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vandc(vector unsigned int __a,
+                                                  vector unsigned int __b) {
+  return __a & ~__b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vandc(vector bool int __a,
+                                                  vector unsigned int __b) {
+  return (vector unsigned int)__a & ~__b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vandc(vector unsigned int __a,
+                                                  vector bool int __b) {
+  return __a & ~(vector unsigned int)__b;
+}
+
+static vector bool int __ATTRS_o_ai vec_vandc(vector bool int __a,
+                                              vector bool int __b) {
+  return __a & ~__b;
+}
+
+static vector float __ATTRS_o_ai vec_vandc(vector float __a, vector float __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a & ~(vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai vec_vandc(vector bool int __a,
+                                           vector float __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a & ~(vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai vec_vandc(vector float __a,
+                                           vector bool int __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a & ~(vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+#ifdef __VSX__
+static vector signed long long __ATTRS_o_ai
+vec_vandc(vector signed long long __a, vector signed long long __b) {
+  return __a & ~__b;
+}
+
+static vector signed long long __ATTRS_o_ai
+vec_vandc(vector bool long long __a, vector signed long long __b) {
+  return (vector signed long long)__a & ~__b;
+}
+
+static vector signed long long __ATTRS_o_ai
+vec_vandc(vector signed long long __a, vector bool long long __b) {
+  return __a & ~(vector signed long long)__b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_vandc(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a & ~__b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_vandc(vector bool long long __a, vector unsigned long long __b) {
+  return (vector unsigned long long)__a & ~__b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_vandc(vector unsigned long long __a, vector bool long long __b) {
+  return __a & ~(vector unsigned long long)__b;
+}
+
+static vector bool long long __ATTRS_o_ai vec_vandc(vector bool long long __a,
+                                                    vector bool long long __b) {
+  return __a & ~__b;
+}
+#endif
+
+/* vec_avg */
+
+static vector signed char __ATTRS_o_ai vec_avg(vector signed char __a,
+                                               vector signed char __b) {
+  return __builtin_altivec_vavgsb(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_avg(vector unsigned char __a,
+                                                 vector unsigned char __b) {
+  return __builtin_altivec_vavgub(__a, __b);
+}
+
+static vector short __ATTRS_o_ai vec_avg(vector short __a, vector short __b) {
+  return __builtin_altivec_vavgsh(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_avg(vector unsigned short __a,
+                                                  vector unsigned short __b) {
+  return __builtin_altivec_vavguh(__a, __b);
+}
+
+static vector int __ATTRS_o_ai vec_avg(vector int __a, vector int __b) {
+  return __builtin_altivec_vavgsw(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_avg(vector unsigned int __a,
+                                                vector unsigned int __b) {
+  return __builtin_altivec_vavguw(__a, __b);
+}
+
+/* vec_vavgsb */
+
+static vector signed char __attribute__((__always_inline__))
+vec_vavgsb(vector signed char __a, vector signed char __b) {
+  return __builtin_altivec_vavgsb(__a, __b);
+}
+
+/* vec_vavgub */
+
+static vector unsigned char __attribute__((__always_inline__))
+vec_vavgub(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_altivec_vavgub(__a, __b);
+}
+
+/* vec_vavgsh */
+
+static vector short __attribute__((__always_inline__))
+vec_vavgsh(vector short __a, vector short __b) {
+  return __builtin_altivec_vavgsh(__a, __b);
+}
+
+/* vec_vavguh */
+
+static vector unsigned short __attribute__((__always_inline__))
+vec_vavguh(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_altivec_vavguh(__a, __b);
+}
+
+/* vec_vavgsw */
+
+static vector int __attribute__((__always_inline__))
+vec_vavgsw(vector int __a, vector int __b) {
+  return __builtin_altivec_vavgsw(__a, __b);
+}
+
+/* vec_vavguw */
+
+static vector unsigned int __attribute__((__always_inline__))
+vec_vavguw(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_altivec_vavguw(__a, __b);
+}
+
+/* vec_ceil */
+
+static vector float __ATTRS_o_ai vec_ceil(vector float __a) {
+#ifdef __VSX__
+  return __builtin_vsx_xvrspip(__a);
+#else
+  return __builtin_altivec_vrfip(__a);
+#endif
+}
+
+#ifdef __VSX__
+static vector double __ATTRS_o_ai vec_ceil(vector double __a) {
+  return __builtin_vsx_xvrdpip(__a);
+}
+#endif
+
+/* vec_vrfip */
+
+static vector float __attribute__((__always_inline__))
+vec_vrfip(vector float __a) {
+  return __builtin_altivec_vrfip(__a);
+}
+
+/* vec_cmpb */
+
+static vector int __attribute__((__always_inline__))
+vec_cmpb(vector float __a, vector float __b) {
+  return __builtin_altivec_vcmpbfp(__a, __b);
+}
+
+/* vec_vcmpbfp */
+
+static vector int __attribute__((__always_inline__))
+vec_vcmpbfp(vector float __a, vector float __b) {
+  return __builtin_altivec_vcmpbfp(__a, __b);
+}
+
+/* vec_cmpeq */
+
+static vector bool char __ATTRS_o_ai vec_cmpeq(vector signed char __a,
+                                               vector signed char __b) {
+  return (vector bool char)__builtin_altivec_vcmpequb((vector char)__a,
+                                                      (vector char)__b);
+}
+
+static vector bool char __ATTRS_o_ai vec_cmpeq(vector unsigned char __a,
+                                               vector unsigned char __b) {
+  return (vector bool char)__builtin_altivec_vcmpequb((vector char)__a,
+                                                      (vector char)__b);
+}
+
+static vector bool short __ATTRS_o_ai vec_cmpeq(vector short __a,
+                                                vector short __b) {
+  return (vector bool short)__builtin_altivec_vcmpequh(__a, __b);
+}
+
+static vector bool short __ATTRS_o_ai vec_cmpeq(vector unsigned short __a,
+                                                vector unsigned short __b) {
+  return (vector bool short)__builtin_altivec_vcmpequh((vector short)__a,
+                                                       (vector short)__b);
+}
+
+static vector bool int __ATTRS_o_ai vec_cmpeq(vector int __a, vector int __b) {
+  return (vector bool int)__builtin_altivec_vcmpequw(__a, __b);
+}
+
+static vector bool int __ATTRS_o_ai vec_cmpeq(vector unsigned int __a,
+                                              vector unsigned int __b) {
+  return (vector bool int)__builtin_altivec_vcmpequw((vector int)__a,
+                                                     (vector int)__b);
+}
+
+#ifdef __POWER8_VECTOR__
+static vector bool long long __ATTRS_o_ai
+vec_cmpeq(vector signed long long __a, vector signed long long __b) {
+  return (vector bool long long)__builtin_altivec_vcmpequd(__a, __b);
+}
+
+static vector bool long long __ATTRS_o_ai
+vec_cmpeq(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector bool long long)__builtin_altivec_vcmpequd(
+      (vector long long)__a, (vector long long)__b);
+}
+#endif
+
+static vector bool int __ATTRS_o_ai vec_cmpeq(vector float __a,
+                                              vector float __b) {
+#ifdef __VSX__
+  return (vector bool int)__builtin_vsx_xvcmpeqsp(__a, __b);
+#else
+  return (vector bool int)__builtin_altivec_vcmpeqfp(__a, __b);
+#endif
+}
+
+#ifdef __VSX__
+static vector bool long long __ATTRS_o_ai
+vec_cmpeq(vector double __a, vector double __b) {
+  return (vector bool long long)__builtin_vsx_xvcmpeqdp(__a, __b);
+}
+#endif
+
+/* vec_cmpgt */
+
+static vector bool char __ATTRS_o_ai vec_cmpgt(vector signed char __a,
+                                               vector signed char __b) {
+  return (vector bool char)__builtin_altivec_vcmpgtsb(__a, __b);
+}
+
+static vector bool char __ATTRS_o_ai vec_cmpgt(vector unsigned char __a,
+                                               vector unsigned char __b) {
+  return (vector bool char)__builtin_altivec_vcmpgtub(__a, __b);
+}
+
+static vector bool short __ATTRS_o_ai vec_cmpgt(vector short __a,
+                                                vector short __b) {
+  return (vector bool short)__builtin_altivec_vcmpgtsh(__a, __b);
+}
+
+static vector bool short __ATTRS_o_ai vec_cmpgt(vector unsigned short __a,
+                                                vector unsigned short __b) {
+  return (vector bool short)__builtin_altivec_vcmpgtuh(__a, __b);
+}
+
+static vector bool int __ATTRS_o_ai vec_cmpgt(vector int __a, vector int __b) {
+  return (vector bool int)__builtin_altivec_vcmpgtsw(__a, __b);
+}
+
+static vector bool int __ATTRS_o_ai vec_cmpgt(vector unsigned int __a,
+                                              vector unsigned int __b) {
+  return (vector bool int)__builtin_altivec_vcmpgtuw(__a, __b);
+}
+
+#ifdef __POWER8_VECTOR__
+static vector bool long long __ATTRS_o_ai
+vec_cmpgt(vector signed long long __a, vector signed long long __b) {
+  return (vector bool long long)__builtin_altivec_vcmpgtsd(__a, __b);
+}
+
+static vector bool long long __ATTRS_o_ai
+vec_cmpgt(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector bool long long)__builtin_altivec_vcmpgtud(__a, __b);
+}
+#endif
+
+static vector bool int __ATTRS_o_ai vec_cmpgt(vector float __a,
+                                              vector float __b) {
+#ifdef __VSX__
+  return (vector bool int)__builtin_vsx_xvcmpgtsp(__a, __b);
+#else
+  return (vector bool int)__builtin_altivec_vcmpgtfp(__a, __b);
+#endif
+}
+
+#ifdef __VSX__
+static vector bool long long __ATTRS_o_ai
+vec_cmpgt(vector double __a, vector double __b) {
+  return (vector bool long long)__builtin_vsx_xvcmpgtdp(__a, __b);
+}
+#endif
+
+/* vec_cmpge */
+
+static vector bool char __ATTRS_o_ai
+vec_cmpge(vector signed char __a, vector signed char __b) {
+  return ~(vec_cmpgt(__b, __a));
+}
+
+static vector bool char __ATTRS_o_ai
+vec_cmpge(vector unsigned char __a, vector unsigned char __b) {
+  return ~(vec_cmpgt(__b, __a));
+}
+
+static vector bool short __ATTRS_o_ai
+vec_cmpge(vector signed short __a, vector signed short __b) {
+  return ~(vec_cmpgt(__b, __a));
+}
+
+static vector bool short __ATTRS_o_ai
+vec_cmpge(vector unsigned short __a, vector unsigned short __b) {
+  return ~(vec_cmpgt(__b, __a));
+}
+
+static vector bool int __ATTRS_o_ai
+vec_cmpge(vector signed int __a, vector signed int __b) {
+  return ~(vec_cmpgt(__b, __a));
+}
+
+static vector bool int __ATTRS_o_ai
+vec_cmpge(vector unsigned int __a, vector unsigned int __b) {
+  return ~(vec_cmpgt(__b, __a));
+}
+
+static vector bool int __ATTRS_o_ai
+vec_cmpge(vector float __a, vector float __b) {
+#ifdef __VSX__
+  return (vector bool int)__builtin_vsx_xvcmpgesp(__a, __b);
+#else
+  return (vector bool int)__builtin_altivec_vcmpgefp(__a, __b);
+#endif
+}
+
+#ifdef __VSX__
+static vector bool long long __ATTRS_o_ai
+vec_cmpge(vector double __a, vector double __b) {
+  return (vector bool long long)__builtin_vsx_xvcmpgedp(__a, __b);
+}
+#endif
+
+#ifdef __POWER8_VECTOR__
+static vector bool long long __ATTRS_o_ai
+vec_cmpge(vector signed long long __a, vector signed long long __b) {
+  return ~(vec_cmpgt(__b, __a));
+}
+
+static vector bool long long __ATTRS_o_ai
+vec_cmpge(vector unsigned long long __a, vector unsigned long long __b) {
+  return ~(vec_cmpgt(__b, __a));
+}
+#endif
+
+/* vec_vcmpgefp */
+
+static vector bool int __attribute__((__always_inline__))
+vec_vcmpgefp(vector float __a, vector float __b) {
+  return (vector bool int)__builtin_altivec_vcmpgefp(__a, __b);
+}
+
+/* vec_vcmpgtsb */
+
+static vector bool char __attribute__((__always_inline__))
+vec_vcmpgtsb(vector signed char __a, vector signed char __b) {
+  return (vector bool char)__builtin_altivec_vcmpgtsb(__a, __b);
+}
+
+/* vec_vcmpgtub */
+
+static vector bool char __attribute__((__always_inline__))
+vec_vcmpgtub(vector unsigned char __a, vector unsigned char __b) {
+  return (vector bool char)__builtin_altivec_vcmpgtub(__a, __b);
+}
+
+/* vec_vcmpgtsh */
+
+static vector bool short __attribute__((__always_inline__))
+vec_vcmpgtsh(vector short __a, vector short __b) {
+  return (vector bool short)__builtin_altivec_vcmpgtsh(__a, __b);
+}
+
+/* vec_vcmpgtuh */
+
+static vector bool short __attribute__((__always_inline__))
+vec_vcmpgtuh(vector unsigned short __a, vector unsigned short __b) {
+  return (vector bool short)__builtin_altivec_vcmpgtuh(__a, __b);
+}
+
+/* vec_vcmpgtsw */
+
+static vector bool int __attribute__((__always_inline__))
+vec_vcmpgtsw(vector int __a, vector int __b) {
+  return (vector bool int)__builtin_altivec_vcmpgtsw(__a, __b);
+}
+
+/* vec_vcmpgtuw */
+
+static vector bool int __attribute__((__always_inline__))
+vec_vcmpgtuw(vector unsigned int __a, vector unsigned int __b) {
+  return (vector bool int)__builtin_altivec_vcmpgtuw(__a, __b);
+}
+
+/* vec_vcmpgtfp */
+
+static vector bool int __attribute__((__always_inline__))
+vec_vcmpgtfp(vector float __a, vector float __b) {
+  return (vector bool int)__builtin_altivec_vcmpgtfp(__a, __b);
+}
+
+/* vec_cmple */
+
+static vector bool char __ATTRS_o_ai
+vec_cmple(vector signed char __a, vector signed char __b) {
+  return vec_cmpge(__b, __a);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_cmple(vector unsigned char __a, vector unsigned char __b) {
+  return vec_cmpge(__b, __a);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_cmple(vector signed short __a, vector signed short __b) {
+  return vec_cmpge(__b, __a);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_cmple(vector unsigned short __a, vector unsigned short __b) {
+  return vec_cmpge(__b, __a);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_cmple(vector signed int __a, vector signed int __b) {
+  return vec_cmpge(__b, __a);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_cmple(vector unsigned int __a, vector unsigned int __b) {
+  return vec_cmpge(__b, __a);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_cmple(vector float __a, vector float __b) {
+  return vec_cmpge(__b, __a);
+}
+
+#ifdef __VSX__
+static vector bool long long __ATTRS_o_ai
+vec_cmple(vector double __a, vector double __b) {
+  return vec_cmpge(__b, __a);
+}
+#endif
+
+#ifdef __POWER8_VECTOR__
+static vector bool long long __ATTRS_o_ai
+vec_cmple(vector signed long long __a, vector signed long long __b) {
+  return vec_cmpge(__b, __a);
+}
+
+static vector bool long long __ATTRS_o_ai
+vec_cmple(vector unsigned long long __a, vector unsigned long long __b) {
+  return vec_cmpge(__b, __a);
+}
+#endif
+
+/* vec_cmplt */
+
+static vector bool char __ATTRS_o_ai vec_cmplt(vector signed char __a,
+                                               vector signed char __b) {
+  return vec_cmpgt(__b, __a);
+}
+
+static vector bool char __ATTRS_o_ai vec_cmplt(vector unsigned char __a,
+                                               vector unsigned char __b) {
+  return vec_cmpgt(__b, __a);
+}
+
+static vector bool short __ATTRS_o_ai vec_cmplt(vector short __a,
+                                                vector short __b) {
+  return vec_cmpgt(__b, __a);
+}
+
+static vector bool short __ATTRS_o_ai vec_cmplt(vector unsigned short __a,
+                                                vector unsigned short __b) {
+  return vec_cmpgt(__b, __a);
+}
+
+static vector bool int __ATTRS_o_ai vec_cmplt(vector int __a, vector int __b) {
+  return vec_cmpgt(__b, __a);
+}
+
+static vector bool int __ATTRS_o_ai vec_cmplt(vector unsigned int __a,
+                                              vector unsigned int __b) {
+  return vec_cmpgt(__b, __a);
+}
+
+static vector bool int __ATTRS_o_ai vec_cmplt(vector float __a,
+                                              vector float __b) {
+  return vec_cmpgt(__b, __a);
+}
+
+#ifdef __VSX__
+static vector bool long long __ATTRS_o_ai
+vec_cmplt(vector double __a, vector double __b) {
+  return vec_cmpgt(__b, __a);
+}
+#endif
+
+#ifdef __POWER8_VECTOR__
+static vector bool long long __ATTRS_o_ai
+vec_cmplt(vector signed long long __a, vector signed long long __b) {
+  return vec_cmpgt(__b, __a);
+}
+
+static vector bool long long __ATTRS_o_ai
+vec_cmplt(vector unsigned long long __a, vector unsigned long long __b) {
+  return vec_cmpgt(__b, __a);
+}
+
+/* vec_cntlz */
+
+static vector signed char __ATTRS_o_ai vec_cntlz(vector signed char __a) {
+  return __builtin_altivec_vclzb(__a);
+}
+static vector unsigned char __ATTRS_o_ai vec_cntlz(vector unsigned char __a) {
+  return __builtin_altivec_vclzb(__a);
+}
+static vector signed short __ATTRS_o_ai vec_cntlz(vector signed short __a) {
+  return __builtin_altivec_vclzh(__a);
+}
+static vector unsigned short __ATTRS_o_ai vec_cntlz(vector unsigned short __a) {
+  return __builtin_altivec_vclzh(__a);
+}
+static vector signed int __ATTRS_o_ai vec_cntlz(vector signed int __a) {
+  return __builtin_altivec_vclzw(__a);
+}
+static vector unsigned int __ATTRS_o_ai vec_cntlz(vector unsigned int __a) {
+  return __builtin_altivec_vclzw(__a);
+}
+static vector signed long long __ATTRS_o_ai
+vec_cntlz(vector signed long long __a) {
+  return __builtin_altivec_vclzd(__a);
+}
+static vector unsigned long long __ATTRS_o_ai
+vec_cntlz(vector unsigned long long __a) {
+  return __builtin_altivec_vclzd(__a);
+}
+#endif
+
+/* vec_cpsgn */
+
+#ifdef __VSX__
+static vector float __ATTRS_o_ai vec_cpsgn(vector float __a, vector float __b) {
+  return __builtin_vsx_xvcpsgnsp(__a, __b);
+}
+
+static vector double __ATTRS_o_ai vec_cpsgn(vector double __a,
+                                            vector double __b) {
+  return __builtin_vsx_xvcpsgndp(__a, __b);
+}
+#endif
+
+/* vec_ctf */
+
+static vector float __ATTRS_o_ai vec_ctf(vector int __a, int __b) {
+  return __builtin_altivec_vcfsx(__a, __b);
+}
+
+static vector float __ATTRS_o_ai vec_ctf(vector unsigned int __a, int __b) {
+  return __builtin_altivec_vcfux((vector int)__a, __b);
+}
+
+/* vec_vcfsx */
+
+static vector float __attribute__((__always_inline__))
+vec_vcfsx(vector int __a, int __b) {
+  return __builtin_altivec_vcfsx(__a, __b);
+}
+
+/* vec_vcfux */
+
+static vector float __attribute__((__always_inline__))
+vec_vcfux(vector unsigned int __a, int __b) {
+  return __builtin_altivec_vcfux((vector int)__a, __b);
+}
+
+/* vec_cts */
+
+static vector int __attribute__((__always_inline__))
+vec_cts(vector float __a, int __b) {
+  return __builtin_altivec_vctsxs(__a, __b);
+}
+
+/* vec_vctsxs */
+
+static vector int __attribute__((__always_inline__))
+vec_vctsxs(vector float __a, int __b) {
+  return __builtin_altivec_vctsxs(__a, __b);
+}
+
+/* vec_ctu */
+
+static vector unsigned int __attribute__((__always_inline__))
+vec_ctu(vector float __a, int __b) {
+  return __builtin_altivec_vctuxs(__a, __b);
+}
+
+/* vec_vctuxs */
+
+static vector unsigned int __attribute__((__always_inline__))
+vec_vctuxs(vector float __a, int __b) {
+  return __builtin_altivec_vctuxs(__a, __b);
+}
+
+/* vec_double */
+
+#ifdef __VSX__
+static vector double __ATTRS_o_ai vec_double(vector signed long long __a) {
+  vector double __ret = { __a[0], __a[1] };
+  return __ret;
+}
+
+static vector double __ATTRS_o_ai vec_double(vector unsigned long long __a) {
+  vector double __ret = { __a[0], __a[1] };
+  return __ret;
+}
+#endif
+
+/* vec_div */
+
+/* Integer vector divides: the vectors are scalarized, the elements divided,
+   and the vectors reassembled; an illustrative usage note follows these
+   definitions. */
+static vector signed char __ATTRS_o_ai vec_div(vector signed char __a,
+                                               vector signed char __b) {
+  return __a / __b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_div(vector unsigned char __a,
+                                                 vector unsigned char __b) {
+  return __a / __b;
+}
+
+static vector signed short __ATTRS_o_ai vec_div(vector signed short __a,
+                                                vector signed short __b) {
+  return __a / __b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_div(vector unsigned short __a,
+                                                  vector unsigned short __b) {
+  return __a / __b;
+}
+
+static vector signed int __ATTRS_o_ai vec_div(vector signed int __a,
+                                              vector signed int __b) {
+  return __a / __b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_div(vector unsigned int __a,
+                                                vector unsigned int __b) {
+  return __a / __b;
+}
+
+#ifdef __VSX__
+static vector signed long long __ATTRS_o_ai
+vec_div(vector signed long long __a, vector signed long long __b) {
+  return __a / __b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_div(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a / __b;
+}
+
+static vector float __ATTRS_o_ai vec_div(vector float __a, vector float __b) {
+  return __a / __b;
+}
+
+static vector double __ATTRS_o_ai vec_div(vector double __a,
+                                          vector double __b) {
+  return __a / __b;
+}
+#endif
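+
+/* Illustrative usage note (an editorial sketch, not part of the upstream
+ * header; the __num/__den/__quo names are made up for this example):
+ * assuming a build with AltiVec enabled, vec_div is applied directly to
+ * vector operands, e.g.
+ *
+ *   vector signed int __num = {8, 9, 10, 12};
+ *   vector signed int __den = {2, 3, 5, 4};
+ *   vector signed int __quo = vec_div(__num, __den);  // __quo == {4, 3, 2, 3}
+ *
+ * Without hardware vector integer division the operation is scalarized as
+ * described above; the VSX float and double overloads typically lower to the
+ * native element-wise divide instructions. */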
+
+/* vec_dss */
+
+static void __attribute__((__always_inline__)) vec_dss(int __a) {
+  __builtin_altivec_dss(__a);
+}
+
+/* vec_dssall */
+
+static void __attribute__((__always_inline__)) vec_dssall(void) {
+  __builtin_altivec_dssall();
+}
+
+/* vec_dst */
+
+static void __attribute__((__always_inline__))
+vec_dst(const void *__a, int __b, int __c) {
+  __builtin_altivec_dst(__a, __b, __c);
+}
+
+/* vec_dstst */
+
+static void __attribute__((__always_inline__))
+vec_dstst(const void *__a, int __b, int __c) {
+  __builtin_altivec_dstst(__a, __b, __c);
+}
+
+/* vec_dststt */
+
+static void __attribute__((__always_inline__))
+vec_dststt(const void *__a, int __b, int __c) {
+  __builtin_altivec_dststt(__a, __b, __c);
+}
+
+/* vec_dstt */
+
+static void __attribute__((__always_inline__))
+vec_dstt(const void *__a, int __b, int __c) {
+  __builtin_altivec_dstt(__a, __b, __c);
+}
+
+/* vec_eqv */
+
+#ifdef __POWER8_VECTOR__
+static vector signed char __ATTRS_o_ai vec_eqv(vector signed char __a,
+                                               vector signed char __b) {
+  return (vector signed char)__builtin_vsx_xxleqv((vector unsigned int)__a,
+                                                  (vector unsigned int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_eqv(vector unsigned char __a,
+                                                 vector unsigned char __b) {
+  return (vector unsigned char)__builtin_vsx_xxleqv((vector unsigned int)__a,
+                                                    (vector unsigned int)__b);
+}
+
+static vector bool char __ATTRS_o_ai vec_eqv(vector bool char __a,
+                                             vector bool char __b) {
+  return (vector bool char)__builtin_vsx_xxleqv((vector unsigned int)__a,
+                                                (vector unsigned int)__b);
+}
+
+static vector signed short __ATTRS_o_ai vec_eqv(vector signed short __a,
+                                                vector signed short __b) {
+  return (vector signed short)__builtin_vsx_xxleqv((vector unsigned int)__a,
+                                                   (vector unsigned int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_eqv(vector unsigned short __a,
+                                                  vector unsigned short __b) {
+  return (vector unsigned short)__builtin_vsx_xxleqv((vector unsigned int)__a,
+                                                     (vector unsigned int)__b);
+}
+
+static vector bool short __ATTRS_o_ai vec_eqv(vector bool short __a,
+                                              vector bool short __b) {
+  return (vector bool short)__builtin_vsx_xxleqv((vector unsigned int)__a,
+                                                 (vector unsigned int)__b);
+}
+
+static vector signed int __ATTRS_o_ai vec_eqv(vector signed int __a,
+                                              vector signed int __b) {
+  return (vector signed int)__builtin_vsx_xxleqv((vector unsigned int)__a,
+                                                 (vector unsigned int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_eqv(vector unsigned int __a,
+                                                vector unsigned int __b) {
+  return __builtin_vsx_xxleqv(__a, __b);
+}
+
+static vector bool int __ATTRS_o_ai vec_eqv(vector bool int __a,
+                                            vector bool int __b) {
+  return (vector bool int)__builtin_vsx_xxleqv((vector unsigned int)__a,
+                                               (vector unsigned int)__b);
+}
+
+static vector signed long long __ATTRS_o_ai
+vec_eqv(vector signed long long __a, vector signed long long __b) {
+  return (vector signed long long)
+    __builtin_vsx_xxleqv((vector unsigned int)__a, (vector unsigned int)__b);
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_eqv(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector unsigned long long)
+    __builtin_vsx_xxleqv((vector unsigned int)__a, (vector unsigned int)__b);
+}
+
+static vector bool long long __ATTRS_o_ai
+vec_eqv(vector bool long long __a, vector bool long long __b) {
+  return (vector bool long long)
+    __builtin_vsx_xxleqv((vector unsigned int)__a, (vector unsigned int)__b);
+}
+
+static vector float __ATTRS_o_ai vec_eqv(vector float __a, vector float __b) {
+  return (vector float)__builtin_vsx_xxleqv((vector unsigned int)__a,
+                                            (vector unsigned int)__b);
+}
+
+static vector double __ATTRS_o_ai vec_eqv(vector double __a,
+                                          vector double __b) {
+  return (vector double)__builtin_vsx_xxleqv((vector unsigned int)__a,
+                                             (vector unsigned int)__b);
+}
+#endif
+
+/* vec_expte */
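+/* vec_expte computes a per-element estimate of 2**__a (vexptefp). */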
+
+static vector float __attribute__((__always_inline__))
+vec_expte(vector float __a) {
+  return __builtin_altivec_vexptefp(__a);
+}
+
+/* vec_vexptefp */
+
+static vector float __attribute__((__always_inline__))
+vec_vexptefp(vector float __a) {
+  return __builtin_altivec_vexptefp(__a);
+}
+
+/* vec_floor */
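+/* vec_floor rounds each element toward negative infinity
+   (vrfim, or xvrspim/xvrdpim when VSX is available). */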
+
+static vector float __ATTRS_o_ai vec_floor(vector float __a) {
+#ifdef __VSX__
+  return __builtin_vsx_xvrspim(__a);
+#else
+  return __builtin_altivec_vrfim(__a);
+#endif
+}
+
+#ifdef __VSX__
+static vector double __ATTRS_o_ai vec_floor(vector double __a) {
+  return __builtin_vsx_xvrdpim(__a);
+}
+#endif
+
+/* vec_vrfim */
+
+static vector float __attribute__((__always_inline__))
+vec_vrfim(vector float __a) {
+  return __builtin_altivec_vrfim(__a);
+}
+
+/* vec_ld */
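+/* vec_ld performs a 16-byte aligned vector load via lvx: the effective
+   address is __b + __a with its low four bits ignored. */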
+
+static vector signed char __ATTRS_o_ai vec_ld(int __a,
+                                              const vector signed char *__b) {
+  return (vector signed char)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai vec_ld(int __a, const signed char *__b) {
+  return (vector signed char)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_ld(int __a, const vector unsigned char *__b) {
+  return (vector unsigned char)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_ld(int __a,
+                                                const unsigned char *__b) {
+  return (vector unsigned char)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector bool char __ATTRS_o_ai vec_ld(int __a,
+                                            const vector bool char *__b) {
+  return (vector bool char)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector short __ATTRS_o_ai vec_ld(int __a, const vector short *__b) {
+  return (vector short)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector short __ATTRS_o_ai vec_ld(int __a, const short *__b) {
+  return (vector short)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_ld(int __a, const vector unsigned short *__b) {
+  return (vector unsigned short)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_ld(int __a,
+                                                 const unsigned short *__b) {
+  return (vector unsigned short)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector bool short __ATTRS_o_ai vec_ld(int __a,
+                                             const vector bool short *__b) {
+  return (vector bool short)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector pixel __ATTRS_o_ai vec_ld(int __a, const vector pixel *__b) {
+  return (vector pixel)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector int __ATTRS_o_ai vec_ld(int __a, const vector int *__b) {
+  return (vector int)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector int __ATTRS_o_ai vec_ld(int __a, const int *__b) {
+  return (vector int)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_ld(int __a,
+                                               const vector unsigned int *__b) {
+  return (vector unsigned int)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_ld(int __a,
+                                               const unsigned int *__b) {
+  return (vector unsigned int)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector bool int __ATTRS_o_ai vec_ld(int __a,
+                                           const vector bool int *__b) {
+  return (vector bool int)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector float __ATTRS_o_ai vec_ld(int __a, const vector float *__b) {
+  return (vector float)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector float __ATTRS_o_ai vec_ld(int __a, const float *__b) {
+  return (vector float)__builtin_altivec_lvx(__a, __b);
+}
+
+/* vec_lvx */
+
+static vector signed char __ATTRS_o_ai vec_lvx(int __a,
+                                               const vector signed char *__b) {
+  return (vector signed char)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai vec_lvx(int __a,
+                                               const signed char *__b) {
+  return (vector signed char)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvx(int __a, const vector unsigned char *__b) {
+  return (vector unsigned char)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_lvx(int __a,
+                                                 const unsigned char *__b) {
+  return (vector unsigned char)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector bool char __ATTRS_o_ai vec_lvx(int __a,
+                                             const vector bool char *__b) {
+  return (vector bool char)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector short __ATTRS_o_ai vec_lvx(int __a, const vector short *__b) {
+  return (vector short)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector short __ATTRS_o_ai vec_lvx(int __a, const short *__b) {
+  return (vector short)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_lvx(int __a, const vector unsigned short *__b) {
+  return (vector unsigned short)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_lvx(int __a,
+                                                  const unsigned short *__b) {
+  return (vector unsigned short)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector bool short __ATTRS_o_ai vec_lvx(int __a,
+                                              const vector bool short *__b) {
+  return (vector bool short)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector pixel __ATTRS_o_ai vec_lvx(int __a, const vector pixel *__b) {
+  return (vector pixel)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector int __ATTRS_o_ai vec_lvx(int __a, const vector int *__b) {
+  return (vector int)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector int __ATTRS_o_ai vec_lvx(int __a, const int *__b) {
+  return (vector int)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_lvx(int __a, const vector unsigned int *__b) {
+  return (vector unsigned int)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_lvx(int __a,
+                                                const unsigned int *__b) {
+  return (vector unsigned int)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector bool int __ATTRS_o_ai vec_lvx(int __a,
+                                            const vector bool int *__b) {
+  return (vector bool int)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector float __ATTRS_o_ai vec_lvx(int __a, const vector float *__b) {
+  return (vector float)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector float __ATTRS_o_ai vec_lvx(int __a, const float *__b) {
+  return (vector float)__builtin_altivec_lvx(__a, __b);
+}
+
+/* vec_lde */
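+/* vec_lde loads a single element into the lane selected by the effective
+   address (lvebx/lvehx/lvewx); the remaining lanes are undefined. */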
+
+static vector signed char __ATTRS_o_ai vec_lde(int __a,
+                                               const signed char *__b) {
+  return (vector signed char)__builtin_altivec_lvebx(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_lde(int __a,
+                                                 const unsigned char *__b) {
+  return (vector unsigned char)__builtin_altivec_lvebx(__a, __b);
+}
+
+static vector short __ATTRS_o_ai vec_lde(int __a, const short *__b) {
+  return (vector short)__builtin_altivec_lvehx(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_lde(int __a,
+                                                  const unsigned short *__b) {
+  return (vector unsigned short)__builtin_altivec_lvehx(__a, __b);
+}
+
+static vector int __ATTRS_o_ai vec_lde(int __a, const int *__b) {
+  return (vector int)__builtin_altivec_lvewx(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_lde(int __a,
+                                                const unsigned int *__b) {
+  return (vector unsigned int)__builtin_altivec_lvewx(__a, __b);
+}
+
+static vector float __ATTRS_o_ai vec_lde(int __a, const float *__b) {
+  return (vector float)__builtin_altivec_lvewx(__a, __b);
+}
+
+/* vec_lvebx */
+
+static vector signed char __ATTRS_o_ai vec_lvebx(int __a,
+                                                 const signed char *__b) {
+  return (vector signed char)__builtin_altivec_lvebx(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_lvebx(int __a,
+                                                   const unsigned char *__b) {
+  return (vector unsigned char)__builtin_altivec_lvebx(__a, __b);
+}
+
+/* vec_lvehx */
+
+static vector short __ATTRS_o_ai vec_lvehx(int __a, const short *__b) {
+  return (vector short)__builtin_altivec_lvehx(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_lvehx(int __a,
+                                                    const unsigned short *__b) {
+  return (vector unsigned short)__builtin_altivec_lvehx(__a, __b);
+}
+
+/* vec_lvewx */
+
+static vector int __ATTRS_o_ai vec_lvewx(int __a, const int *__b) {
+  return (vector int)__builtin_altivec_lvewx(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_lvewx(int __a,
+                                                  const unsigned int *__b) {
+  return (vector unsigned int)__builtin_altivec_lvewx(__a, __b);
+}
+
+static vector float __ATTRS_o_ai vec_lvewx(int __a, const float *__b) {
+  return (vector float)__builtin_altivec_lvewx(__a, __b);
+}
+
+/* vec_ldl */
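+/* vec_ldl is vec_ld with a cache hint: lvxl marks the touched cache line as
+   least recently used, for data that will not be reused soon. */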
+
+static vector signed char __ATTRS_o_ai vec_ldl(int __a,
+                                               const vector signed char *__b) {
+  return (vector signed char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai vec_ldl(int __a,
+                                               const signed char *__b) {
+  return (vector signed char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_ldl(int __a, const vector unsigned char *__b) {
+  return (vector unsigned char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_ldl(int __a,
+                                                 const unsigned char *__b) {
+  return (vector unsigned char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector bool char __ATTRS_o_ai vec_ldl(int __a,
+                                             const vector bool char *__b) {
+  return (vector bool char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector short __ATTRS_o_ai vec_ldl(int __a, const vector short *__b) {
+  return (vector short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector short __ATTRS_o_ai vec_ldl(int __a, const short *__b) {
+  return (vector short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_ldl(int __a, const vector unsigned short *__b) {
+  return (vector unsigned short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_ldl(int __a,
+                                                  const unsigned short *__b) {
+  return (vector unsigned short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector bool short __ATTRS_o_ai vec_ldl(int __a,
+                                              const vector bool short *__b) {
+  return (vector bool short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector pixel __ATTRS_o_ai vec_ldl(int __a, const vector pixel *__b) {
+  return (vector pixel)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector int __ATTRS_o_ai vec_ldl(int __a, const vector int *__b) {
+  return (vector int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector int __ATTRS_o_ai vec_ldl(int __a, const int *__b) {
+  return (vector int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_ldl(int __a, const vector unsigned int *__b) {
+  return (vector unsigned int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_ldl(int __a,
+                                                const unsigned int *__b) {
+  return (vector unsigned int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector bool int __ATTRS_o_ai vec_ldl(int __a,
+                                            const vector bool int *__b) {
+  return (vector bool int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector float __ATTRS_o_ai vec_ldl(int __a, const vector float *__b) {
+  return (vector float)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector float __ATTRS_o_ai vec_ldl(int __a, const float *__b) {
+  return (vector float)__builtin_altivec_lvxl(__a, __b);
+}
+
+/* vec_lvxl */
+
+static vector signed char __ATTRS_o_ai vec_lvxl(int __a,
+                                                const vector signed char *__b) {
+  return (vector signed char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai vec_lvxl(int __a,
+                                                const signed char *__b) {
+  return (vector signed char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvxl(int __a, const vector unsigned char *__b) {
+  return (vector unsigned char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_lvxl(int __a,
+                                                  const unsigned char *__b) {
+  return (vector unsigned char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector bool char __ATTRS_o_ai vec_lvxl(int __a,
+                                              const vector bool char *__b) {
+  return (vector bool char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector short __ATTRS_o_ai vec_lvxl(int __a, const vector short *__b) {
+  return (vector short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector short __ATTRS_o_ai vec_lvxl(int __a, const short *__b) {
+  return (vector short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_lvxl(int __a, const vector unsigned short *__b) {
+  return (vector unsigned short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_lvxl(int __a,
+                                                   const unsigned short *__b) {
+  return (vector unsigned short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector bool short __ATTRS_o_ai vec_lvxl(int __a,
+                                               const vector bool short *__b) {
+  return (vector bool short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector pixel __ATTRS_o_ai vec_lvxl(int __a, const vector pixel *__b) {
+  return (vector pixel)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector int __ATTRS_o_ai vec_lvxl(int __a, const vector int *__b) {
+  return (vector int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector int __ATTRS_o_ai vec_lvxl(int __a, const int *__b) {
+  return (vector int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_lvxl(int __a, const vector unsigned int *__b) {
+  return (vector unsigned int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_lvxl(int __a,
+                                                 const unsigned int *__b) {
+  return (vector unsigned int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector bool int __ATTRS_o_ai vec_lvxl(int __a,
+                                             const vector bool int *__b) {
+  return (vector bool int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector float __ATTRS_o_ai vec_lvxl(int __a, const vector float *__b) {
+  return (vector float)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector float __ATTRS_o_ai vec_lvxl(int __a, const float *__b) {
+  return (vector float)__builtin_altivec_lvxl(__a, __b);
+}
+
+/* vec_loge */
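+/* vec_loge computes a per-element estimate of log2(__a) (vlogefp). */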
+
+static vector float __attribute__((__always_inline__))
+vec_loge(vector float __a) {
+  return __builtin_altivec_vlogefp(__a);
+}
+
+/* vec_vlogefp */
+
+static vector float __attribute__((__always_inline__))
+vec_vlogefp(vector float __a) {
+  return __builtin_altivec_vlogefp(__a);
+}
+
+/* vec_lvsl */
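+/* vec_lvsl/vec_lvsr produce the permute control vector used by the classic
+   lvx/vperm unaligned-access sequence.  On little-endian targets they are
+   deprecated in favor of plain unaligned loads/stores, and the mask is
+   byte-reversed so that existing big-endian sequences still work. */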
+
+#ifdef __LITTLE_ENDIAN__
+static vector unsigned char __ATTRS_o_ai
+    __attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores"))) vec_lvsl(int __a, const signed char *__b) {
+  vector unsigned char mask =
+      (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+  vector unsigned char reverse = {15, 14, 13, 12, 11, 10, 9, 8,
+                                  7,  6,  5,  4,  3,  2,  1, 0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static vector unsigned char __ATTRS_o_ai vec_lvsl(int __a,
+                                                  const signed char *__b) {
+  return (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+static vector unsigned char __ATTRS_o_ai
+    __attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores"))) vec_lvsl(int __a, const unsigned char *__b) {
+  vector unsigned char mask =
+      (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+  vector unsigned char reverse = {15, 14, 13, 12, 11, 10, 9, 8,
+                                  7,  6,  5,  4,  3,  2,  1, 0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static vector unsigned char __ATTRS_o_ai vec_lvsl(int __a,
+                                                  const unsigned char *__b) {
+  return (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+static vector unsigned char __ATTRS_o_ai
+    __attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores"))) vec_lvsl(int __a, const short *__b) {
+  vector unsigned char mask =
+      (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+  vector unsigned char reverse = {15, 14, 13, 12, 11, 10, 9, 8,
+                                  7,  6,  5,  4,  3,  2,  1, 0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static vector unsigned char __ATTRS_o_ai vec_lvsl(int __a, const short *__b) {
+  return (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+static vector unsigned char __ATTRS_o_ai
+    __attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores"))) vec_lvsl(int __a, const unsigned short *__b) {
+  vector unsigned char mask =
+      (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+  vector unsigned char reverse = {15, 14, 13, 12, 11, 10, 9, 8,
+                                  7,  6,  5,  4,  3,  2,  1, 0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static vector unsigned char __ATTRS_o_ai vec_lvsl(int __a,
+                                                  const unsigned short *__b) {
+  return (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+static vector unsigned char __ATTRS_o_ai
+    __attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores"))) vec_lvsl(int __a, const int *__b) {
+  vector unsigned char mask =
+      (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+  vector unsigned char reverse = {15, 14, 13, 12, 11, 10, 9, 8,
+                                  7,  6,  5,  4,  3,  2,  1, 0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static vector unsigned char __ATTRS_o_ai vec_lvsl(int __a, const int *__b) {
+  return (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+static vector unsigned char __ATTRS_o_ai
+    __attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores"))) vec_lvsl(int __a, const unsigned int *__b) {
+  vector unsigned char mask =
+      (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+  vector unsigned char reverse = {15, 14, 13, 12, 11, 10, 9, 8,
+                                  7,  6,  5,  4,  3,  2,  1, 0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static vector unsigned char __ATTRS_o_ai vec_lvsl(int __a,
+                                                  const unsigned int *__b) {
+  return (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+static vector unsigned char __ATTRS_o_ai
+    __attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores"))) vec_lvsl(int __a, const float *__b) {
+  vector unsigned char mask =
+      (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+  vector unsigned char reverse = {15, 14, 13, 12, 11, 10, 9, 8,
+                                  7,  6,  5,  4,  3,  2,  1, 0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static vector unsigned char __ATTRS_o_ai vec_lvsl(int __a, const float *__b) {
+  return (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+}
+#endif
+
+/* vec_lvsr */
+
+#ifdef __LITTLE_ENDIAN__
+static vector unsigned char __ATTRS_o_ai
+    __attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores"))) vec_lvsr(int __a, const signed char *__b) {
+  vector unsigned char mask =
+      (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+  vector unsigned char reverse = {15, 14, 13, 12, 11, 10, 9, 8,
+                                  7,  6,  5,  4,  3,  2,  1, 0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static vector unsigned char __ATTRS_o_ai vec_lvsr(int __a,
+                                                  const signed char *__b) {
+  return (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+static vector unsigned char __ATTRS_o_ai
+    __attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores"))) vec_lvsr(int __a, const unsigned char *__b) {
+  vector unsigned char mask =
+      (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+  vector unsigned char reverse = {15, 14, 13, 12, 11, 10, 9, 8,
+                                  7,  6,  5,  4,  3,  2,  1, 0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static vector unsigned char __ATTRS_o_ai vec_lvsr(int __a,
+                                                  const unsigned char *__b) {
+  return (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+static vector unsigned char __ATTRS_o_ai
+    __attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores"))) vec_lvsr(int __a, const short *__b) {
+  vector unsigned char mask =
+      (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+  vector unsigned char reverse = {15, 14, 13, 12, 11, 10, 9, 8,
+                                  7,  6,  5,  4,  3,  2,  1, 0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static vector unsigned char __ATTRS_o_ai vec_lvsr(int __a, const short *__b) {
+  return (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+static vector unsigned char __ATTRS_o_ai
+    __attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores"))) vec_lvsr(int __a, const unsigned short *__b) {
+  vector unsigned char mask =
+      (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+  vector unsigned char reverse = {15, 14, 13, 12, 11, 10, 9, 8,
+                                  7,  6,  5,  4,  3,  2,  1, 0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static vector unsigned char __ATTRS_o_ai vec_lvsr(int __a,
+                                                  const unsigned short *__b) {
+  return (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+static vector unsigned char __ATTRS_o_ai
+    __attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores"))) vec_lvsr(int __a, const int *__b) {
+  vector unsigned char mask =
+      (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+  vector unsigned char reverse = {15, 14, 13, 12, 11, 10, 9, 8,
+                                  7,  6,  5,  4,  3,  2,  1, 0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static vector unsigned char __ATTRS_o_ai vec_lvsr(int __a, const int *__b) {
+  return (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+static vector unsigned char __ATTRS_o_ai
+    __attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores"))) vec_lvsr(int __a, const unsigned int *__b) {
+  vector unsigned char mask =
+      (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+  vector unsigned char reverse = {15, 14, 13, 12, 11, 10, 9, 8,
+                                  7,  6,  5,  4,  3,  2,  1, 0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static vector unsigned char __ATTRS_o_ai vec_lvsr(int __a,
+                                                  const unsigned int *__b) {
+  return (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+static vector unsigned char __ATTRS_o_ai
+    __attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores"))) vec_lvsr(int __a, const float *__b) {
+  vector unsigned char mask =
+      (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+  vector unsigned char reverse = {15, 14, 13, 12, 11, 10, 9, 8,
+                                  7,  6,  5,  4,  3,  2,  1, 0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static vector unsigned char __ATTRS_o_ai vec_lvsr(int __a, const float *__b) {
+  return (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+}
+#endif
+
+/* vec_madd */
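+/* The integer vec_madd overloads forward to vec_mladd (forward declared
+   here, defined later in this header); the float/double overloads map to
+   fused multiply-add (vmaddfp, or xvmadda[sd]p when VSX is available). */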
+static vector signed short __ATTRS_o_ai
+vec_mladd(vector signed short, vector signed short, vector signed short);
+static vector signed short __ATTRS_o_ai
+vec_mladd(vector signed short, vector unsigned short, vector unsigned short);
+static vector signed short __ATTRS_o_ai
+vec_mladd(vector unsigned short, vector signed short, vector signed short);
+static vector unsigned short __ATTRS_o_ai
+vec_mladd(vector unsigned short, vector unsigned short, vector unsigned short);
+
+static vector signed short __ATTRS_o_ai
+vec_madd(vector signed short __a, vector signed short __b,
+         vector signed short __c) {
+  return vec_mladd(__a, __b, __c);
+}
+
+static vector signed short __ATTRS_o_ai
+vec_madd(vector signed short __a, vector unsigned short __b,
+         vector unsigned short __c) {
+  return vec_mladd(__a, __b, __c);
+}
+
+static vector signed short __ATTRS_o_ai
+vec_madd(vector unsigned short __a, vector signed short __b,
+         vector signed short __c) {
+  return vec_mladd(__a, __b, __c);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_madd(vector unsigned short __a, vector unsigned short __b,
+         vector unsigned short __c) {
+  return vec_mladd(__a, __b, __c);
+}
+
+static vector float __ATTRS_o_ai
+vec_madd(vector float __a, vector float __b, vector float __c) {
+#ifdef __VSX__
+  return __builtin_vsx_xvmaddasp(__a, __b, __c);
+#else
+  return __builtin_altivec_vmaddfp(__a, __b, __c);
+#endif
+}
+
+#ifdef __VSX__
+static vector double __ATTRS_o_ai
+vec_madd(vector double __a, vector double __b, vector double __c) {
+  return __builtin_vsx_xvmaddadp(__a, __b, __c);
+}
+#endif
+
+/* vec_vmaddfp */
+
+static vector float __attribute__((__always_inline__))
+vec_vmaddfp(vector float __a, vector float __b, vector float __c) {
+  return __builtin_altivec_vmaddfp(__a, __b, __c);
+}
+
+/* vec_madds */
+
+static vector signed short __attribute__((__always_inline__))
+vec_madds(vector signed short __a, vector signed short __b,
+          vector signed short __c) {
+  return __builtin_altivec_vmhaddshs(__a, __b, __c);
+}
+
+/* vec_vmhaddshs */
+static vector signed short __attribute__((__always_inline__))
+vec_vmhaddshs(vector signed short __a, vector signed short __b,
+              vector signed short __c) {
+  return __builtin_altivec_vmhaddshs(__a, __b, __c);
+}
+
+/* vec_msub */
+
+#ifdef __VSX__
+static vector float __ATTRS_o_ai
+vec_msub(vector float __a, vector float __b, vector float __c) {
+  return __builtin_vsx_xvmsubasp(__a, __b, __c);
+}
+
+static vector double __ATTRS_o_ai
+vec_msub(vector double __a, vector double __b, vector double __c) {
+  return __builtin_vsx_xvmsubadp(__a, __b, __c);
+}
+#endif
+
+/* vec_max */
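+/* vec_max returns the element-wise maximum.  Mixed bool/element overloads
+   cast the bool operand to the element type, and the 64-bit integer
+   variants require POWER8 vector support. */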
+
+static vector signed char __ATTRS_o_ai vec_max(vector signed char __a,
+                                               vector signed char __b) {
+  return __builtin_altivec_vmaxsb(__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai vec_max(vector bool char __a,
+                                               vector signed char __b) {
+  return __builtin_altivec_vmaxsb((vector signed char)__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai vec_max(vector signed char __a,
+                                               vector bool char __b) {
+  return __builtin_altivec_vmaxsb(__a, (vector signed char)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_max(vector unsigned char __a,
+                                                 vector unsigned char __b) {
+  return __builtin_altivec_vmaxub(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_max(vector bool char __a,
+                                                 vector unsigned char __b) {
+  return __builtin_altivec_vmaxub((vector unsigned char)__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_max(vector unsigned char __a,
+                                                 vector bool char __b) {
+  return __builtin_altivec_vmaxub(__a, (vector unsigned char)__b);
+}
+
+static vector short __ATTRS_o_ai vec_max(vector short __a, vector short __b) {
+  return __builtin_altivec_vmaxsh(__a, __b);
+}
+
+static vector short __ATTRS_o_ai vec_max(vector bool short __a,
+                                         vector short __b) {
+  return __builtin_altivec_vmaxsh((vector short)__a, __b);
+}
+
+static vector short __ATTRS_o_ai vec_max(vector short __a,
+                                         vector bool short __b) {
+  return __builtin_altivec_vmaxsh(__a, (vector short)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_max(vector unsigned short __a,
+                                                  vector unsigned short __b) {
+  return __builtin_altivec_vmaxuh(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_max(vector bool short __a,
+                                                  vector unsigned short __b) {
+  return __builtin_altivec_vmaxuh((vector unsigned short)__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_max(vector unsigned short __a,
+                                                  vector bool short __b) {
+  return __builtin_altivec_vmaxuh(__a, (vector unsigned short)__b);
+}
+
+static vector int __ATTRS_o_ai vec_max(vector int __a, vector int __b) {
+  return __builtin_altivec_vmaxsw(__a, __b);
+}
+
+static vector int __ATTRS_o_ai vec_max(vector bool int __a, vector int __b) {
+  return __builtin_altivec_vmaxsw((vector int)__a, __b);
+}
+
+static vector int __ATTRS_o_ai vec_max(vector int __a, vector bool int __b) {
+  return __builtin_altivec_vmaxsw(__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_max(vector unsigned int __a,
+                                                vector unsigned int __b) {
+  return __builtin_altivec_vmaxuw(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_max(vector bool int __a,
+                                                vector unsigned int __b) {
+  return __builtin_altivec_vmaxuw((vector unsigned int)__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_max(vector unsigned int __a,
+                                                vector bool int __b) {
+  return __builtin_altivec_vmaxuw(__a, (vector unsigned int)__b);
+}
+
+#ifdef __POWER8_VECTOR__
+static vector signed long long __ATTRS_o_ai
+vec_max(vector signed long long __a, vector signed long long __b) {
+  return __builtin_altivec_vmaxsd(__a, __b);
+}
+
+static vector signed long long __ATTRS_o_ai
+vec_max(vector bool long long __a, vector signed long long __b) {
+  return __builtin_altivec_vmaxsd((vector signed long long)__a, __b);
+}
+
+static vector signed long long __ATTRS_o_ai vec_max(vector signed long long __a,
+                                                    vector bool long long __b) {
+  return __builtin_altivec_vmaxsd(__a, (vector signed long long)__b);
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_max(vector unsigned long long __a, vector unsigned long long __b) {
+  return __builtin_altivec_vmaxud(__a, __b);
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_max(vector bool long long __a, vector unsigned long long __b) {
+  return __builtin_altivec_vmaxud((vector unsigned long long)__a, __b);
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_max(vector unsigned long long __a, vector bool long long __b) {
+  return __builtin_altivec_vmaxud(__a, (vector unsigned long long)__b);
+}
+#endif
+
+static vector float __ATTRS_o_ai vec_max(vector float __a, vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvmaxsp(__a, __b);
+#else
+  return __builtin_altivec_vmaxfp(__a, __b);
+#endif
+}
+
+#ifdef __VSX__
+static vector double __ATTRS_o_ai vec_max(vector double __a,
+                                          vector double __b) {
+  return __builtin_vsx_xvmaxdp(__a, __b);
+}
+#endif
+
+/* vec_vmaxsb */
+
+static vector signed char __ATTRS_o_ai vec_vmaxsb(vector signed char __a,
+                                                  vector signed char __b) {
+  return __builtin_altivec_vmaxsb(__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai vec_vmaxsb(vector bool char __a,
+                                                  vector signed char __b) {
+  return __builtin_altivec_vmaxsb((vector signed char)__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai vec_vmaxsb(vector signed char __a,
+                                                  vector bool char __b) {
+  return __builtin_altivec_vmaxsb(__a, (vector signed char)__b);
+}
+
+/* vec_vmaxub */
+
+static vector unsigned char __ATTRS_o_ai vec_vmaxub(vector unsigned char __a,
+                                                    vector unsigned char __b) {
+  return __builtin_altivec_vmaxub(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vmaxub(vector bool char __a,
+                                                    vector unsigned char __b) {
+  return __builtin_altivec_vmaxub((vector unsigned char)__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vmaxub(vector unsigned char __a,
+                                                    vector bool char __b) {
+  return __builtin_altivec_vmaxub(__a, (vector unsigned char)__b);
+}
+
+/* vec_vmaxsh */
+
+static vector short __ATTRS_o_ai vec_vmaxsh(vector short __a,
+                                            vector short __b) {
+  return __builtin_altivec_vmaxsh(__a, __b);
+}
+
+static vector short __ATTRS_o_ai vec_vmaxsh(vector bool short __a,
+                                            vector short __b) {
+  return __builtin_altivec_vmaxsh((vector short)__a, __b);
+}
+
+static vector short __ATTRS_o_ai vec_vmaxsh(vector short __a,
+                                            vector bool short __b) {
+  return __builtin_altivec_vmaxsh(__a, (vector short)__b);
+}
+
+/* vec_vmaxuh */
+
+static vector unsigned short __ATTRS_o_ai
+vec_vmaxuh(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_altivec_vmaxuh(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vmaxuh(vector bool short __a, vector unsigned short __b) {
+  return __builtin_altivec_vmaxuh((vector unsigned short)__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vmaxuh(vector unsigned short __a,
+                                                     vector bool short __b) {
+  return __builtin_altivec_vmaxuh(__a, (vector unsigned short)__b);
+}
+
+/* vec_vmaxsw */
+
+static vector int __ATTRS_o_ai vec_vmaxsw(vector int __a, vector int __b) {
+  return __builtin_altivec_vmaxsw(__a, __b);
+}
+
+static vector int __ATTRS_o_ai vec_vmaxsw(vector bool int __a, vector int __b) {
+  return __builtin_altivec_vmaxsw((vector int)__a, __b);
+}
+
+static vector int __ATTRS_o_ai vec_vmaxsw(vector int __a, vector bool int __b) {
+  return __builtin_altivec_vmaxsw(__a, (vector int)__b);
+}
+
+/* vec_vmaxuw */
+
+static vector unsigned int __ATTRS_o_ai vec_vmaxuw(vector unsigned int __a,
+                                                   vector unsigned int __b) {
+  return __builtin_altivec_vmaxuw(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vmaxuw(vector bool int __a,
+                                                   vector unsigned int __b) {
+  return __builtin_altivec_vmaxuw((vector unsigned int)__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vmaxuw(vector unsigned int __a,
+                                                   vector bool int __b) {
+  return __builtin_altivec_vmaxuw(__a, (vector unsigned int)__b);
+}
+
+/* vec_vmaxfp */
+
+static vector float __attribute__((__always_inline__))
+vec_vmaxfp(vector float __a, vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvmaxsp(__a, __b);
+#else
+  return __builtin_altivec_vmaxfp(__a, __b);
+#endif
+}
+
+/* vec_mergeh */
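+/* vec_mergeh interleaves the elements of the first halves of __a and __b.
+   It is expressed as vec_perm with a constant control vector: byte indices
+   0x00-0x0F select from __a and 0x10-0x1F select from __b. */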
+
+static vector signed char __ATTRS_o_ai vec_mergeh(vector signed char __a,
+                                                  vector signed char __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x10, 0x01, 0x11, 0x02, 0x12,
+                                         0x03, 0x13, 0x04, 0x14, 0x05, 0x15,
+                                         0x06, 0x16, 0x07, 0x17));
+}
+
+static vector unsigned char __ATTRS_o_ai vec_mergeh(vector unsigned char __a,
+                                                    vector unsigned char __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x10, 0x01, 0x11, 0x02, 0x12,
+                                         0x03, 0x13, 0x04, 0x14, 0x05, 0x15,
+                                         0x06, 0x16, 0x07, 0x17));
+}
+
+static vector bool char __ATTRS_o_ai vec_mergeh(vector bool char __a,
+                                                vector bool char __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x10, 0x01, 0x11, 0x02, 0x12,
+                                         0x03, 0x13, 0x04, 0x14, 0x05, 0x15,
+                                         0x06, 0x16, 0x07, 0x17));
+}
+
+static vector short __ATTRS_o_ai vec_mergeh(vector short __a,
+                                            vector short __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x10, 0x11, 0x02, 0x03,
+                                         0x12, 0x13, 0x04, 0x05, 0x14, 0x15,
+                                         0x06, 0x07, 0x16, 0x17));
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_mergeh(vector unsigned short __a, vector unsigned short __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x10, 0x11, 0x02, 0x03,
+                                         0x12, 0x13, 0x04, 0x05, 0x14, 0x15,
+                                         0x06, 0x07, 0x16, 0x17));
+}
+
+static vector bool short __ATTRS_o_ai vec_mergeh(vector bool short __a,
+                                                 vector bool short __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x10, 0x11, 0x02, 0x03,
+                                         0x12, 0x13, 0x04, 0x05, 0x14, 0x15,
+                                         0x06, 0x07, 0x16, 0x17));
+}
+
+static vector pixel __ATTRS_o_ai vec_mergeh(vector pixel __a,
+                                            vector pixel __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x10, 0x11, 0x02, 0x03,
+                                         0x12, 0x13, 0x04, 0x05, 0x14, 0x15,
+                                         0x06, 0x07, 0x16, 0x17));
+}
+
+static vector int __ATTRS_o_ai vec_mergeh(vector int __a, vector int __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x10, 0x11,
+                                         0x12, 0x13, 0x04, 0x05, 0x06, 0x07,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+
+static vector unsigned int __ATTRS_o_ai vec_mergeh(vector unsigned int __a,
+                                                   vector unsigned int __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x10, 0x11,
+                                         0x12, 0x13, 0x04, 0x05, 0x06, 0x07,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+
+static vector bool int __ATTRS_o_ai vec_mergeh(vector bool int __a,
+                                               vector bool int __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x10, 0x11,
+                                         0x12, 0x13, 0x04, 0x05, 0x06, 0x07,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+
+static vector float __ATTRS_o_ai vec_mergeh(vector float __a,
+                                            vector float __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x10, 0x11,
+                                         0x12, 0x13, 0x04, 0x05, 0x06, 0x07,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+
+#ifdef __VSX__
+static vector signed long long __ATTRS_o_ai
+vec_mergeh(vector signed long long __a, vector signed long long __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03,
+                                         0x04, 0x05, 0x06, 0x07,
+                                         0x10, 0x11, 0x12, 0x13,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+
+static vector signed long long __ATTRS_o_ai
+vec_mergeh(vector signed long long __a, vector bool long long __b) {
+  return vec_perm(__a, (vector signed long long)__b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03,
+                                         0x04, 0x05, 0x06, 0x07,
+                                         0x10, 0x11, 0x12, 0x13,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+
+static vector signed long long __ATTRS_o_ai
+vec_mergeh(vector bool long long __a, vector signed long long __b) {
+  return vec_perm((vector signed long long)__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03,
+                                         0x04, 0x05, 0x06, 0x07,
+                                         0x10, 0x11, 0x12, 0x13,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_mergeh(vector unsigned long long __a, vector unsigned long long __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03,
+                                         0x04, 0x05, 0x06, 0x07,
+                                         0x10, 0x11, 0x12, 0x13,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_mergeh(vector unsigned long long __a, vector bool long long __b) {
+  return vec_perm(__a, (vector unsigned long long)__b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03,
+                                         0x04, 0x05, 0x06, 0x07,
+                                         0x10, 0x11, 0x12, 0x13,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_mergeh(vector bool long long __a, vector unsigned long long __b) {
+  return vec_perm((vector unsigned long long)__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03,
+                                         0x04, 0x05, 0x06, 0x07,
+                                         0x10, 0x11, 0x12, 0x13,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+
+static vector bool long long __ATTRS_o_ai
+vec_mergeh(vector bool long long __a, vector bool long long __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03,
+                                         0x04, 0x05, 0x06, 0x07,
+                                         0x10, 0x11, 0x12, 0x13,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+
+static vector double __ATTRS_o_ai vec_mergeh(vector double __a,
+                                             vector double __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03,
+                                         0x04, 0x05, 0x06, 0x07,
+                                         0x10, 0x11, 0x12, 0x13,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+static vector double __ATTRS_o_ai vec_mergeh(vector double __a,
+                                             vector bool long long __b) {
+  return vec_perm(__a, (vector double)__b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03,
+                                         0x04, 0x05, 0x06, 0x07,
+                                         0x10, 0x11, 0x12, 0x13,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+static vector double __ATTRS_o_ai vec_mergeh(vector bool long long __a,
+                                             vector double __b) {
+  return vec_perm((vector double)__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03,
+                                         0x04, 0x05, 0x06, 0x07,
+                                         0x10, 0x11, 0x12, 0x13,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+#endif
+
+/* vec_vmrghb */
+
+#define __builtin_altivec_vmrghb vec_vmrghb
+
+static vector signed char __ATTRS_o_ai vec_vmrghb(vector signed char __a,
+                                                  vector signed char __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x10, 0x01, 0x11, 0x02, 0x12,
+                                         0x03, 0x13, 0x04, 0x14, 0x05, 0x15,
+                                         0x06, 0x16, 0x07, 0x17));
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vmrghb(vector unsigned char __a,
+                                                    vector unsigned char __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x10, 0x01, 0x11, 0x02, 0x12,
+                                         0x03, 0x13, 0x04, 0x14, 0x05, 0x15,
+                                         0x06, 0x16, 0x07, 0x17));
+}
+
+static vector bool char __ATTRS_o_ai vec_vmrghb(vector bool char __a,
+                                                vector bool char __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x10, 0x01, 0x11, 0x02, 0x12,
+                                         0x03, 0x13, 0x04, 0x14, 0x05, 0x15,
+                                         0x06, 0x16, 0x07, 0x17));
+}
+
+/* vec_vmrghh */
+
+#define __builtin_altivec_vmrghh vec_vmrghh
+
+static vector short __ATTRS_o_ai vec_vmrghh(vector short __a,
+                                            vector short __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x10, 0x11, 0x02, 0x03,
+                                         0x12, 0x13, 0x04, 0x05, 0x14, 0x15,
+                                         0x06, 0x07, 0x16, 0x17));
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vmrghh(vector unsigned short __a, vector unsigned short __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x10, 0x11, 0x02, 0x03,
+                                         0x12, 0x13, 0x04, 0x05, 0x14, 0x15,
+                                         0x06, 0x07, 0x16, 0x17));
+}
+
+static vector bool short __ATTRS_o_ai vec_vmrghh(vector bool short __a,
+                                                 vector bool short __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x10, 0x11, 0x02, 0x03,
+                                         0x12, 0x13, 0x04, 0x05, 0x14, 0x15,
+                                         0x06, 0x07, 0x16, 0x17));
+}
+
+static vector pixel __ATTRS_o_ai vec_vmrghh(vector pixel __a,
+                                            vector pixel __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x10, 0x11, 0x02, 0x03,
+                                         0x12, 0x13, 0x04, 0x05, 0x14, 0x15,
+                                         0x06, 0x07, 0x16, 0x17));
+}
+
+/* vec_vmrghw */
+
+#define __builtin_altivec_vmrghw vec_vmrghw
+
+static vector int __ATTRS_o_ai vec_vmrghw(vector int __a, vector int __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x10, 0x11,
+                                         0x12, 0x13, 0x04, 0x05, 0x06, 0x07,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vmrghw(vector unsigned int __a,
+                                                   vector unsigned int __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x10, 0x11,
+                                         0x12, 0x13, 0x04, 0x05, 0x06, 0x07,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+
+static vector bool int __ATTRS_o_ai vec_vmrghw(vector bool int __a,
+                                               vector bool int __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x10, 0x11,
+                                         0x12, 0x13, 0x04, 0x05, 0x06, 0x07,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+
+static vector float __ATTRS_o_ai vec_vmrghw(vector float __a,
+                                            vector float __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x10, 0x11,
+                                         0x12, 0x13, 0x04, 0x05, 0x06, 0x07,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+
+/* vec_mergel */
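+/* vec_mergel is the counterpart of vec_mergeh: it interleaves the elements
+   of the second halves of __a and __b (byte indices 0x08-0x0F and
+   0x18-0x1F in the control vector). */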
+
+static vector signed char __ATTRS_o_ai vec_mergel(vector signed char __a,
+                                                  vector signed char __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x18, 0x09, 0x19, 0x0A, 0x1A,
+                                         0x0B, 0x1B, 0x0C, 0x1C, 0x0D, 0x1D,
+                                         0x0E, 0x1E, 0x0F, 0x1F));
+}
+
+static vector unsigned char __ATTRS_o_ai vec_mergel(vector unsigned char __a,
+                                                    vector unsigned char __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x18, 0x09, 0x19, 0x0A, 0x1A,
+                                         0x0B, 0x1B, 0x0C, 0x1C, 0x0D, 0x1D,
+                                         0x0E, 0x1E, 0x0F, 0x1F));
+}
+
+static vector bool char __ATTRS_o_ai vec_mergel(vector bool char __a,
+                                                vector bool char __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x18, 0x09, 0x19, 0x0A, 0x1A,
+                                         0x0B, 0x1B, 0x0C, 0x1C, 0x0D, 0x1D,
+                                         0x0E, 0x1E, 0x0F, 0x1F));
+}
+
+static vector short __ATTRS_o_ai vec_mergel(vector short __a,
+                                            vector short __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x18, 0x19, 0x0A, 0x0B,
+                                         0x1A, 0x1B, 0x0C, 0x0D, 0x1C, 0x1D,
+                                         0x0E, 0x0F, 0x1E, 0x1F));
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_mergel(vector unsigned short __a, vector unsigned short __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x18, 0x19, 0x0A, 0x0B,
+                                         0x1A, 0x1B, 0x0C, 0x0D, 0x1C, 0x1D,
+                                         0x0E, 0x0F, 0x1E, 0x1F));
+}
+
+static vector bool short __ATTRS_o_ai vec_mergel(vector bool short __a,
+                                                 vector bool short __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x18, 0x19, 0x0A, 0x0B,
+                                         0x1A, 0x1B, 0x0C, 0x0D, 0x1C, 0x1D,
+                                         0x0E, 0x0F, 0x1E, 0x1F));
+}
+
+static vector pixel __ATTRS_o_ai vec_mergel(vector pixel __a,
+                                            vector pixel __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x18, 0x19, 0x0A, 0x0B,
+                                         0x1A, 0x1B, 0x0C, 0x0D, 0x1C, 0x1D,
+                                         0x0E, 0x0F, 0x1E, 0x1F));
+}
+
+static vector int __ATTRS_o_ai vec_mergel(vector int __a, vector int __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19,
+                                         0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+static vector unsigned int __ATTRS_o_ai vec_mergel(vector unsigned int __a,
+                                                   vector unsigned int __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19,
+                                         0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+static vector bool int __ATTRS_o_ai vec_mergel(vector bool int __a,
+                                               vector bool int __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19,
+                                         0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+static vector float __ATTRS_o_ai vec_mergel(vector float __a,
+                                            vector float __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19,
+                                         0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+#ifdef __VSX__
+static vector signed long long __ATTRS_o_ai
+vec_mergel(vector signed long long __a, vector signed long long __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B,
+                                         0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x18, 0x19, 0x1A, 0x1B,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+static vector signed long long __ATTRS_o_ai
+vec_mergel(vector signed long long __a, vector bool long long __b) {
+  return vec_perm(__a, (vector signed long long)__b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B,
+                                         0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x18, 0x19, 0x1A, 0x1B,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+static vector signed long long __ATTRS_o_ai
+vec_mergel(vector bool long long __a, vector signed long long __b) {
+  return vec_perm((vector signed long long)__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B,
+                                         0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x18, 0x19, 0x1A, 0x1B,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+static vector unsigned long long __ATTRS_o_ai
+vec_mergel(vector unsigned long long __a, vector unsigned long long __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B,
+                                         0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x18, 0x19, 0x1A, 0x1B,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+static vector unsigned long long __ATTRS_o_ai
+vec_mergel(vector unsigned long long __a, vector bool long long __b) {
+  return vec_perm(__a, (vector unsigned long long)__b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B,
+                                         0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x18, 0x19, 0x1A, 0x1B,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+static vector unsigned long long __ATTRS_o_ai
+vec_mergel(vector bool long long __a, vector unsigned long long __b) {
+  return vec_perm((vector unsigned long long)__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B,
+                                         0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x18, 0x19, 0x1A, 0x1B,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+static vector bool long long __ATTRS_o_ai
+vec_mergel(vector bool long long __a, vector bool long long __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B,
+                                         0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x18, 0x19, 0x1A, 0x1B,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+static vector double __ATTRS_o_ai
+vec_mergel(vector double __a, vector double __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B,
+                                         0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x18, 0x19, 0x1A, 0x1B,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+static vector double __ATTRS_o_ai
+vec_mergel(vector double __a, vector bool long long __b) {
+  return vec_perm(__a, (vector double)__b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B,
+                                         0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x18, 0x19, 0x1A, 0x1B,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+static vector double __ATTRS_o_ai
+vec_mergel(vector bool long long __a, vector double __b) {
+  return vec_perm((vector double)__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B,
+                                         0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x18, 0x19, 0x1A, 0x1B,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+#endif
+
+/* vec_vmrglb */
+
+#define __builtin_altivec_vmrglb vec_vmrglb
+
+static vector signed char __ATTRS_o_ai vec_vmrglb(vector signed char __a,
+                                                  vector signed char __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x18, 0x09, 0x19, 0x0A, 0x1A,
+                                         0x0B, 0x1B, 0x0C, 0x1C, 0x0D, 0x1D,
+                                         0x0E, 0x1E, 0x0F, 0x1F));
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vmrglb(vector unsigned char __a,
+                                                    vector unsigned char __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x18, 0x09, 0x19, 0x0A, 0x1A,
+                                         0x0B, 0x1B, 0x0C, 0x1C, 0x0D, 0x1D,
+                                         0x0E, 0x1E, 0x0F, 0x1F));
+}
+
+static vector bool char __ATTRS_o_ai vec_vmrglb(vector bool char __a,
+                                                vector bool char __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x18, 0x09, 0x19, 0x0A, 0x1A,
+                                         0x0B, 0x1B, 0x0C, 0x1C, 0x0D, 0x1D,
+                                         0x0E, 0x1E, 0x0F, 0x1F));
+}
+
+/* vec_vmrglh */
+
+#define __builtin_altivec_vmrglh vec_vmrglh
+
+static vector short __ATTRS_o_ai vec_vmrglh(vector short __a,
+                                            vector short __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x18, 0x19, 0x0A, 0x0B,
+                                         0x1A, 0x1B, 0x0C, 0x0D, 0x1C, 0x1D,
+                                         0x0E, 0x0F, 0x1E, 0x1F));
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vmrglh(vector unsigned short __a, vector unsigned short __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x18, 0x19, 0x0A, 0x0B,
+                                         0x1A, 0x1B, 0x0C, 0x0D, 0x1C, 0x1D,
+                                         0x0E, 0x0F, 0x1E, 0x1F));
+}
+
+static vector bool short __ATTRS_o_ai vec_vmrglh(vector bool short __a,
+                                                 vector bool short __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x18, 0x19, 0x0A, 0x0B,
+                                         0x1A, 0x1B, 0x0C, 0x0D, 0x1C, 0x1D,
+                                         0x0E, 0x0F, 0x1E, 0x1F));
+}
+
+static vector pixel __ATTRS_o_ai vec_vmrglh(vector pixel __a,
+                                            vector pixel __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x18, 0x19, 0x0A, 0x0B,
+                                         0x1A, 0x1B, 0x0C, 0x0D, 0x1C, 0x1D,
+                                         0x0E, 0x0F, 0x1E, 0x1F));
+}
+
+/* vec_vmrglw */
+
+#define __builtin_altivec_vmrglw vec_vmrglw
+
+static vector int __ATTRS_o_ai vec_vmrglw(vector int __a, vector int __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19,
+                                         0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vmrglw(vector unsigned int __a,
+                                                   vector unsigned int __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19,
+                                         0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+static vector bool int __ATTRS_o_ai vec_vmrglw(vector bool int __a,
+                                               vector bool int __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19,
+                                         0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+static vector float __ATTRS_o_ai vec_vmrglw(vector float __a,
+                                            vector float __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19,
+                                         0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+#ifdef __POWER8_VECTOR__
+/* vec_mergee */
+
+static vector bool int __ATTRS_o_ai
+vec_mergee(vector bool int __a, vector bool int __b) {
+  return vec_perm(__a, __b, (vector unsigned char)
+                  (0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+                   0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B));
+}
+
+static vector signed int __ATTRS_o_ai
+vec_mergee(vector signed int __a, vector signed int __b) {
+  return vec_perm(__a, __b, (vector unsigned char)
+                  (0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+                   0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B));
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_mergee(vector unsigned int __a, vector unsigned int __b) {
+  return vec_perm(__a, __b, (vector unsigned char)
+                  (0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+                   0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B));
+}
+
+/* vec_mergeo */
+
+static vector bool int __ATTRS_o_ai
+vec_mergeo(vector bool int __a, vector bool int __b) {
+  return vec_perm(__a, __b, (vector unsigned char)
+                  (0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17,
+                   0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+static vector signed int __ATTRS_o_ai
+vec_mergeo(vector signed int __a, vector signed int __b) {
+  return vec_perm(__a, __b, (vector unsigned char)
+                  (0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17,
+                   0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_mergeo(vector unsigned int __a, vector unsigned int __b) {
+  return vec_perm(__a, __b, (vector unsigned char)
+                  (0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17,
+                   0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+#endif
+
+/* vec_mfvscr */
+
+static vector unsigned short __attribute__((__always_inline__))
+vec_mfvscr(void) {
+  return __builtin_altivec_mfvscr();
+}
+
+/* vec_min */
+
+static vector signed char __ATTRS_o_ai vec_min(vector signed char __a,
+                                               vector signed char __b) {
+  return __builtin_altivec_vminsb(__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai vec_min(vector bool char __a,
+                                               vector signed char __b) {
+  return __builtin_altivec_vminsb((vector signed char)__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai vec_min(vector signed char __a,
+                                               vector bool char __b) {
+  return __builtin_altivec_vminsb(__a, (vector signed char)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_min(vector unsigned char __a,
+                                                 vector unsigned char __b) {
+  return __builtin_altivec_vminub(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_min(vector bool char __a,
+                                                 vector unsigned char __b) {
+  return __builtin_altivec_vminub((vector unsigned char)__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_min(vector unsigned char __a,
+                                                 vector bool char __b) {
+  return __builtin_altivec_vminub(__a, (vector unsigned char)__b);
+}
+
+static vector short __ATTRS_o_ai vec_min(vector short __a, vector short __b) {
+  return __builtin_altivec_vminsh(__a, __b);
+}
+
+static vector short __ATTRS_o_ai vec_min(vector bool short __a,
+                                         vector short __b) {
+  return __builtin_altivec_vminsh((vector short)__a, __b);
+}
+
+static vector short __ATTRS_o_ai vec_min(vector short __a,
+                                         vector bool short __b) {
+  return __builtin_altivec_vminsh(__a, (vector short)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_min(vector unsigned short __a,
+                                                  vector unsigned short __b) {
+  return __builtin_altivec_vminuh(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_min(vector bool short __a,
+                                                  vector unsigned short __b) {
+  return __builtin_altivec_vminuh((vector unsigned short)__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_min(vector unsigned short __a,
+                                                  vector bool short __b) {
+  return __builtin_altivec_vminuh(__a, (vector unsigned short)__b);
+}
+
+static vector int __ATTRS_o_ai vec_min(vector int __a, vector int __b) {
+  return __builtin_altivec_vminsw(__a, __b);
+}
+
+static vector int __ATTRS_o_ai vec_min(vector bool int __a, vector int __b) {
+  return __builtin_altivec_vminsw((vector int)__a, __b);
+}
+
+static vector int __ATTRS_o_ai vec_min(vector int __a, vector bool int __b) {
+  return __builtin_altivec_vminsw(__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_min(vector unsigned int __a,
+                                                vector unsigned int __b) {
+  return __builtin_altivec_vminuw(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_min(vector bool int __a,
+                                                vector unsigned int __b) {
+  return __builtin_altivec_vminuw((vector unsigned int)__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_min(vector unsigned int __a,
+                                                vector bool int __b) {
+  return __builtin_altivec_vminuw(__a, (vector unsigned int)__b);
+}
+
+#ifdef __POWER8_VECTOR__
+static vector signed long long __ATTRS_o_ai
+vec_min(vector signed long long __a, vector signed long long __b) {
+  return __builtin_altivec_vminsd(__a, __b);
+}
+
+static vector signed long long __ATTRS_o_ai
+vec_min(vector bool long long __a, vector signed long long __b) {
+  return __builtin_altivec_vminsd((vector signed long long)__a, __b);
+}
+
+static vector signed long long __ATTRS_o_ai vec_min(vector signed long long __a,
+                                                    vector bool long long __b) {
+  return __builtin_altivec_vminsd(__a, (vector signed long long)__b);
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_min(vector unsigned long long __a, vector unsigned long long __b) {
+  return __builtin_altivec_vminud(__a, __b);
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_min(vector bool long long __a, vector unsigned long long __b) {
+  return __builtin_altivec_vminud((vector unsigned long long)__a, __b);
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_min(vector unsigned long long __a, vector bool long long __b) {
+  return __builtin_altivec_vminud(__a, (vector unsigned long long)__b);
+}
+#endif
+
+static vector float __ATTRS_o_ai vec_min(vector float __a, vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvminsp(__a, __b);
+#else
+  return __builtin_altivec_vminfp(__a, __b);
+#endif
+}
+
+#ifdef __VSX__
+static vector double __ATTRS_o_ai vec_min(vector double __a,
+                                          vector double __b) {
+  return __builtin_vsx_xvmindp(__a, __b);
+}
+#endif
+
+/* vec_vminsb */
+
+static vector signed char __ATTRS_o_ai vec_vminsb(vector signed char __a,
+                                                  vector signed char __b) {
+  return __builtin_altivec_vminsb(__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai vec_vminsb(vector bool char __a,
+                                                  vector signed char __b) {
+  return __builtin_altivec_vminsb((vector signed char)__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai vec_vminsb(vector signed char __a,
+                                                  vector bool char __b) {
+  return __builtin_altivec_vminsb(__a, (vector signed char)__b);
+}
+
+/* vec_vminub */
+
+static vector unsigned char __ATTRS_o_ai vec_vminub(vector unsigned char __a,
+                                                    vector unsigned char __b) {
+  return __builtin_altivec_vminub(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vminub(vector bool char __a,
+                                                    vector unsigned char __b) {
+  return __builtin_altivec_vminub((vector unsigned char)__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vminub(vector unsigned char __a,
+                                                    vector bool char __b) {
+  return __builtin_altivec_vminub(__a, (vector unsigned char)__b);
+}
+
+/* vec_vminsh */
+
+static vector short __ATTRS_o_ai vec_vminsh(vector short __a,
+                                            vector short __b) {
+  return __builtin_altivec_vminsh(__a, __b);
+}
+
+static vector short __ATTRS_o_ai vec_vminsh(vector bool short __a,
+                                            vector short __b) {
+  return __builtin_altivec_vminsh((vector short)__a, __b);
+}
+
+static vector short __ATTRS_o_ai vec_vminsh(vector short __a,
+                                            vector bool short __b) {
+  return __builtin_altivec_vminsh(__a, (vector short)__b);
+}
+
+/* vec_vminuh */
+
+static vector unsigned short __ATTRS_o_ai
+vec_vminuh(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_altivec_vminuh(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vminuh(vector bool short __a, vector unsigned short __b) {
+  return __builtin_altivec_vminuh((vector unsigned short)__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vminuh(vector unsigned short __a,
+                                                     vector bool short __b) {
+  return __builtin_altivec_vminuh(__a, (vector unsigned short)__b);
+}
+
+/* vec_vminsw */
+
+static vector int __ATTRS_o_ai vec_vminsw(vector int __a, vector int __b) {
+  return __builtin_altivec_vminsw(__a, __b);
+}
+
+static vector int __ATTRS_o_ai vec_vminsw(vector bool int __a, vector int __b) {
+  return __builtin_altivec_vminsw((vector int)__a, __b);
+}
+
+static vector int __ATTRS_o_ai vec_vminsw(vector int __a, vector bool int __b) {
+  return __builtin_altivec_vminsw(__a, (vector int)__b);
+}
+
+/* vec_vminuw */
+
+static vector unsigned int __ATTRS_o_ai vec_vminuw(vector unsigned int __a,
+                                                   vector unsigned int __b) {
+  return __builtin_altivec_vminuw(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vminuw(vector bool int __a,
+                                                   vector unsigned int __b) {
+  return __builtin_altivec_vminuw((vector unsigned int)__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vminuw(vector unsigned int __a,
+                                                   vector bool int __b) {
+  return __builtin_altivec_vminuw(__a, (vector unsigned int)__b);
+}
+
+/* vec_vminfp */
+
+static vector float __attribute__((__always_inline__))
+vec_vminfp(vector float __a, vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvminsp(__a, __b);
+#else
+  return __builtin_altivec_vminfp(__a, __b);
+#endif
+}
+
+/* vec_mladd */
+
+#define __builtin_altivec_vmladduhm vec_mladd
+
+static vector short __ATTRS_o_ai vec_mladd(vector short __a, vector short __b,
+                                           vector short __c) {
+  return __a * __b + __c;
+}
+
+static vector short __ATTRS_o_ai vec_mladd(vector short __a,
+                                           vector unsigned short __b,
+                                           vector unsigned short __c) {
+  return __a * (vector short)__b + (vector short)__c;
+}
+
+static vector short __ATTRS_o_ai vec_mladd(vector unsigned short __a,
+                                           vector short __b, vector short __c) {
+  return (vector short)__a * __b + __c;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_mladd(vector unsigned short __a,
+                                                    vector unsigned short __b,
+                                                    vector unsigned short __c) {
+  return __a * __b + __c;
+}
+
+/* vec_vmladduhm */
+
+static vector short __ATTRS_o_ai vec_vmladduhm(vector short __a,
+                                               vector short __b,
+                                               vector short __c) {
+  return __a * __b + __c;
+}
+
+static vector short __ATTRS_o_ai vec_vmladduhm(vector short __a,
+                                               vector unsigned short __b,
+                                               vector unsigned short __c) {
+  return __a * (vector short)__b + (vector short)__c;
+}
+
+static vector short __ATTRS_o_ai vec_vmladduhm(vector unsigned short __a,
+                                               vector short __b,
+                                               vector short __c) {
+  return (vector short)__a * __b + __c;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vmladduhm(vector unsigned short __a, vector unsigned short __b,
+              vector unsigned short __c) {
+  return __a * __b + __c;
+}
+
+/* vec_mradds */
+
+static vector short __attribute__((__always_inline__))
+vec_mradds(vector short __a, vector short __b, vector short __c) {
+  return __builtin_altivec_vmhraddshs(__a, __b, __c);
+}
+
+/* vec_vmhraddshs */
+
+static vector short __attribute__((__always_inline__))
+vec_vmhraddshs(vector short __a, vector short __b, vector short __c) {
+  return __builtin_altivec_vmhraddshs(__a, __b, __c);
+}
+
+/* vec_msum */
+
+static vector int __ATTRS_o_ai vec_msum(vector signed char __a,
+                                        vector unsigned char __b,
+                                        vector int __c) {
+  return __builtin_altivec_vmsummbm(__a, __b, __c);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_msum(vector unsigned char __a,
+                                                 vector unsigned char __b,
+                                                 vector unsigned int __c) {
+  return __builtin_altivec_vmsumubm(__a, __b, __c);
+}
+
+static vector int __ATTRS_o_ai vec_msum(vector short __a, vector short __b,
+                                        vector int __c) {
+  return __builtin_altivec_vmsumshm(__a, __b, __c);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_msum(vector unsigned short __a,
+                                                 vector unsigned short __b,
+                                                 vector unsigned int __c) {
+  return __builtin_altivec_vmsumuhm(__a, __b, __c);
+}
+
+/* vec_vmsummbm */
+
+static vector int __attribute__((__always_inline__))
+vec_vmsummbm(vector signed char __a, vector unsigned char __b, vector int __c) {
+  return __builtin_altivec_vmsummbm(__a, __b, __c);
+}
+
+/* vec_vmsumubm */
+
+static vector unsigned int __attribute__((__always_inline__))
+vec_vmsumubm(vector unsigned char __a, vector unsigned char __b,
+             vector unsigned int __c) {
+  return __builtin_altivec_vmsumubm(__a, __b, __c);
+}
+
+/* vec_vmsumshm */
+
+static vector int __attribute__((__always_inline__))
+vec_vmsumshm(vector short __a, vector short __b, vector int __c) {
+  return __builtin_altivec_vmsumshm(__a, __b, __c);
+}
+
+/* vec_vmsumuhm */
+
+static vector unsigned int __attribute__((__always_inline__))
+vec_vmsumuhm(vector unsigned short __a, vector unsigned short __b,
+             vector unsigned int __c) {
+  return __builtin_altivec_vmsumuhm(__a, __b, __c);
+}
+
+/* vec_msums */
+
+static vector int __ATTRS_o_ai vec_msums(vector short __a, vector short __b,
+                                         vector int __c) {
+  return __builtin_altivec_vmsumshs(__a, __b, __c);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_msums(vector unsigned short __a,
+                                                  vector unsigned short __b,
+                                                  vector unsigned int __c) {
+  return __builtin_altivec_vmsumuhs(__a, __b, __c);
+}
+
+/* vec_vmsumshs */
+
+static vector int __attribute__((__always_inline__))
+vec_vmsumshs(vector short __a, vector short __b, vector int __c) {
+  return __builtin_altivec_vmsumshs(__a, __b, __c);
+}
+
+/* vec_vmsumuhs */
+
+static vector unsigned int __attribute__((__always_inline__))
+vec_vmsumuhs(vector unsigned short __a, vector unsigned short __b,
+             vector unsigned int __c) {
+  return __builtin_altivec_vmsumuhs(__a, __b, __c);
+}
+
+/* vec_mtvscr */
+
+static void __ATTRS_o_ai vec_mtvscr(vector signed char __a) {
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static void __ATTRS_o_ai vec_mtvscr(vector unsigned char __a) {
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static void __ATTRS_o_ai vec_mtvscr(vector bool char __a) {
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static void __ATTRS_o_ai vec_mtvscr(vector short __a) {
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static void __ATTRS_o_ai vec_mtvscr(vector unsigned short __a) {
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static void __ATTRS_o_ai vec_mtvscr(vector bool short __a) {
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static void __ATTRS_o_ai vec_mtvscr(vector pixel __a) {
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static void __ATTRS_o_ai vec_mtvscr(vector int __a) {
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static void __ATTRS_o_ai vec_mtvscr(vector unsigned int __a) {
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static void __ATTRS_o_ai vec_mtvscr(vector bool int __a) {
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static void __ATTRS_o_ai vec_mtvscr(vector float __a) {
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+/* vec_mul */
+
+/* Integer vector multiplication is lowered as separate multiplication of the
+   odd and even elements, with each widened product truncated and the results
+   merged back into the result vector.
+*/
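+/* Editorial sketch, not part of the upstream header: the overloads below are
+   written with the elementwise * operator of the GCC/Clang vector extensions,
+   which the compiler lowers as described above; each lane keeps only the low
+   bits of its product, e.g.
+
+     vector signed short x = { 300, 2, -3, 4, 5, 6, 7, 8 };
+     vector signed short y = { 300, 10, 10, 1, 1, 1, 1, 1 };
+     vector signed short p = vec_mul(x, y);
+     // p[0] == (short)(300 * 300) == (short)90000 == 24464
+
+   90000 does not fit in 16 bits, so only its low 16 bits survive.
+*/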
+static vector signed char __ATTRS_o_ai vec_mul(vector signed char __a,
+                                               vector signed char __b) {
+  return __a * __b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_mul(vector unsigned char __a,
+                                                 vector unsigned char __b) {
+  return __a * __b;
+}
+
+static vector signed short __ATTRS_o_ai vec_mul(vector signed short __a,
+                                                vector signed short __b) {
+  return __a * __b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_mul(vector unsigned short __a,
+                                                  vector unsigned short __b) {
+  return __a * __b;
+}
+
+static vector signed int __ATTRS_o_ai vec_mul(vector signed int __a,
+                                              vector signed int __b) {
+  return __a * __b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_mul(vector unsigned int __a,
+                                                vector unsigned int __b) {
+  return __a * __b;
+}
+
+#ifdef __VSX__
+static vector signed long long __ATTRS_o_ai
+vec_mul(vector signed long long __a, vector signed long long __b) {
+  return __a * __b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_mul(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a * __b;
+}
+#endif
+
+static vector float __ATTRS_o_ai vec_mul(vector float __a, vector float __b) {
+  return __a * __b;
+}
+
+#ifdef __VSX__
+static vector double __ATTRS_o_ai
+vec_mul(vector double __a, vector double __b) {
+  return __a * __b;
+}
+#endif
+
+/* The vmulos* and vmules* instructions have a big-endian bias, so
+   we must reverse the meaning of "even" and "odd" for little endian.  */
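+
+/* Editorial illustration, not from the upstream header: with that swap the
+   C-level element numbering stays consistent on either endianness, e.g.
+
+     vector signed char a = { 1, 2, 3, 4, 5, 6, 7, 8,
+                              9, 10, 11, 12, 13, 14, 15, 16 };
+     vector signed short e = vec_mule(a, a);  // products of a[0], a[2], ...
+     vector signed short o = vec_mulo(a, a);  // products of a[1], a[3], ...
+
+   e[0] == 1 and o[0] == 4 on both targets, because the little-endian path
+   picks the opposite builtin (vmulosb instead of vmulesb) to match the
+   reversed hardware lane numbering.  */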
+
+/* vec_mule */
+
+static vector short __ATTRS_o_ai vec_mule(vector signed char __a,
+                                          vector signed char __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulosb(__a, __b);
+#else
+  return __builtin_altivec_vmulesb(__a, __b);
+#endif
+}
+
+static vector unsigned short __ATTRS_o_ai vec_mule(vector unsigned char __a,
+                                                   vector unsigned char __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmuloub(__a, __b);
+#else
+  return __builtin_altivec_vmuleub(__a, __b);
+#endif
+}
+
+static vector int __ATTRS_o_ai vec_mule(vector short __a, vector short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulosh(__a, __b);
+#else
+  return __builtin_altivec_vmulesh(__a, __b);
+#endif
+}
+
+static vector unsigned int __ATTRS_o_ai vec_mule(vector unsigned short __a,
+                                                 vector unsigned short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulouh(__a, __b);
+#else
+  return __builtin_altivec_vmuleuh(__a, __b);
+#endif
+}
+
+#ifdef __POWER8_VECTOR__
+static vector signed long long __ATTRS_o_ai vec_mule(vector signed int __a,
+                                                     vector signed int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulosw(__a, __b);
+#else
+  return __builtin_altivec_vmulesw(__a, __b);
+#endif
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_mule(vector unsigned int __a, vector unsigned int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulouw(__a, __b);
+#else
+  return __builtin_altivec_vmuleuw(__a, __b);
+#endif
+}
+#endif
+
+/* vec_vmulesb */
+
+static vector short __attribute__((__always_inline__))
+vec_vmulesb(vector signed char __a, vector signed char __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulosb(__a, __b);
+#else
+  return __builtin_altivec_vmulesb(__a, __b);
+#endif
+}
+
+/* vec_vmuleub */
+
+static vector unsigned short __attribute__((__always_inline__))
+vec_vmuleub(vector unsigned char __a, vector unsigned char __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmuloub(__a, __b);
+#else
+  return __builtin_altivec_vmuleub(__a, __b);
+#endif
+}
+
+/* vec_vmulesh */
+
+static vector int __attribute__((__always_inline__))
+vec_vmulesh(vector short __a, vector short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulosh(__a, __b);
+#else
+  return __builtin_altivec_vmulesh(__a, __b);
+#endif
+}
+
+/* vec_vmuleuh */
+
+static vector unsigned int __attribute__((__always_inline__))
+vec_vmuleuh(vector unsigned short __a, vector unsigned short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulouh(__a, __b);
+#else
+  return __builtin_altivec_vmuleuh(__a, __b);
+#endif
+}
+
+/* vec_mulo */
+
+static vector short __ATTRS_o_ai vec_mulo(vector signed char __a,
+                                          vector signed char __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulesb(__a, __b);
+#else
+  return __builtin_altivec_vmulosb(__a, __b);
+#endif
+}
+
+static vector unsigned short __ATTRS_o_ai vec_mulo(vector unsigned char __a,
+                                                   vector unsigned char __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmuleub(__a, __b);
+#else
+  return __builtin_altivec_vmuloub(__a, __b);
+#endif
+}
+
+static vector int __ATTRS_o_ai vec_mulo(vector short __a, vector short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulesh(__a, __b);
+#else
+  return __builtin_altivec_vmulosh(__a, __b);
+#endif
+}
+
+static vector unsigned int __ATTRS_o_ai vec_mulo(vector unsigned short __a,
+                                                 vector unsigned short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmuleuh(__a, __b);
+#else
+  return __builtin_altivec_vmulouh(__a, __b);
+#endif
+}
+
+#ifdef __POWER8_VECTOR__
+static vector signed long long __ATTRS_o_ai vec_mulo(vector signed int __a,
+                                                     vector signed int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulesw(__a, __b);
+#else
+  return __builtin_altivec_vmulosw(__a, __b);
+#endif
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_mulo(vector unsigned int __a, vector unsigned int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmuleuw(__a, __b);
+#else
+  return __builtin_altivec_vmulouw(__a, __b);
+#endif
+}
+#endif
+
+/* vec_vmulosb */
+
+static vector short __attribute__((__always_inline__))
+vec_vmulosb(vector signed char __a, vector signed char __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulesb(__a, __b);
+#else
+  return __builtin_altivec_vmulosb(__a, __b);
+#endif
+}
+
+/* vec_vmuloub */
+
+static vector unsigned short __attribute__((__always_inline__))
+vec_vmuloub(vector unsigned char __a, vector unsigned char __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmuleub(__a, __b);
+#else
+  return __builtin_altivec_vmuloub(__a, __b);
+#endif
+}
+
+/* vec_vmulosh */
+
+static vector int __attribute__((__always_inline__))
+vec_vmulosh(vector short __a, vector short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulesh(__a, __b);
+#else
+  return __builtin_altivec_vmulosh(__a, __b);
+#endif
+}
+
+/* vec_vmulouh */
+
+static vector unsigned int __attribute__((__always_inline__))
+vec_vmulouh(vector unsigned short __a, vector unsigned short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmuleuh(__a, __b);
+#else
+  return __builtin_altivec_vmulouh(__a, __b);
+#endif
+}
+
+/* vec_nand */
+
+#ifdef __POWER8_VECTOR__
+static vector signed char __ATTRS_o_ai vec_nand(vector signed char __a,
+                                                vector signed char __b) {
+  return ~(__a & __b);
+}
+
+static vector signed char __ATTRS_o_ai vec_nand(vector signed char __a,
+                                                vector bool char __b) {
+  return ~(__a & __b);
+}
+
+static vector signed char __ATTRS_o_ai vec_nand(vector bool char __a,
+                                                vector signed char __b) {
+  return ~(__a & __b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_nand(vector unsigned char __a,
+                                                  vector unsigned char __b) {
+  return ~(__a & __b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_nand(vector unsigned char __a,
+                                                  vector bool char __b) {
+  return ~(__a & __b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_nand(vector bool char __a,
+                                                  vector unsigned char __b) {
+  return ~(__a & __b);
+}
+
+static vector bool char __ATTRS_o_ai vec_nand(vector bool char __a,
+                                              vector bool char __b) {
+  return ~(__a & __b);
+}
+
+static vector signed short __ATTRS_o_ai vec_nand(vector signed short __a,
+                                                 vector signed short __b) {
+  return ~(__a & __b);
+}
+
+static vector signed short __ATTRS_o_ai vec_nand(vector signed short __a,
+                                                 vector bool short __b) {
+  return ~(__a & __b);
+}
+
+static vector signed short __ATTRS_o_ai vec_nand(vector bool short __a,
+                                                 vector signed short __b) {
+  return ~(__a & __b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_nand(vector unsigned short __a,
+                                                   vector unsigned short __b) {
+  return ~(__a & __b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_nand(vector unsigned short __a,
+                                                   vector bool short __b) {
+  return ~(__a & __b);
+}
+
+static vector bool short __ATTRS_o_ai vec_nand(vector bool short __a,
+                                               vector bool short __b) {
+  return ~(__a & __b);
+}
+
+static vector signed int __ATTRS_o_ai vec_nand(vector signed int __a,
+                                               vector signed int __b) {
+  return ~(__a & __b);
+}
+
+static vector signed int __ATTRS_o_ai vec_nand(vector signed int __a,
+                                               vector bool int __b) {
+  return ~(__a & __b);
+}
+
+static vector signed int __ATTRS_o_ai vec_nand(vector bool int __a,
+                                               vector signed int __b) {
+  return ~(__a & __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_nand(vector unsigned int __a,
+                                                 vector unsigned int __b) {
+  return ~(__a & __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_nand(vector unsigned int __a,
+                                                 vector bool int __b) {
+  return ~(__a & __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_nand(vector bool int __a,
+                                                 vector unsigned int __b) {
+  return ~(__a & __b);
+}
+
+static vector bool int __ATTRS_o_ai vec_nand(vector bool int __a,
+                                             vector bool int __b) {
+  return ~(__a & __b);
+}
+
+static vector signed long long __ATTRS_o_ai
+vec_nand(vector signed long long __a, vector signed long long __b) {
+  return ~(__a & __b);
+}
+
+static vector signed long long __ATTRS_o_ai
+vec_nand(vector signed long long __a, vector bool long long __b) {
+  return ~(__a & __b);
+}
+
+static vector signed long long __ATTRS_o_ai
+vec_nand(vector bool long long __a, vector signed long long __b) {
+  return ~(__a & __b);
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_nand(vector unsigned long long __a, vector unsigned long long __b) {
+  return ~(__a & __b);
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_nand(vector unsigned long long __a, vector bool long long __b) {
+  return ~(__a & __b);
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_nand(vector bool long long __a, vector unsigned long long __b) {
+  return ~(__a & __b);
+}
+
+static vector bool long long __ATTRS_o_ai
+vec_nand(vector bool long long __a, vector bool long long __b) {
+  return ~(__a & __b);
+}
+
+#endif
+
+/* vec_nmadd */
+
+#ifdef __VSX__
+static vector float __ATTRS_o_ai
+vec_nmadd(vector float __a, vector float __b, vector float __c) {
+  return __builtin_vsx_xvnmaddasp(__a, __b, __c);
+}
+
+static vector double __ATTRS_o_ai
+vec_nmadd(vector double __a, vector double __b, vector double __c) {
+  return __builtin_vsx_xvnmaddadp(__a, __b, __c);
+}
+#endif
+
+/* vec_nmsub */
+
+static vector float __ATTRS_o_ai
+vec_nmsub(vector float __a, vector float __b, vector float __c) {
+#ifdef __VSX__
+  return __builtin_vsx_xvnmsubasp(__a, __b, __c);
+#else
+  return __builtin_altivec_vnmsubfp(__a, __b, __c);
+#endif
+}
+
+#ifdef __VSX__
+static vector double __ATTRS_o_ai
+vec_nmsub(vector double __a, vector double __b, vector double __c) {
+  return __builtin_vsx_xvnmsubadp(__a, __b, __c);
+}
+#endif
+
+/* vec_vnmsubfp */
+
+static vector float __attribute__((__always_inline__))
+vec_vnmsubfp(vector float __a, vector float __b, vector float __c) {
+  return __builtin_altivec_vnmsubfp(__a, __b, __c);
+}
+
+/* vec_nor */
+
+#define __builtin_altivec_vnor vec_nor
+
+static vector signed char __ATTRS_o_ai vec_nor(vector signed char __a,
+                                               vector signed char __b) {
+  return ~(__a | __b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_nor(vector unsigned char __a,
+                                                 vector unsigned char __b) {
+  return ~(__a | __b);
+}
+
+static vector bool char __ATTRS_o_ai vec_nor(vector bool char __a,
+                                             vector bool char __b) {
+  return ~(__a | __b);
+}
+
+static vector short __ATTRS_o_ai vec_nor(vector short __a, vector short __b) {
+  return ~(__a | __b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_nor(vector unsigned short __a,
+                                                  vector unsigned short __b) {
+  return ~(__a | __b);
+}
+
+static vector bool short __ATTRS_o_ai vec_nor(vector bool short __a,
+                                              vector bool short __b) {
+  return ~(__a | __b);
+}
+
+static vector int __ATTRS_o_ai vec_nor(vector int __a, vector int __b) {
+  return ~(__a | __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_nor(vector unsigned int __a,
+                                                vector unsigned int __b) {
+  return ~(__a | __b);
+}
+
+static vector bool int __ATTRS_o_ai vec_nor(vector bool int __a,
+                                            vector bool int __b) {
+  return ~(__a | __b);
+}
+
+static vector float __ATTRS_o_ai vec_nor(vector float __a, vector float __b) {
+  vector unsigned int __res =
+      ~((vector unsigned int)__a | (vector unsigned int)__b);
+  return (vector float)__res;
+}
+
+#ifdef __VSX__
+static vector double __ATTRS_o_ai
+vec_nor(vector double __a, vector double __b) {
+  vector unsigned long long __res =
+      ~((vector unsigned long long)__a | (vector unsigned long long)__b);
+  return (vector double)__res;
+}
+#endif
+
+/* vec_vnor */
+
+static vector signed char __ATTRS_o_ai vec_vnor(vector signed char __a,
+                                                vector signed char __b) {
+  return ~(__a | __b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vnor(vector unsigned char __a,
+                                                  vector unsigned char __b) {
+  return ~(__a | __b);
+}
+
+static vector bool char __ATTRS_o_ai vec_vnor(vector bool char __a,
+                                              vector bool char __b) {
+  return ~(__a | __b);
+}
+
+static vector short __ATTRS_o_ai vec_vnor(vector short __a, vector short __b) {
+  return ~(__a | __b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vnor(vector unsigned short __a,
+                                                   vector unsigned short __b) {
+  return ~(__a | __b);
+}
+
+static vector bool short __ATTRS_o_ai vec_vnor(vector bool short __a,
+                                               vector bool short __b) {
+  return ~(__a | __b);
+}
+
+static vector int __ATTRS_o_ai vec_vnor(vector int __a, vector int __b) {
+  return ~(__a | __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vnor(vector unsigned int __a,
+                                                 vector unsigned int __b) {
+  return ~(__a | __b);
+}
+
+static vector bool int __ATTRS_o_ai vec_vnor(vector bool int __a,
+                                             vector bool int __b) {
+  return ~(__a | __b);
+}
+
+static vector float __ATTRS_o_ai vec_vnor(vector float __a, vector float __b) {
+  vector unsigned int __res =
+      ~((vector unsigned int)__a | (vector unsigned int)__b);
+  return (vector float)__res;
+}
+
+#ifdef __VSX__
+static vector signed long long __ATTRS_o_ai
+vec_nor(vector signed long long __a, vector signed long long __b) {
+  return ~(__a | __b);
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_nor(vector unsigned long long __a, vector unsigned long long __b) {
+  return ~(__a | __b);
+}
+
+static vector bool long long __ATTRS_o_ai vec_nor(vector bool long long __a,
+                                                  vector bool long long __b) {
+  return ~(__a | __b);
+}
+#endif
+
+/* vec_or */
+
+#define __builtin_altivec_vor vec_or
+
+static vector signed char __ATTRS_o_ai vec_or(vector signed char __a,
+                                              vector signed char __b) {
+  return __a | __b;
+}
+
+static vector signed char __ATTRS_o_ai vec_or(vector bool char __a,
+                                              vector signed char __b) {
+  return (vector signed char)__a | __b;
+}
+
+static vector signed char __ATTRS_o_ai vec_or(vector signed char __a,
+                                              vector bool char __b) {
+  return __a | (vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_or(vector unsigned char __a,
+                                                vector unsigned char __b) {
+  return __a | __b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_or(vector bool char __a,
+                                                vector unsigned char __b) {
+  return (vector unsigned char)__a | __b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_or(vector unsigned char __a,
+                                                vector bool char __b) {
+  return __a | (vector unsigned char)__b;
+}
+
+static vector bool char __ATTRS_o_ai vec_or(vector bool char __a,
+                                            vector bool char __b) {
+  return __a | __b;
+}
+
+static vector short __ATTRS_o_ai vec_or(vector short __a, vector short __b) {
+  return __a | __b;
+}
+
+static vector short __ATTRS_o_ai vec_or(vector bool short __a,
+                                        vector short __b) {
+  return (vector short)__a | __b;
+}
+
+static vector short __ATTRS_o_ai vec_or(vector short __a,
+                                        vector bool short __b) {
+  return __a | (vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_or(vector unsigned short __a,
+                                                 vector unsigned short __b) {
+  return __a | __b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_or(vector bool short __a,
+                                                 vector unsigned short __b) {
+  return (vector unsigned short)__a | __b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_or(vector unsigned short __a,
+                                                 vector bool short __b) {
+  return __a | (vector unsigned short)__b;
+}
+
+static vector bool short __ATTRS_o_ai vec_or(vector bool short __a,
+                                             vector bool short __b) {
+  return __a | __b;
+}
+
+static vector int __ATTRS_o_ai vec_or(vector int __a, vector int __b) {
+  return __a | __b;
+}
+
+static vector int __ATTRS_o_ai vec_or(vector bool int __a, vector int __b) {
+  return (vector int)__a | __b;
+}
+
+static vector int __ATTRS_o_ai vec_or(vector int __a, vector bool int __b) {
+  return __a | (vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_or(vector unsigned int __a,
+                                               vector unsigned int __b) {
+  return __a | __b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_or(vector bool int __a,
+                                               vector unsigned int __b) {
+  return (vector unsigned int)__a | __b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_or(vector unsigned int __a,
+                                               vector bool int __b) {
+  return __a | (vector unsigned int)__b;
+}
+
+static vector bool int __ATTRS_o_ai vec_or(vector bool int __a,
+                                           vector bool int __b) {
+  return __a | __b;
+}
+
+static vector float __ATTRS_o_ai vec_or(vector float __a, vector float __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a | (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai vec_or(vector bool int __a, vector float __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a | (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai vec_or(vector float __a, vector bool int __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a | (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+#ifdef __VSX__
+static vector double __ATTRS_o_ai vec_or(vector bool long long __a,
+                                         vector double __b) {
+  return (vector unsigned long long)__a | (vector unsigned long long)__b;
+}
+
+static vector double __ATTRS_o_ai vec_or(vector double __a,
+                                         vector bool long long __b) {
+  return (vector unsigned long long)__a | (vector unsigned long long)__b;
+}
+
+static vector double __ATTRS_o_ai vec_or(vector double __a, vector double __b) {
+  vector unsigned long long __res =
+      (vector unsigned long long)__a | (vector unsigned long long)__b;
+  return (vector double)__res;
+}
+
+static vector signed long long __ATTRS_o_ai
+vec_or(vector signed long long __a, vector signed long long __b) {
+  return __a | __b;
+}
+
+static vector signed long long __ATTRS_o_ai
+vec_or(vector bool long long __a, vector signed long long __b) {
+  return (vector signed long long)__a | __b;
+}
+
+static vector signed long long __ATTRS_o_ai vec_or(vector signed long long __a,
+                                                   vector bool long long __b) {
+  return __a | (vector signed long long)__b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_or(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a | __b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_or(vector bool long long __a, vector unsigned long long __b) {
+  return (vector unsigned long long)__a | __b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_or(vector unsigned long long __a, vector bool long long __b) {
+  return __a | (vector unsigned long long)__b;
+}
+
+static vector bool long long __ATTRS_o_ai vec_or(vector bool long long __a,
+                                                 vector bool long long __b) {
+  return __a | __b;
+}
+#endif
+
+#ifdef __POWER8_VECTOR__
+static vector signed char __ATTRS_o_ai vec_orc(vector signed char __a,
+                                               vector signed char __b) {
+  return __a | ~__b;
+}
+
+static vector signed char __ATTRS_o_ai vec_orc(vector signed char __a,
+                                               vector bool char __b) {
+  return __a | ~__b;
+}
+
+static vector signed char __ATTRS_o_ai vec_orc(vector bool char __a,
+                                               vector signed char __b) {
+  return __a | ~__b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_orc(vector unsigned char __a,
+                                                 vector unsigned char __b) {
+  return __a | ~__b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_orc(vector unsigned char __a,
+                                                 vector bool char __b) {
+  return __a | ~__b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_orc(vector bool char __a,
+                                                 vector unsigned char __b) {
+  return __a | ~__b;
+}
+
+static vector bool char __ATTRS_o_ai vec_orc(vector bool char __a,
+                                             vector bool char __b) {
+  return __a | ~__b;
+}
+
+static vector signed short __ATTRS_o_ai vec_orc(vector signed short __a,
+                                                vector signed short __b) {
+  return __a | ~__b;
+}
+
+static vector signed short __ATTRS_o_ai vec_orc(vector signed short __a,
+                                                vector bool short __b) {
+  return __a | ~__b;
+}
+
+static vector signed short __ATTRS_o_ai vec_orc(vector bool short __a,
+                                                vector signed short __b) {
+  return __a | ~__b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_orc(vector unsigned short __a,
+                                                  vector unsigned short __b) {
+  return __a | ~__b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_orc(vector unsigned short __a,
+                                                  vector bool short __b) {
+  return __a | ~__b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_orc(vector bool short __a, vector unsigned short __b) {
+  return __a | ~__b;
+}
+
+static vector bool short __ATTRS_o_ai vec_orc(vector bool short __a,
+                                              vector bool short __b) {
+  return __a | ~__b;
+}
+
+static vector signed int __ATTRS_o_ai vec_orc(vector signed int __a,
+                                              vector signed int __b) {
+  return __a | ~__b;
+}
+
+static vector signed int __ATTRS_o_ai vec_orc(vector signed int __a,
+                                              vector bool int __b) {
+  return __a | ~__b;
+}
+
+static vector signed int __ATTRS_o_ai vec_orc(vector bool int __a,
+                                              vector signed int __b) {
+  return __a | ~__b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_orc(vector unsigned int __a,
+                                                vector unsigned int __b) {
+  return __a | ~__b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_orc(vector unsigned int __a,
+                                                vector bool int __b) {
+  return __a | ~__b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_orc(vector bool int __a,
+                                                vector unsigned int __b) {
+  return __a | ~__b;
+}
+
+static vector bool int __ATTRS_o_ai vec_orc(vector bool int __a,
+                                            vector bool int __b) {
+  return __a | ~__b;
+}
+
+static vector signed long long __ATTRS_o_ai
+vec_orc(vector signed long long __a, vector signed long long __b) {
+  return __a | ~__b;
+}
+
+static vector signed long long __ATTRS_o_ai vec_orc(vector signed long long __a,
+                                                    vector bool long long __b) {
+  return __a | ~__b;
+}
+
+static vector signed long long __ATTRS_o_ai
+vec_orc(vector bool long long __a, vector signed long long __b) {
+  return __a | ~__b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_orc(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a | ~__b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_orc(vector unsigned long long __a, vector bool long long __b) {
+  return __a | ~__b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_orc(vector bool long long __a, vector unsigned long long __b) {
+  return __a | ~__b;
+}
+
+static vector bool long long __ATTRS_o_ai
+vec_orc(vector bool long long __a, vector bool long long __b) {
+  return __a | ~__b;
+}
+#endif
+
+/* vec_vor */
+
+static vector signed char __ATTRS_o_ai vec_vor(vector signed char __a,
+                                               vector signed char __b) {
+  return __a | __b;
+}
+
+static vector signed char __ATTRS_o_ai vec_vor(vector bool char __a,
+                                               vector signed char __b) {
+  return (vector signed char)__a | __b;
+}
+
+static vector signed char __ATTRS_o_ai vec_vor(vector signed char __a,
+                                               vector bool char __b) {
+  return __a | (vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vor(vector unsigned char __a,
+                                                 vector unsigned char __b) {
+  return __a | __b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vor(vector bool char __a,
+                                                 vector unsigned char __b) {
+  return (vector unsigned char)__a | __b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vor(vector unsigned char __a,
+                                                 vector bool char __b) {
+  return __a | (vector unsigned char)__b;
+}
+
+static vector bool char __ATTRS_o_ai vec_vor(vector bool char __a,
+                                             vector bool char __b) {
+  return __a | __b;
+}
+
+static vector short __ATTRS_o_ai vec_vor(vector short __a, vector short __b) {
+  return __a | __b;
+}
+
+static vector short __ATTRS_o_ai vec_vor(vector bool short __a,
+                                         vector short __b) {
+  return (vector short)__a | __b;
+}
+
+static vector short __ATTRS_o_ai vec_vor(vector short __a,
+                                         vector bool short __b) {
+  return __a | (vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vor(vector unsigned short __a,
+                                                  vector unsigned short __b) {
+  return __a | __b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vor(vector bool short __a,
+                                                  vector unsigned short __b) {
+  return (vector unsigned short)__a | __b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vor(vector unsigned short __a,
+                                                  vector bool short __b) {
+  return __a | (vector unsigned short)__b;
+}
+
+static vector bool short __ATTRS_o_ai vec_vor(vector bool short __a,
+                                              vector bool short __b) {
+  return __a | __b;
+}
+
+static vector int __ATTRS_o_ai vec_vor(vector int __a, vector int __b) {
+  return __a | __b;
+}
+
+static vector int __ATTRS_o_ai vec_vor(vector bool int __a, vector int __b) {
+  return (vector int)__a | __b;
+}
+
+static vector int __ATTRS_o_ai vec_vor(vector int __a, vector bool int __b) {
+  return __a | (vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vor(vector unsigned int __a,
+                                                vector unsigned int __b) {
+  return __a | __b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vor(vector bool int __a,
+                                                vector unsigned int __b) {
+  return (vector unsigned int)__a | __b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vor(vector unsigned int __a,
+                                                vector bool int __b) {
+  return __a | (vector unsigned int)__b;
+}
+
+static vector bool int __ATTRS_o_ai vec_vor(vector bool int __a,
+                                            vector bool int __b) {
+  return __a | __b;
+}
+
+static vector float __ATTRS_o_ai vec_vor(vector float __a, vector float __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a | (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai vec_vor(vector bool int __a,
+                                         vector float __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a | (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai vec_vor(vector float __a,
+                                         vector bool int __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a | (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+#ifdef __VSX__
+static vector signed long long __ATTRS_o_ai
+vec_vor(vector signed long long __a, vector signed long long __b) {
+  return __a | __b;
+}
+
+static vector signed long long __ATTRS_o_ai
+vec_vor(vector bool long long __a, vector signed long long __b) {
+  return (vector signed long long)__a | __b;
+}
+
+static vector signed long long __ATTRS_o_ai vec_vor(vector signed long long __a,
+                                                    vector bool long long __b) {
+  return __a | (vector signed long long)__b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_vor(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a | __b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_vor(vector bool long long __a, vector unsigned long long __b) {
+  return (vector unsigned long long)__a | __b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_vor(vector unsigned long long __a, vector bool long long __b) {
+  return __a | (vector unsigned long long)__b;
+}
+
+static vector bool long long __ATTRS_o_ai vec_vor(vector bool long long __a,
+                                                  vector bool long long __b) {
+  return __a | __b;
+}
+#endif
+
+/* vec_pack */
+
+/* The various vector pack instructions have a big-endian bias, so for
+   little endian we must handle reversed element numbering.  */
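+
+/* Illustrative example (editorial sketch, not part of the upstream header):
+   vec_pack keeps the low-order half of each element and interleaves the two
+   inputs, so for halfwords
+
+     vector signed short __x = {0x0101, 0x0202, 0x0303, 0x0404,
+                                0x0505, 0x0606, 0x0707, 0x0808};
+     vector signed short __y = {0x1111, 0x1212, 0x1313, 0x1414,
+                                0x1515, 0x1616, 0x1717, 0x1818};
+     vector signed char __r = vec_pack(__x, __y);
+     // __r == {0x01, ..., 0x08, 0x11, ..., 0x18} on either endianness
+
+   Only the vec_perm control vectors below differ, because the low-order byte
+   of element 0 sits at byte offset 1 on a big-endian target but at byte
+   offset 0 on a little-endian target.  */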
+
+static vector signed char __ATTRS_o_ai vec_pack(vector signed short __a,
+                                                vector signed short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector signed char)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E,
+                             0x10, 0x12, 0x14, 0x16, 0x18, 0x1A, 0x1C, 0x1E));
+#else
+  return (vector signed char)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F,
+                             0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F));
+#endif
+}
+
+static vector unsigned char __ATTRS_o_ai vec_pack(vector unsigned short __a,
+                                                  vector unsigned short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector unsigned char)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E,
+                             0x10, 0x12, 0x14, 0x16, 0x18, 0x1A, 0x1C, 0x1E));
+#else
+  return (vector unsigned char)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F,
+                             0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F));
+#endif
+}
+
+static vector bool char __ATTRS_o_ai vec_pack(vector bool short __a,
+                                              vector bool short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool char)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E,
+                             0x10, 0x12, 0x14, 0x16, 0x18, 0x1A, 0x1C, 0x1E));
+#else
+  return (vector bool char)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F,
+                             0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F));
+#endif
+}
+
+static vector short __ATTRS_o_ai vec_pack(vector int __a, vector int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector short)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0C, 0x0D,
+                             0x10, 0x11, 0x14, 0x15, 0x18, 0x19, 0x1C, 0x1D));
+#else
+  return (vector short)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x02, 0x03, 0x06, 0x07, 0x0A, 0x0B, 0x0E, 0x0F,
+                             0x12, 0x13, 0x16, 0x17, 0x1A, 0x1B, 0x1E, 0x1F));
+#endif
+}
+
+static vector unsigned short __ATTRS_o_ai vec_pack(vector unsigned int __a,
+                                                   vector unsigned int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector unsigned short)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0C, 0x0D,
+                             0x10, 0x11, 0x14, 0x15, 0x18, 0x19, 0x1C, 0x1D));
+#else
+  return (vector unsigned short)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x02, 0x03, 0x06, 0x07, 0x0A, 0x0B, 0x0E, 0x0F,
+                             0x12, 0x13, 0x16, 0x17, 0x1A, 0x1B, 0x1E, 0x1F));
+#endif
+}
+
+static vector bool short __ATTRS_o_ai vec_pack(vector bool int __a,
+                                               vector bool int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool short)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0C, 0x0D,
+                             0x10, 0x11, 0x14, 0x15, 0x18, 0x19, 0x1C, 0x1D));
+#else
+  return (vector bool short)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x02, 0x03, 0x06, 0x07, 0x0A, 0x0B, 0x0E, 0x0F,
+                             0x12, 0x13, 0x16, 0x17, 0x1A, 0x1B, 0x1E, 0x1F));
+#endif
+}
+
+#ifdef __VSX__
+static vector signed int __ATTRS_o_ai vec_pack(vector signed long long __a,
+                                               vector signed long long __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector signed int)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B,
+                             0x10, 0x11, 0x12, 0x13, 0x18, 0x19, 0x1A, 0x1B));
+#else
+  return (vector signed int)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F,
+                             0x14, 0x15, 0x16, 0x17, 0x1C, 0x1D, 0x1E, 0x1F));
+#endif
+}
+static vector unsigned int __ATTRS_o_ai
+vec_pack(vector unsigned long long __a, vector unsigned long long __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector unsigned int)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B,
+                             0x10, 0x11, 0x12, 0x13, 0x18, 0x19, 0x1A, 0x1B));
+#else
+  return (vector unsigned int)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F,
+                             0x14, 0x15, 0x16, 0x17, 0x1C, 0x1D, 0x1E, 0x1F));
+#endif
+}
+
+static vector bool int __ATTRS_o_ai vec_pack(vector bool long long __a,
+                                             vector bool long long __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool int)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B,
+                             0x10, 0x11, 0x12, 0x13, 0x18, 0x19, 0x1A, 0x1B));
+#else
+  return (vector bool int)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F,
+                             0x14, 0x15, 0x16, 0x17, 0x1C, 0x1D, 0x1E, 0x1F));
+#endif
+}
+
+#endif
+
+/* vec_vpkuhum */
+
+#define __builtin_altivec_vpkuhum vec_vpkuhum
+
+static vector signed char __ATTRS_o_ai vec_vpkuhum(vector signed short __a,
+                                                   vector signed short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector signed char)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E,
+                             0x10, 0x12, 0x14, 0x16, 0x18, 0x1A, 0x1C, 0x1E));
+#else
+  return (vector signed char)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F,
+                             0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F));
+#endif
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vpkuhum(vector unsigned short __a, vector unsigned short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector unsigned char)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E,
+                             0x10, 0x12, 0x14, 0x16, 0x18, 0x1A, 0x1C, 0x1E));
+#else
+  return (vector unsigned char)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F,
+                             0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F));
+#endif
+}
+
+static vector bool char __ATTRS_o_ai vec_vpkuhum(vector bool short __a,
+                                                 vector bool short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool char)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E,
+                             0x10, 0x12, 0x14, 0x16, 0x18, 0x1A, 0x1C, 0x1E));
+#else
+  return (vector bool char)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F,
+                             0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F));
+#endif
+}
+
+/* vec_vpkuwum */
+
+#define __builtin_altivec_vpkuwum vec_vpkuwum
+
+static vector short __ATTRS_o_ai vec_vpkuwum(vector int __a, vector int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector short)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0C, 0x0D,
+                             0x10, 0x11, 0x14, 0x15, 0x18, 0x19, 0x1C, 0x1D));
+#else
+  return (vector short)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x02, 0x03, 0x06, 0x07, 0x0A, 0x0B, 0x0E, 0x0F,
+                             0x12, 0x13, 0x16, 0x17, 0x1A, 0x1B, 0x1E, 0x1F));
+#endif
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vpkuwum(vector unsigned int __a,
+                                                      vector unsigned int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector unsigned short)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0C, 0x0D,
+                             0x10, 0x11, 0x14, 0x15, 0x18, 0x19, 0x1C, 0x1D));
+#else
+  return (vector unsigned short)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x02, 0x03, 0x06, 0x07, 0x0A, 0x0B, 0x0E, 0x0F,
+                             0x12, 0x13, 0x16, 0x17, 0x1A, 0x1B, 0x1E, 0x1F));
+#endif
+}
+
+static vector bool short __ATTRS_o_ai vec_vpkuwum(vector bool int __a,
+                                                  vector bool int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool short)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0C, 0x0D,
+                             0x10, 0x11, 0x14, 0x15, 0x18, 0x19, 0x1C, 0x1D));
+#else
+  return (vector bool short)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x02, 0x03, 0x06, 0x07, 0x0A, 0x0B, 0x0E, 0x0F,
+                             0x12, 0x13, 0x16, 0x17, 0x1A, 0x1B, 0x1E, 0x1F));
+#endif
+}
+
+/* vec_vpkudum */
+
+#ifdef __POWER8_VECTOR__
+#define __builtin_altivec_vpkudum vec_vpkudum
+
+static vector int __ATTRS_o_ai vec_vpkudum(vector long long __a,
+                                           vector long long __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector int)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B,
+                             0x10, 0x11, 0x12, 0x13, 0x18, 0x19, 0x1A, 0x1B));
+#else
+  return (vector int)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F,
+                             0x14, 0x15, 0x16, 0x17, 0x1C, 0x1D, 0x1E, 0x1F));
+#endif
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vpkudum(vector unsigned long long __a, vector unsigned long long __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector unsigned int)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B,
+                             0x10, 0x11, 0x12, 0x13, 0x18, 0x19, 0x1A, 0x1B));
+#else
+  return (vector unsigned int)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F,
+                             0x14, 0x15, 0x16, 0x17, 0x1C, 0x1D, 0x1E, 0x1F));
+#endif
+}
+
+static vector bool int __ATTRS_o_ai vec_vpkudum(vector bool long long __a,
+                                                vector bool long long __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool int)vec_perm(
+      (vector long long)__a, (vector long long)__b,
+      (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B,
+                             0x10, 0x11, 0x12, 0x13, 0x18, 0x19, 0x1A, 0x1B));
+#else
+  return (vector bool int)vec_perm(
+      (vector long long)__a, (vector long long)__b,
+      (vector unsigned char)(0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F,
+                             0x14, 0x15, 0x16, 0x17, 0x1C, 0x1D, 0x1E, 0x1F));
+#endif
+}
+#endif
+
+/* vec_packpx */
+
+static vector pixel __attribute__((__always_inline__))
+vec_packpx(vector unsigned int __a, vector unsigned int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector pixel)__builtin_altivec_vpkpx(__b, __a);
+#else
+  return (vector pixel)__builtin_altivec_vpkpx(__a, __b);
+#endif
+}
+
+/* vec_vpkpx */
+
+static vector pixel __attribute__((__always_inline__))
+vec_vpkpx(vector unsigned int __a, vector unsigned int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector pixel)__builtin_altivec_vpkpx(__b, __a);
+#else
+  return (vector pixel)__builtin_altivec_vpkpx(__a, __b);
+#endif
+}
+
+/* vec_packs */
+
+static vector signed char __ATTRS_o_ai vec_packs(vector short __a,
+                                                 vector short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkshss(__b, __a);
+#else
+  return __builtin_altivec_vpkshss(__a, __b);
+#endif
+}
+
+static vector unsigned char __ATTRS_o_ai vec_packs(vector unsigned short __a,
+                                                   vector unsigned short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkuhus(__b, __a);
+#else
+  return __builtin_altivec_vpkuhus(__a, __b);
+#endif
+}
+
+static vector signed short __ATTRS_o_ai vec_packs(vector int __a,
+                                                  vector int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkswss(__b, __a);
+#else
+  return __builtin_altivec_vpkswss(__a, __b);
+#endif
+}
+
+static vector unsigned short __ATTRS_o_ai vec_packs(vector unsigned int __a,
+                                                    vector unsigned int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkuwus(__b, __a);
+#else
+  return __builtin_altivec_vpkuwus(__a, __b);
+#endif
+}
+
+#ifdef __POWER8_VECTOR__
+static vector int __ATTRS_o_ai vec_packs(vector long long __a,
+                                         vector long long __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpksdss(__b, __a);
+#else
+  return __builtin_altivec_vpksdss(__a, __b);
+#endif
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_packs(vector unsigned long long __a, vector unsigned long long __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkudus(__b, __a);
+#else
+  return __builtin_altivec_vpkudus(__a, __b);
+#endif
+}
+#endif
+
+/* vec_vpkshss */
+
+static vector signed char __attribute__((__always_inline__))
+vec_vpkshss(vector short __a, vector short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkshss(__b, __a);
+#else
+  return __builtin_altivec_vpkshss(__a, __b);
+#endif
+}
+
+/* vec_vpksdss */
+
+#ifdef __POWER8_VECTOR__
+static vector int __ATTRS_o_ai vec_vpksdss(vector long long __a,
+                                           vector long long __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpksdss(__b, __a);
+#else
+  return __builtin_altivec_vpksdss(__a, __b);
+#endif
+}
+#endif
+
+/* vec_vpkuhus */
+
+static vector unsigned char __attribute__((__always_inline__))
+vec_vpkuhus(vector unsigned short __a, vector unsigned short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkuhus(__b, __a);
+#else
+  return __builtin_altivec_vpkuhus(__a, __b);
+#endif
+}
+
+/* vec_vpkudus */
+
+#ifdef __POWER8_VECTOR__
+static vector unsigned int __attribute__((__always_inline__))
+vec_vpkudus(vector unsigned long long __a, vector unsigned long long __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkudus(__b, __a);
+#else
+  return __builtin_altivec_vpkudus(__a, __b);
+#endif
+}
+#endif
+
+/* vec_vpkswss */
+
+static vector signed short __attribute__((__always_inline__))
+vec_vpkswss(vector int __a, vector int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkswss(__b, __a);
+#else
+  return __builtin_altivec_vpkswss(__a, __b);
+#endif
+}
+
+/* vec_vpkuwus */
+
+static vector unsigned short __attribute__((__always_inline__))
+vec_vpkuwus(vector unsigned int __a, vector unsigned int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkuwus(__b, __a);
+#else
+  return __builtin_altivec_vpkuwus(__a, __b);
+#endif
+}
+
+/* vec_packsu */
+
+static vector unsigned char __ATTRS_o_ai vec_packsu(vector short __a,
+                                                    vector short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkshus(__b, __a);
+#else
+  return __builtin_altivec_vpkshus(__a, __b);
+#endif
+}
+
+static vector unsigned char __ATTRS_o_ai vec_packsu(vector unsigned short __a,
+                                                    vector unsigned short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkuhus(__b, __a);
+#else
+  return __builtin_altivec_vpkuhus(__a, __b);
+#endif
+}
+
+static vector unsigned short __ATTRS_o_ai vec_packsu(vector int __a,
+                                                     vector int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkswus(__b, __a);
+#else
+  return __builtin_altivec_vpkswus(__a, __b);
+#endif
+}
+
+static vector unsigned short __ATTRS_o_ai vec_packsu(vector unsigned int __a,
+                                                     vector unsigned int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkuwus(__b, __a);
+#else
+  return __builtin_altivec_vpkuwus(__a, __b);
+#endif
+}
+
+#ifdef __POWER8_VECTOR__
+static vector unsigned int __ATTRS_o_ai vec_packsu(vector long long __a,
+                                                   vector long long __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpksdus(__b, __a);
+#else
+  return __builtin_altivec_vpksdus(__a, __b);
+#endif
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_packsu(vector unsigned long long __a, vector unsigned long long __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkudus(__b, __a);
+#else
+  return __builtin_altivec_vpkudus(__a, __b);
+#endif
+}
+#endif
+
+/* vec_vpkshus */
+
+static vector unsigned char __ATTRS_o_ai vec_vpkshus(vector short __a,
+                                                     vector short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkshus(__b, __a);
+#else
+  return __builtin_altivec_vpkshus(__a, __b);
+#endif
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vpkshus(vector unsigned short __a, vector unsigned short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkuhus(__b, __a);
+#else
+  return __builtin_altivec_vpkuhus(__a, __b);
+#endif
+}
+
+/* vec_vpkswus */
+
+static vector unsigned short __ATTRS_o_ai vec_vpkswus(vector int __a,
+                                                      vector int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkswus(__b, __a);
+#else
+  return __builtin_altivec_vpkswus(__a, __b);
+#endif
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vpkswus(vector unsigned int __a,
+                                                      vector unsigned int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkuwus(__b, __a);
+#else
+  return __builtin_altivec_vpkuwus(__a, __b);
+#endif
+}
+
+/* vec_vpksdus */
+
+#ifdef __POWER8_VECTOR__
+static vector unsigned int __ATTRS_o_ai vec_vpksdus(vector long long __a,
+                                                    vector long long __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpksdus(__b, __a);
+#else
+  return __builtin_altivec_vpksdus(__a, __b);
+#endif
+}
+#endif
+
+/* vec_perm */
+
+// The vperm instruction is defined architecturally with a big-endian bias.
+// For little endian, we swap the input operands and invert the permute
+// control vector.  Only the rightmost 5 bits matter, so we could use
+// a vector of all 31s instead of all 255s to perform the inversion.
+// However, when the PCV is not a constant, using 255 has an advantage
+// in that the vec_xor can be recognized as a vec_nor (and for P8 and
+// later, possibly a vec_nand).
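+
+// Worked example (editorial sketch, not part of the upstream header): vperm
+// selects result byte j from the 32-byte concatenation of its first two
+// operands using the low 5 bits of control byte j, with big-endian byte
+// numbering.  On little endian, byte i of the programmer's a:b is byte
+// (31 - i) of the hardware's b:a, and since only 5 bits are used,
+// (31 - i) == (~i & 0x1F) == ((i ^ 255) & 0x1F).  The lowering below
+// therefore swaps the operands and XORs the control vector with all 255s:
+//
+//   vector unsigned char __pcv = {0, 1, 2,  3,  4,  5,  6,  7,
+//                                 8, 9, 10, 11, 12, 13, 14, 15};
+//   // vec_perm(__a, __b, __pcv) returns __a on either endianness; the LE
+//   // form is vperm(__b, __a, __pcv ^ 255), whose control bytes
+//   // {255, 254, ..., 240} select bytes 31..16 of b:a, i.e. __a again.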
+
+static vector signed char __ATTRS_o_ai vec_perm(vector signed char __a,
+                                                vector signed char __b,
+                                                vector unsigned char __c) {
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
+                              255, 255, 255, 255, 255, 255, 255, 255};
+  __d = vec_xor(__c, __d);
+  return (vector signed char)__builtin_altivec_vperm_4si((vector int)__b,
+                                                         (vector int)__a, __d);
+#else
+  return (vector signed char)__builtin_altivec_vperm_4si((vector int)__a,
+                                                         (vector int)__b, __c);
+#endif
+}
+
+static vector unsigned char __ATTRS_o_ai vec_perm(vector unsigned char __a,
+                                                  vector unsigned char __b,
+                                                  vector unsigned char __c) {
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
+                              255, 255, 255, 255, 255, 255, 255, 255};
+  __d = vec_xor(__c, __d);
+  return (vector unsigned char)__builtin_altivec_vperm_4si(
+      (vector int)__b, (vector int)__a, __d);
+#else
+  return (vector unsigned char)__builtin_altivec_vperm_4si(
+      (vector int)__a, (vector int)__b, __c);
+#endif
+}
+
+static vector bool char __ATTRS_o_ai vec_perm(vector bool char __a,
+                                              vector bool char __b,
+                                              vector unsigned char __c) {
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
+                              255, 255, 255, 255, 255, 255, 255, 255};
+  __d = vec_xor(__c, __d);
+  return (vector bool char)__builtin_altivec_vperm_4si((vector int)__b,
+                                                       (vector int)__a, __d);
+#else
+  return (vector bool char)__builtin_altivec_vperm_4si((vector int)__a,
+                                                       (vector int)__b, __c);
+#endif
+}
+
+static vector short __ATTRS_o_ai vec_perm(vector signed short __a,
+                                          vector signed short __b,
+                                          vector unsigned char __c) {
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
+                              255, 255, 255, 255, 255, 255, 255, 255};
+  __d = vec_xor(__c, __d);
+  return (vector signed short)__builtin_altivec_vperm_4si((vector int)__b,
+                                                          (vector int)__a, __d);
+#else
+  return (vector signed short)__builtin_altivec_vperm_4si((vector int)__a,
+                                                          (vector int)__b, __c);
+#endif
+}
+
+static vector unsigned short __ATTRS_o_ai vec_perm(vector unsigned short __a,
+                                                   vector unsigned short __b,
+                                                   vector unsigned char __c) {
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
+                              255, 255, 255, 255, 255, 255, 255, 255};
+  __d = vec_xor(__c, __d);
+  return (vector unsigned short)__builtin_altivec_vperm_4si(
+      (vector int)__b, (vector int)__a, __d);
+#else
+  return (vector unsigned short)__builtin_altivec_vperm_4si(
+      (vector int)__a, (vector int)__b, __c);
+#endif
+}
+
+static vector bool short __ATTRS_o_ai vec_perm(vector bool short __a,
+                                               vector bool short __b,
+                                               vector unsigned char __c) {
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
+                              255, 255, 255, 255, 255, 255, 255, 255};
+  __d = vec_xor(__c, __d);
+  return (vector bool short)__builtin_altivec_vperm_4si((vector int)__b,
+                                                        (vector int)__a, __d);
+#else
+  return (vector bool short)__builtin_altivec_vperm_4si((vector int)__a,
+                                                        (vector int)__b, __c);
+#endif
+}
+
+static vector pixel __ATTRS_o_ai vec_perm(vector pixel __a, vector pixel __b,
+                                          vector unsigned char __c) {
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
+                              255, 255, 255, 255, 255, 255, 255, 255};
+  __d = vec_xor(__c, __d);
+  return (vector pixel)__builtin_altivec_vperm_4si((vector int)__b,
+                                                   (vector int)__a, __d);
+#else
+  return (vector pixel)__builtin_altivec_vperm_4si((vector int)__a,
+                                                   (vector int)__b, __c);
+#endif
+}
+
+static vector int __ATTRS_o_ai vec_perm(vector signed int __a,
+                                        vector signed int __b,
+                                        vector unsigned char __c) {
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
+                              255, 255, 255, 255, 255, 255, 255, 255};
+  __d = vec_xor(__c, __d);
+  return (vector signed int)__builtin_altivec_vperm_4si(__b, __a, __d);
+#else
+  return (vector signed int)__builtin_altivec_vperm_4si(__a, __b, __c);
+#endif
+}
+
+static vector unsigned int __ATTRS_o_ai vec_perm(vector unsigned int __a,
+                                                 vector unsigned int __b,
+                                                 vector unsigned char __c) {
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
+                              255, 255, 255, 255, 255, 255, 255, 255};
+  __d = vec_xor(__c, __d);
+  return (vector unsigned int)__builtin_altivec_vperm_4si((vector int)__b,
+                                                          (vector int)__a, __d);
+#else
+  return (vector unsigned int)__builtin_altivec_vperm_4si((vector int)__a,
+                                                          (vector int)__b, __c);
+#endif
+}
+
+static vector bool int __ATTRS_o_ai vec_perm(vector bool int __a,
+                                             vector bool int __b,
+                                             vector unsigned char __c) {
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
+                              255, 255, 255, 255, 255, 255, 255, 255};
+  __d = vec_xor(__c, __d);
+  return (vector bool int)__builtin_altivec_vperm_4si((vector int)__b,
+                                                      (vector int)__a, __d);
+#else
+  return (vector bool int)__builtin_altivec_vperm_4si((vector int)__a,
+                                                      (vector int)__b, __c);
+#endif
+}
+
+static vector float __ATTRS_o_ai vec_perm(vector float __a, vector float __b,
+                                          vector unsigned char __c) {
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
+                              255, 255, 255, 255, 255, 255, 255, 255};
+  __d = vec_xor(__c, __d);
+  return (vector float)__builtin_altivec_vperm_4si((vector int)__b,
+                                                   (vector int)__a, __d);
+#else
+  return (vector float)__builtin_altivec_vperm_4si((vector int)__a,
+                                                   (vector int)__b, __c);
+#endif
+}
+
+#ifdef __VSX__
+static vector long long __ATTRS_o_ai vec_perm(vector signed long long __a,
+                                              vector signed long long __b,
+                                              vector unsigned char __c) {
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
+                              255, 255, 255, 255, 255, 255, 255, 255};
+  __d = vec_xor(__c, __d);
+  return (vector signed long long)__builtin_altivec_vperm_4si(
+      (vector int)__b, (vector int)__a, __d);
+#else
+  return (vector signed long long)__builtin_altivec_vperm_4si(
+      (vector int)__a, (vector int)__b, __c);
+#endif
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_perm(vector unsigned long long __a, vector unsigned long long __b,
+         vector unsigned char __c) {
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
+                              255, 255, 255, 255, 255, 255, 255, 255};
+  __d = vec_xor(__c, __d);
+  return (vector unsigned long long)__builtin_altivec_vperm_4si(
+      (vector int)__b, (vector int)__a, __d);
+#else
+  return (vector unsigned long long)__builtin_altivec_vperm_4si(
+      (vector int)__a, (vector int)__b, __c);
+#endif
+}
+
+static vector bool long long __ATTRS_o_ai
+vec_perm(vector bool long long __a, vector bool long long __b,
+         vector unsigned char __c) {
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
+                              255, 255, 255, 255, 255, 255, 255, 255};
+  __d = vec_xor(__c, __d);
+  return (vector bool long long)__builtin_altivec_vperm_4si(
+      (vector int)__b, (vector int)__a, __d);
+#else
+  return (vector bool long long)__builtin_altivec_vperm_4si(
+      (vector int)__a, (vector int)__b, __c);
+#endif
+}
+
+static vector double __ATTRS_o_ai vec_perm(vector double __a, vector double __b,
+                                           vector unsigned char __c) {
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
+                              255, 255, 255, 255, 255, 255, 255, 255};
+  __d = vec_xor(__c, __d);
+  return (vector double)__builtin_altivec_vperm_4si((vector int)__b,
+                                                    (vector int)__a, __d);
+#else
+  return (vector double)__builtin_altivec_vperm_4si((vector int)__a,
+                                                    (vector int)__b, __c);
+#endif
+}
+#endif
+
+/* vec_vperm */
+
+static vector signed char __ATTRS_o_ai vec_vperm(vector signed char __a,
+                                                 vector signed char __b,
+                                                 vector unsigned char __c) {
+  return vec_perm(__a, __b, __c);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vperm(vector unsigned char __a,
+                                                   vector unsigned char __b,
+                                                   vector unsigned char __c) {
+  return vec_perm(__a, __b, __c);
+}
+
+static vector bool char __ATTRS_o_ai vec_vperm(vector bool char __a,
+                                               vector bool char __b,
+                                               vector unsigned char __c) {
+  return vec_perm(__a, __b, __c);
+}
+
+static vector short __ATTRS_o_ai vec_vperm(vector short __a, vector short __b,
+                                           vector unsigned char __c) {
+  return vec_perm(__a, __b, __c);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vperm(vector unsigned short __a,
+                                                    vector unsigned short __b,
+                                                    vector unsigned char __c) {
+  return vec_perm(__a, __b, __c);
+}
+
+static vector bool short __ATTRS_o_ai vec_vperm(vector bool short __a,
+                                                vector bool short __b,
+                                                vector unsigned char __c) {
+  return vec_perm(__a, __b, __c);
+}
+
+static vector pixel __ATTRS_o_ai vec_vperm(vector pixel __a, vector pixel __b,
+                                           vector unsigned char __c) {
+  return vec_perm(__a, __b, __c);
+}
+
+static vector int __ATTRS_o_ai vec_vperm(vector int __a, vector int __b,
+                                         vector unsigned char __c) {
+  return vec_perm(__a, __b, __c);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vperm(vector unsigned int __a,
+                                                  vector unsigned int __b,
+                                                  vector unsigned char __c) {
+  return vec_perm(__a, __b, __c);
+}
+
+static vector bool int __ATTRS_o_ai vec_vperm(vector bool int __a,
+                                              vector bool int __b,
+                                              vector unsigned char __c) {
+  return vec_perm(__a, __b, __c);
+}
+
+static vector float __ATTRS_o_ai vec_vperm(vector float __a, vector float __b,
+                                           vector unsigned char __c) {
+  return vec_perm(__a, __b, __c);
+}
+
+#ifdef __VSX__
+static vector long long __ATTRS_o_ai vec_vperm(vector long long __a,
+                                               vector long long __b,
+                                               vector unsigned char __c) {
+  return vec_perm(__a, __b, __c);
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_vperm(vector unsigned long long __a, vector unsigned long long __b,
+          vector unsigned char __c) {
+  return vec_perm(__a, __b, __c);
+}
+
+static vector double __ATTRS_o_ai vec_vperm(vector double __a,
+                                            vector double __b,
+                                            vector unsigned char __c) {
+  return vec_perm(__a, __b, __c);
+}
+#endif
+
+/* vec_re */
+
+static vector float __ATTRS_o_ai
+vec_re(vector float __a) {
+#ifdef __VSX__
+  return __builtin_vsx_xvresp(__a);
+#else
+  return __builtin_altivec_vrefp(__a);
+#endif
+}
+
+#ifdef __VSX__
+static vector double __ATTRS_o_ai vec_re(vector double __a) {
+  return __builtin_vsx_xvredp(__a);
+}
+#endif
+
+/* vec_vrefp */
+
+static vector float __attribute__((__always_inline__))
+vec_vrefp(vector float __a) {
+  return __builtin_altivec_vrefp(__a);
+}
+
+/* vec_rl */
+
+static vector signed char __ATTRS_o_ai vec_rl(vector signed char __a,
+                                              vector unsigned char __b) {
+  return (vector signed char)__builtin_altivec_vrlb((vector char)__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_rl(vector unsigned char __a,
+                                                vector unsigned char __b) {
+  return (vector unsigned char)__builtin_altivec_vrlb((vector char)__a, __b);
+}
+
+static vector short __ATTRS_o_ai vec_rl(vector short __a,
+                                        vector unsigned short __b) {
+  return __builtin_altivec_vrlh(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_rl(vector unsigned short __a,
+                                                 vector unsigned short __b) {
+  return (vector unsigned short)__builtin_altivec_vrlh((vector short)__a, __b);
+}
+
+static vector int __ATTRS_o_ai vec_rl(vector int __a, vector unsigned int __b) {
+  return __builtin_altivec_vrlw(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_rl(vector unsigned int __a,
+                                               vector unsigned int __b) {
+  return (vector unsigned int)__builtin_altivec_vrlw((vector int)__a, __b);
+}
+
+#ifdef __POWER8_VECTOR__
+static vector signed long long __ATTRS_o_ai
+vec_rl(vector signed long long __a, vector unsigned long long __b) {
+  return __builtin_altivec_vrld(__a, __b);
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_rl(vector unsigned long long __a, vector unsigned long long __b) {
+  return __builtin_altivec_vrld(__a, __b);
+}
+#endif
+
+/* vec_vrlb */
+
+static vector signed char __ATTRS_o_ai vec_vrlb(vector signed char __a,
+                                                vector unsigned char __b) {
+  return (vector signed char)__builtin_altivec_vrlb((vector char)__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vrlb(vector unsigned char __a,
+                                                  vector unsigned char __b) {
+  return (vector unsigned char)__builtin_altivec_vrlb((vector char)__a, __b);
+}
+
+/* vec_vrlh */
+
+static vector short __ATTRS_o_ai vec_vrlh(vector short __a,
+                                          vector unsigned short __b) {
+  return __builtin_altivec_vrlh(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vrlh(vector unsigned short __a,
+                                                   vector unsigned short __b) {
+  return (vector unsigned short)__builtin_altivec_vrlh((vector short)__a, __b);
+}
+
+/* vec_vrlw */
+
+static vector int __ATTRS_o_ai vec_vrlw(vector int __a,
+                                        vector unsigned int __b) {
+  return __builtin_altivec_vrlw(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vrlw(vector unsigned int __a,
+                                                 vector unsigned int __b) {
+  return (vector unsigned int)__builtin_altivec_vrlw((vector int)__a, __b);
+}
+
+/* vec_round */
+
+static vector float __ATTRS_o_ai vec_round(vector float __a) {
+#ifdef __VSX__
+  return __builtin_vsx_xvrspi(__a);
+#else
+  return __builtin_altivec_vrfin(__a);
+#endif
+}
+
+#ifdef __VSX__
+static vector double __ATTRS_o_ai vec_round(vector double __a) {
+  return __builtin_vsx_xvrdpi(__a);
+}
+
+/* vec_rint */
+
+static vector float __ATTRS_o_ai
+vec_rint(vector float __a) {
+  return __builtin_vsx_xvrspic(__a);
+}
+
+static vector double __ATTRS_o_ai
+vec_rint(vector double __a) {
+  return __builtin_vsx_xvrdpic(__a);
+}
+
+/* vec_nearbyint */
+
+static vector float __ATTRS_o_ai vec_nearbyint(vector float __a) {
+  return __builtin_vsx_xvrspi(__a);
+}
+
+static vector double __ATTRS_o_ai vec_nearbyint(vector double __a) {
+  return __builtin_vsx_xvrdpi(__a);
+}
+#endif
+
+/* vec_vrfin */
+
+static vector float __attribute__((__always_inline__))
+vec_vrfin(vector float __a) {
+  return __builtin_altivec_vrfin(__a);
+}
+
+/* vec_sqrt */
+
+#ifdef __VSX__
+static vector float __ATTRS_o_ai vec_sqrt(vector float __a) {
+  return __builtin_vsx_xvsqrtsp(__a);
+}
+
+static vector double __ATTRS_o_ai vec_sqrt(vector double __a) {
+  return __builtin_vsx_xvsqrtdp(__a);
+}
+#endif
+
+/* vec_rsqrte */
+
+static vector float __ATTRS_o_ai
+vec_rsqrte(vector float __a) {
+#ifdef __VSX__
+  return __builtin_vsx_xvrsqrtesp(__a);
+#else
+  return __builtin_altivec_vrsqrtefp(__a);
+#endif
+}
+
+#ifdef __VSX__
+static vector double __ATTRS_o_ai vec_rsqrte(vector double __a) {
+  return __builtin_vsx_xvrsqrtedp(__a);
+}
+#endif
+
+/* vec_vrsqrtefp */
+
+static vector float __attribute__((__always_inline__))
+vec_vrsqrtefp(vector float __a) {
+  return __builtin_altivec_vrsqrtefp(__a);
+}
+
+/* vec_sel */
+
+#define __builtin_altivec_vsel_4si vec_sel
+
+static vector signed char __ATTRS_o_ai vec_sel(vector signed char __a,
+                                               vector signed char __b,
+                                               vector unsigned char __c) {
+  return (__a & ~(vector signed char)__c) | (__b & (vector signed char)__c);
+}
+
+static vector signed char __ATTRS_o_ai vec_sel(vector signed char __a,
+                                               vector signed char __b,
+                                               vector bool char __c) {
+  return (__a & ~(vector signed char)__c) | (__b & (vector signed char)__c);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_sel(vector unsigned char __a,
+                                                 vector unsigned char __b,
+                                                 vector unsigned char __c) {
+  return (__a & ~__c) | (__b & __c);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_sel(vector unsigned char __a,
+                                                 vector unsigned char __b,
+                                                 vector bool char __c) {
+  return (__a & ~(vector unsigned char)__c) | (__b & (vector unsigned char)__c);
+}
+
+static vector bool char __ATTRS_o_ai vec_sel(vector bool char __a,
+                                             vector bool char __b,
+                                             vector unsigned char __c) {
+  return (__a & ~(vector bool char)__c) | (__b & (vector bool char)__c);
+}
+
+static vector bool char __ATTRS_o_ai vec_sel(vector bool char __a,
+                                             vector bool char __b,
+                                             vector bool char __c) {
+  return (__a & ~__c) | (__b & __c);
+}
+
+static vector short __ATTRS_o_ai vec_sel(vector short __a, vector short __b,
+                                         vector unsigned short __c) {
+  return (__a & ~(vector short)__c) | (__b & (vector short)__c);
+}
+
+static vector short __ATTRS_o_ai vec_sel(vector short __a, vector short __b,
+                                         vector bool short __c) {
+  return (__a & ~(vector short)__c) | (__b & (vector short)__c);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_sel(vector unsigned short __a,
+                                                  vector unsigned short __b,
+                                                  vector unsigned short __c) {
+  return (__a & ~__c) | (__b & __c);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_sel(vector unsigned short __a,
+                                                  vector unsigned short __b,
+                                                  vector bool short __c) {
+  return (__a & ~(vector unsigned short)__c) |
+         (__b & (vector unsigned short)__c);
+}
+
+static vector bool short __ATTRS_o_ai vec_sel(vector bool short __a,
+                                              vector bool short __b,
+                                              vector unsigned short __c) {
+  return (__a & ~(vector bool short)__c) | (__b & (vector bool short)__c);
+}
+
+static vector bool short __ATTRS_o_ai vec_sel(vector bool short __a,
+                                              vector bool short __b,
+                                              vector bool short __c) {
+  return (__a & ~__c) | (__b & __c);
+}
+
+static vector int __ATTRS_o_ai vec_sel(vector int __a, vector int __b,
+                                       vector unsigned int __c) {
+  return (__a & ~(vector int)__c) | (__b & (vector int)__c);
+}
+
+static vector int __ATTRS_o_ai vec_sel(vector int __a, vector int __b,
+                                       vector bool int __c) {
+  return (__a & ~(vector int)__c) | (__b & (vector int)__c);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_sel(vector unsigned int __a,
+                                                vector unsigned int __b,
+                                                vector unsigned int __c) {
+  return (__a & ~__c) | (__b & __c);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_sel(vector unsigned int __a,
+                                                vector unsigned int __b,
+                                                vector bool int __c) {
+  return (__a & ~(vector unsigned int)__c) | (__b & (vector unsigned int)__c);
+}
+
+static vector bool int __ATTRS_o_ai vec_sel(vector bool int __a,
+                                            vector bool int __b,
+                                            vector unsigned int __c) {
+  return (__a & ~(vector bool int)__c) | (__b & (vector bool int)__c);
+}
+
+static vector bool int __ATTRS_o_ai vec_sel(vector bool int __a,
+                                            vector bool int __b,
+                                            vector bool int __c) {
+  return (__a & ~__c) | (__b & __c);
+}
+
+static vector float __ATTRS_o_ai vec_sel(vector float __a, vector float __b,
+                                         vector unsigned int __c) {
+  vector int __res = ((vector int)__a & ~(vector int)__c) |
+                     ((vector int)__b & (vector int)__c);
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai vec_sel(vector float __a, vector float __b,
+                                         vector bool int __c) {
+  vector int __res = ((vector int)__a & ~(vector int)__c) |
+                     ((vector int)__b & (vector int)__c);
+  return (vector float)__res;
+}
+
+#ifdef __VSX__
+static vector double __ATTRS_o_ai vec_sel(vector double __a, vector double __b,
+                                          vector bool long long __c) {
+  vector long long __res = ((vector long long)__a & ~(vector long long)__c) |
+                           ((vector long long)__b & (vector long long)__c);
+  return (vector double)__res;
+}
+
+static vector double __ATTRS_o_ai vec_sel(vector double __a, vector double __b,
+                                          vector unsigned long long __c) {
+  vector long long __res = ((vector long long)__a & ~(vector long long)__c) |
+                           ((vector long long)__b & (vector long long)__c);
+  return (vector double)__res;
+}
+#endif
+
+/* vec_vsel */
+
+static vector signed char __ATTRS_o_ai vec_vsel(vector signed char __a,
+                                                vector signed char __b,
+                                                vector unsigned char __c) {
+  return (__a & ~(vector signed char)__c) | (__b & (vector signed char)__c);
+}
+
+static vector signed char __ATTRS_o_ai vec_vsel(vector signed char __a,
+                                                vector signed char __b,
+                                                vector bool char __c) {
+  return (__a & ~(vector signed char)__c) | (__b & (vector signed char)__c);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vsel(vector unsigned char __a,
+                                                  vector unsigned char __b,
+                                                  vector unsigned char __c) {
+  return (__a & ~__c) | (__b & __c);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vsel(vector unsigned char __a,
+                                                  vector unsigned char __b,
+                                                  vector bool char __c) {
+  return (__a & ~(vector unsigned char)__c) | (__b & (vector unsigned char)__c);
+}
+
+static vector bool char __ATTRS_o_ai vec_vsel(vector bool char __a,
+                                              vector bool char __b,
+                                              vector unsigned char __c) {
+  return (__a & ~(vector bool char)__c) | (__b & (vector bool char)__c);
+}
+
+static vector bool char __ATTRS_o_ai vec_vsel(vector bool char __a,
+                                              vector bool char __b,
+                                              vector bool char __c) {
+  return (__a & ~__c) | (__b & __c);
+}
+
+static vector short __ATTRS_o_ai vec_vsel(vector short __a, vector short __b,
+                                          vector unsigned short __c) {
+  return (__a & ~(vector short)__c) | (__b & (vector short)__c);
+}
+
+static vector short __ATTRS_o_ai vec_vsel(vector short __a, vector short __b,
+                                          vector bool short __c) {
+  return (__a & ~(vector short)__c) | (__b & (vector short)__c);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vsel(vector unsigned short __a,
+                                                   vector unsigned short __b,
+                                                   vector unsigned short __c) {
+  return (__a & ~__c) | (__b & __c);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vsel(vector unsigned short __a,
+                                                   vector unsigned short __b,
+                                                   vector bool short __c) {
+  return (__a & ~(vector unsigned short)__c) |
+         (__b & (vector unsigned short)__c);
+}
+
+static vector bool short __ATTRS_o_ai vec_vsel(vector bool short __a,
+                                               vector bool short __b,
+                                               vector unsigned short __c) {
+  return (__a & ~(vector bool short)__c) | (__b & (vector bool short)__c);
+}
+
+static vector bool short __ATTRS_o_ai vec_vsel(vector bool short __a,
+                                               vector bool short __b,
+                                               vector bool short __c) {
+  return (__a & ~__c) | (__b & __c);
+}
+
+static vector int __ATTRS_o_ai vec_vsel(vector int __a, vector int __b,
+                                        vector unsigned int __c) {
+  return (__a & ~(vector int)__c) | (__b & (vector int)__c);
+}
+
+static vector int __ATTRS_o_ai vec_vsel(vector int __a, vector int __b,
+                                        vector bool int __c) {
+  return (__a & ~(vector int)__c) | (__b & (vector int)__c);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vsel(vector unsigned int __a,
+                                                 vector unsigned int __b,
+                                                 vector unsigned int __c) {
+  return (__a & ~__c) | (__b & __c);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vsel(vector unsigned int __a,
+                                                 vector unsigned int __b,
+                                                 vector bool int __c) {
+  return (__a & ~(vector unsigned int)__c) | (__b & (vector unsigned int)__c);
+}
+
+static vector bool int __ATTRS_o_ai vec_vsel(vector bool int __a,
+                                             vector bool int __b,
+                                             vector unsigned int __c) {
+  return (__a & ~(vector bool int)__c) | (__b & (vector bool int)__c);
+}
+
+static vector bool int __ATTRS_o_ai vec_vsel(vector bool int __a,
+                                             vector bool int __b,
+                                             vector bool int __c) {
+  return (__a & ~__c) | (__b & __c);
+}
+
+static vector float __ATTRS_o_ai vec_vsel(vector float __a, vector float __b,
+                                          vector unsigned int __c) {
+  vector int __res = ((vector int)__a & ~(vector int)__c) |
+                     ((vector int)__b & (vector int)__c);
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai vec_vsel(vector float __a, vector float __b,
+                                          vector bool int __c) {
+  vector int __res = ((vector int)__a & ~(vector int)__c) |
+                     ((vector int)__b & (vector int)__c);
+  return (vector float)__res;
+}
+
+/* vec_sl */
+
+static vector signed char __ATTRS_o_ai vec_sl(vector signed char __a,
+                                              vector unsigned char __b) {
+  return __a << (vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_sl(vector unsigned char __a,
+                                                vector unsigned char __b) {
+  return __a << __b;
+}
+
+static vector short __ATTRS_o_ai vec_sl(vector short __a,
+                                        vector unsigned short __b) {
+  return __a << (vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_sl(vector unsigned short __a,
+                                                 vector unsigned short __b) {
+  return __a << __b;
+}
+
+static vector int __ATTRS_o_ai vec_sl(vector int __a, vector unsigned int __b) {
+  return __a << (vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_sl(vector unsigned int __a,
+                                               vector unsigned int __b) {
+  return __a << __b;
+}
+
+#ifdef __POWER8_VECTOR__
+static vector signed long long __ATTRS_o_ai
+vec_sl(vector signed long long __a, vector unsigned long long __b) {
+  return __a << (vector long long)__b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_sl(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a << __b;
+}
+#endif
+
+/* vec_vslb */
+
+#define __builtin_altivec_vslb vec_vslb
+
+static vector signed char __ATTRS_o_ai vec_vslb(vector signed char __a,
+                                                vector unsigned char __b) {
+  return vec_sl(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vslb(vector unsigned char __a,
+                                                  vector unsigned char __b) {
+  return vec_sl(__a, __b);
+}
+
+/* vec_vslh */
+
+#define __builtin_altivec_vslh vec_vslh
+
+static vector short __ATTRS_o_ai vec_vslh(vector short __a,
+                                          vector unsigned short __b) {
+  return vec_sl(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vslh(vector unsigned short __a,
+                                                   vector unsigned short __b) {
+  return vec_sl(__a, __b);
+}
+
+/* vec_vslw */
+
+#define __builtin_altivec_vslw vec_vslw
+
+static vector int __ATTRS_o_ai vec_vslw(vector int __a,
+                                        vector unsigned int __b) {
+  return vec_sl(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vslw(vector unsigned int __a,
+                                                 vector unsigned int __b) {
+  return vec_sl(__a, __b);
+}
+
+/* vec_sld */
+
+#define __builtin_altivec_vsldoi_4si vec_sld
+
+static vector signed char __ATTRS_o_ai vec_sld(vector signed char __a,
+                                               vector signed char __b,
+                                               unsigned const int __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a,
+      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
+                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
+                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
+                             31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static vector unsigned char __ATTRS_o_ai vec_sld(vector unsigned char __a,
+                                                 vector unsigned char __b,
+                                                 unsigned const int __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a,
+      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
+                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
+                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
+                             31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static vector bool char __ATTRS_o_ai vec_sld(vector bool char __a,
+                                             vector bool char __b,
+                                             unsigned const int __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a,
+      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
+                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
+                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
+                             31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static vector signed short __ATTRS_o_ai vec_sld(vector signed short __a,
+                                                vector signed short __b,
+                                                unsigned const int __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a,
+      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
+                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
+                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
+                             31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static vector unsigned short __ATTRS_o_ai vec_sld(vector unsigned short __a,
+                                                  vector unsigned short __b,
+                                                  unsigned const int __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a,
+      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
+                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
+                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
+                             31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static vector bool short __ATTRS_o_ai vec_sld(vector bool short __a,
+                                              vector bool short __b,
+                                              unsigned const int __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a,
+      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
+                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
+                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
+                             31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static vector pixel __ATTRS_o_ai vec_sld(vector pixel __a, vector pixel __b,
+                                         unsigned const int __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a,
+      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
+                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
+                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
+                             31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static vector signed int __ATTRS_o_ai vec_sld(vector signed int __a,
+                                              vector signed int __b,
+                                              unsigned const int __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a,
+      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
+                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
+                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
+                             31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static vector unsigned int __ATTRS_o_ai vec_sld(vector unsigned int __a,
+                                                vector unsigned int __b,
+                                                unsigned const int __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a,
+      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
+                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
+                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
+                             31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static vector bool int __ATTRS_o_ai vec_sld(vector bool int __a,
+                                            vector bool int __b,
+                                            unsigned const int __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a,
+      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
+                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
+                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
+                             31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static vector float __ATTRS_o_ai vec_sld(vector float __a, vector float __b,
+                                         unsigned const int __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a,
+      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
+                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
+                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
+                             31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+/* vec_vsldoi */
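+/* Instruction-named form of vec_sld: the same vec_perm-based byte select,
+   taking the offset as an unsigned char. */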
+
+static vector signed char __ATTRS_o_ai vec_vsldoi(vector signed char __a,
+                                                  vector signed char __b,
+                                                  unsigned char __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a,
+      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
+                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
+                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
+                             31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vsldoi(vector unsigned char __a,
+                                                    vector unsigned char __b,
+                                                    unsigned char __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a,
+      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
+                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
+                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
+                             31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static vector short __ATTRS_o_ai vec_vsldoi(vector short __a, vector short __b,
+                                            unsigned char __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a,
+      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
+                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
+                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
+                             31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vsldoi(vector unsigned short __a,
+                                                     vector unsigned short __b,
+                                                     unsigned char __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a,
+      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
+                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
+                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
+                             31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static vector pixel __ATTRS_o_ai vec_vsldoi(vector pixel __a, vector pixel __b,
+                                            unsigned char __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a,
+      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
+                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
+                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
+                             31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static vector int __ATTRS_o_ai vec_vsldoi(vector int __a, vector int __b,
+                                          unsigned char __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a,
+      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
+                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
+                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
+                             31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vsldoi(vector unsigned int __a,
+                                                   vector unsigned int __b,
+                                                   unsigned char __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a,
+      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
+                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
+                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
+                             31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static vector float __ATTRS_o_ai vec_vsldoi(vector float __a, vector float __b,
+                                            unsigned char __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a,
+      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
+                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
+                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
+                             31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+/* vec_sll */
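+/* Shift left long: the whole 128-bit vector is shifted left by the bit
+   count held in the low-order bits of __b (vsl). All overloads funnel
+   through __builtin_altivec_vsl on (vector int) operands. */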
+
+static vector signed char __ATTRS_o_ai vec_sll(vector signed char __a,
+                                               vector unsigned char __b) {
+  return (vector signed char)__builtin_altivec_vsl((vector int)__a,
+                                                   (vector int)__b);
+}
+
+static vector signed char __ATTRS_o_ai vec_sll(vector signed char __a,
+                                               vector unsigned short __b) {
+  return (vector signed char)__builtin_altivec_vsl((vector int)__a,
+                                                   (vector int)__b);
+}
+
+static vector signed char __ATTRS_o_ai vec_sll(vector signed char __a,
+                                               vector unsigned int __b) {
+  return (vector signed char)__builtin_altivec_vsl((vector int)__a,
+                                                   (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_sll(vector unsigned char __a,
+                                                 vector unsigned char __b) {
+  return (vector unsigned char)__builtin_altivec_vsl((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_sll(vector unsigned char __a,
+                                                 vector unsigned short __b) {
+  return (vector unsigned char)__builtin_altivec_vsl((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_sll(vector unsigned char __a,
+                                                 vector unsigned int __b) {
+  return (vector unsigned char)__builtin_altivec_vsl((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static vector bool char __ATTRS_o_ai vec_sll(vector bool char __a,
+                                             vector unsigned char __b) {
+  return (vector bool char)__builtin_altivec_vsl((vector int)__a,
+                                                 (vector int)__b);
+}
+
+static vector bool char __ATTRS_o_ai vec_sll(vector bool char __a,
+                                             vector unsigned short __b) {
+  return (vector bool char)__builtin_altivec_vsl((vector int)__a,
+                                                 (vector int)__b);
+}
+
+static vector bool char __ATTRS_o_ai vec_sll(vector bool char __a,
+                                             vector unsigned int __b) {
+  return (vector bool char)__builtin_altivec_vsl((vector int)__a,
+                                                 (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai vec_sll(vector short __a,
+                                         vector unsigned char __b) {
+  return (vector short)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai vec_sll(vector short __a,
+                                         vector unsigned short __b) {
+  return (vector short)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai vec_sll(vector short __a,
+                                         vector unsigned int __b) {
+  return (vector short)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_sll(vector unsigned short __a,
+                                                  vector unsigned char __b) {
+  return (vector unsigned short)__builtin_altivec_vsl((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_sll(vector unsigned short __a,
+                                                  vector unsigned short __b) {
+  return (vector unsigned short)__builtin_altivec_vsl((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_sll(vector unsigned short __a,
+                                                  vector unsigned int __b) {
+  return (vector unsigned short)__builtin_altivec_vsl((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static vector bool short __ATTRS_o_ai vec_sll(vector bool short __a,
+                                              vector unsigned char __b) {
+  return (vector bool short)__builtin_altivec_vsl((vector int)__a,
+                                                  (vector int)__b);
+}
+
+static vector bool short __ATTRS_o_ai vec_sll(vector bool short __a,
+                                              vector unsigned short __b) {
+  return (vector bool short)__builtin_altivec_vsl((vector int)__a,
+                                                  (vector int)__b);
+}
+
+static vector bool short __ATTRS_o_ai vec_sll(vector bool short __a,
+                                              vector unsigned int __b) {
+  return (vector bool short)__builtin_altivec_vsl((vector int)__a,
+                                                  (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai vec_sll(vector pixel __a,
+                                         vector unsigned char __b) {
+  return (vector pixel)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai vec_sll(vector pixel __a,
+                                         vector unsigned short __b) {
+  return (vector pixel)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai vec_sll(vector pixel __a,
+                                         vector unsigned int __b) {
+  return (vector pixel)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai vec_sll(vector int __a,
+                                       vector unsigned char __b) {
+  return (vector int)__builtin_altivec_vsl(__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai vec_sll(vector int __a,
+                                       vector unsigned short __b) {
+  return (vector int)__builtin_altivec_vsl(__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai vec_sll(vector int __a,
+                                       vector unsigned int __b) {
+  return (vector int)__builtin_altivec_vsl(__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_sll(vector unsigned int __a,
+                                                vector unsigned char __b) {
+  return (vector unsigned int)__builtin_altivec_vsl((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_sll(vector unsigned int __a,
+                                                vector unsigned short __b) {
+  return (vector unsigned int)__builtin_altivec_vsl((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_sll(vector unsigned int __a,
+                                                vector unsigned int __b) {
+  return (vector unsigned int)__builtin_altivec_vsl((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static vector bool int __ATTRS_o_ai vec_sll(vector bool int __a,
+                                            vector unsigned char __b) {
+  return (vector bool int)__builtin_altivec_vsl((vector int)__a,
+                                                (vector int)__b);
+}
+
+static vector bool int __ATTRS_o_ai vec_sll(vector bool int __a,
+                                            vector unsigned short __b) {
+  return (vector bool int)__builtin_altivec_vsl((vector int)__a,
+                                                (vector int)__b);
+}
+
+static vector bool int __ATTRS_o_ai vec_sll(vector bool int __a,
+                                            vector unsigned int __b) {
+  return (vector bool int)__builtin_altivec_vsl((vector int)__a,
+                                                (vector int)__b);
+}
+
+/* vec_vsl */
+
+static vector signed char __ATTRS_o_ai vec_vsl(vector signed char __a,
+                                               vector unsigned char __b) {
+  return (vector signed char)__builtin_altivec_vsl((vector int)__a,
+                                                   (vector int)__b);
+}
+
+static vector signed char __ATTRS_o_ai vec_vsl(vector signed char __a,
+                                               vector unsigned short __b) {
+  return (vector signed char)__builtin_altivec_vsl((vector int)__a,
+                                                   (vector int)__b);
+}
+
+static vector signed char __ATTRS_o_ai vec_vsl(vector signed char __a,
+                                               vector unsigned int __b) {
+  return (vector signed char)__builtin_altivec_vsl((vector int)__a,
+                                                   (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vsl(vector unsigned char __a,
+                                                 vector unsigned char __b) {
+  return (vector unsigned char)__builtin_altivec_vsl((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vsl(vector unsigned char __a,
+                                                 vector unsigned short __b) {
+  return (vector unsigned char)__builtin_altivec_vsl((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vsl(vector unsigned char __a,
+                                                 vector unsigned int __b) {
+  return (vector unsigned char)__builtin_altivec_vsl((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static vector bool char __ATTRS_o_ai vec_vsl(vector bool char __a,
+                                             vector unsigned char __b) {
+  return (vector bool char)__builtin_altivec_vsl((vector int)__a,
+                                                 (vector int)__b);
+}
+
+static vector bool char __ATTRS_o_ai vec_vsl(vector bool char __a,
+                                             vector unsigned short __b) {
+  return (vector bool char)__builtin_altivec_vsl((vector int)__a,
+                                                 (vector int)__b);
+}
+
+static vector bool char __ATTRS_o_ai vec_vsl(vector bool char __a,
+                                             vector unsigned int __b) {
+  return (vector bool char)__builtin_altivec_vsl((vector int)__a,
+                                                 (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai vec_vsl(vector short __a,
+                                         vector unsigned char __b) {
+  return (vector short)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai vec_vsl(vector short __a,
+                                         vector unsigned short __b) {
+  return (vector short)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai vec_vsl(vector short __a,
+                                         vector unsigned int __b) {
+  return (vector short)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vsl(vector unsigned short __a,
+                                                  vector unsigned char __b) {
+  return (vector unsigned short)__builtin_altivec_vsl((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vsl(vector unsigned short __a,
+                                                  vector unsigned short __b) {
+  return (vector unsigned short)__builtin_altivec_vsl((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vsl(vector unsigned short __a,
+                                                  vector unsigned int __b) {
+  return (vector unsigned short)__builtin_altivec_vsl((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static vector bool short __ATTRS_o_ai vec_vsl(vector bool short __a,
+                                              vector unsigned char __b) {
+  return (vector bool short)__builtin_altivec_vsl((vector int)__a,
+                                                  (vector int)__b);
+}
+
+static vector bool short __ATTRS_o_ai vec_vsl(vector bool short __a,
+                                              vector unsigned short __b) {
+  return (vector bool short)__builtin_altivec_vsl((vector int)__a,
+                                                  (vector int)__b);
+}
+
+static vector bool short __ATTRS_o_ai vec_vsl(vector bool short __a,
+                                              vector unsigned int __b) {
+  return (vector bool short)__builtin_altivec_vsl((vector int)__a,
+                                                  (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai vec_vsl(vector pixel __a,
+                                         vector unsigned char __b) {
+  return (vector pixel)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai vec_vsl(vector pixel __a,
+                                         vector unsigned short __b) {
+  return (vector pixel)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai vec_vsl(vector pixel __a,
+                                         vector unsigned int __b) {
+  return (vector pixel)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai vec_vsl(vector int __a,
+                                       vector unsigned char __b) {
+  return (vector int)__builtin_altivec_vsl(__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai vec_vsl(vector int __a,
+                                       vector unsigned short __b) {
+  return (vector int)__builtin_altivec_vsl(__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai vec_vsl(vector int __a,
+                                       vector unsigned int __b) {
+  return (vector int)__builtin_altivec_vsl(__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vsl(vector unsigned int __a,
+                                                vector unsigned char __b) {
+  return (vector unsigned int)__builtin_altivec_vsl((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vsl(vector unsigned int __a,
+                                                vector unsigned short __b) {
+  return (vector unsigned int)__builtin_altivec_vsl((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vsl(vector unsigned int __a,
+                                                vector unsigned int __b) {
+  return (vector unsigned int)__builtin_altivec_vsl((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static vector bool int __ATTRS_o_ai vec_vsl(vector bool int __a,
+                                            vector unsigned char __b) {
+  return (vector bool int)__builtin_altivec_vsl((vector int)__a,
+                                                (vector int)__b);
+}
+
+static vector bool int __ATTRS_o_ai vec_vsl(vector bool int __a,
+                                            vector unsigned short __b) {
+  return (vector bool int)__builtin_altivec_vsl((vector int)__a,
+                                                (vector int)__b);
+}
+
+static vector bool int __ATTRS_o_ai vec_vsl(vector bool int __a,
+                                            vector unsigned int __b) {
+  return (vector bool int)__builtin_altivec_vsl((vector int)__a,
+                                                (vector int)__b);
+}
+
+/* vec_slo */
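+/* Shift left by octet: the whole 128-bit vector is shifted left by a
+   byte count taken from __b (vslo). */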
+
+static vector signed char __ATTRS_o_ai vec_slo(vector signed char __a,
+                                               vector signed char __b) {
+  return (vector signed char)__builtin_altivec_vslo((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static vector signed char __ATTRS_o_ai vec_slo(vector signed char __a,
+                                               vector unsigned char __b) {
+  return (vector signed char)__builtin_altivec_vslo((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_slo(vector unsigned char __a,
+                                                 vector signed char __b) {
+  return (vector unsigned char)__builtin_altivec_vslo((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_slo(vector unsigned char __a,
+                                                 vector unsigned char __b) {
+  return (vector unsigned char)__builtin_altivec_vslo((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai vec_slo(vector short __a,
+                                         vector signed char __b) {
+  return (vector short)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai vec_slo(vector short __a,
+                                         vector unsigned char __b) {
+  return (vector short)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_slo(vector unsigned short __a,
+                                                  vector signed char __b) {
+  return (vector unsigned short)__builtin_altivec_vslo((vector int)__a,
+                                                       (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_slo(vector unsigned short __a,
+                                                  vector unsigned char __b) {
+  return (vector unsigned short)__builtin_altivec_vslo((vector int)__a,
+                                                       (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai vec_slo(vector pixel __a,
+                                         vector signed char __b) {
+  return (vector pixel)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai vec_slo(vector pixel __a,
+                                         vector unsigned char __b) {
+  return (vector pixel)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai vec_slo(vector int __a, vector signed char __b) {
+  return (vector int)__builtin_altivec_vslo(__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai vec_slo(vector int __a,
+                                       vector unsigned char __b) {
+  return (vector int)__builtin_altivec_vslo(__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_slo(vector unsigned int __a,
+                                                vector signed char __b) {
+  return (vector unsigned int)__builtin_altivec_vslo((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_slo(vector unsigned int __a,
+                                                vector unsigned char __b) {
+  return (vector unsigned int)__builtin_altivec_vslo((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static vector float __ATTRS_o_ai vec_slo(vector float __a,
+                                         vector signed char __b) {
+  return (vector float)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector float __ATTRS_o_ai vec_slo(vector float __a,
+                                         vector unsigned char __b) {
+  return (vector float)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+/* vec_vslo */
+
+static vector signed char __ATTRS_o_ai vec_vslo(vector signed char __a,
+                                                vector signed char __b) {
+  return (vector signed char)__builtin_altivec_vslo((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static vector signed char __ATTRS_o_ai vec_vslo(vector signed char __a,
+                                                vector unsigned char __b) {
+  return (vector signed char)__builtin_altivec_vslo((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vslo(vector unsigned char __a,
+                                                  vector signed char __b) {
+  return (vector unsigned char)__builtin_altivec_vslo((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vslo(vector unsigned char __a,
+                                                  vector unsigned char __b) {
+  return (vector unsigned char)__builtin_altivec_vslo((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai vec_vslo(vector short __a,
+                                          vector signed char __b) {
+  return (vector short)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai vec_vslo(vector short __a,
+                                          vector unsigned char __b) {
+  return (vector short)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vslo(vector unsigned short __a,
+                                                   vector signed char __b) {
+  return (vector unsigned short)__builtin_altivec_vslo((vector int)__a,
+                                                       (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vslo(vector unsigned short __a,
+                                                   vector unsigned char __b) {
+  return (vector unsigned short)__builtin_altivec_vslo((vector int)__a,
+                                                       (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai vec_vslo(vector pixel __a,
+                                          vector signed char __b) {
+  return (vector pixel)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai vec_vslo(vector pixel __a,
+                                          vector unsigned char __b) {
+  return (vector pixel)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai vec_vslo(vector int __a,
+                                        vector signed char __b) {
+  return (vector int)__builtin_altivec_vslo(__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai vec_vslo(vector int __a,
+                                        vector unsigned char __b) {
+  return (vector int)__builtin_altivec_vslo(__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vslo(vector unsigned int __a,
+                                                 vector signed char __b) {
+  return (vector unsigned int)__builtin_altivec_vslo((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vslo(vector unsigned int __a,
+                                                 vector unsigned char __b) {
+  return (vector unsigned int)__builtin_altivec_vslo((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static vector float __ATTRS_o_ai vec_vslo(vector float __a,
+                                          vector signed char __b) {
+  return (vector float)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector float __ATTRS_o_ai vec_vslo(vector float __a,
+                                          vector unsigned char __b) {
+  return (vector float)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+/* vec_splat */
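+/* Broadcast element __b of __a into every element. The index is masked to
+   the element count for each width and expanded into a vec_perm control
+   vector. */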
+
+static vector signed char __ATTRS_o_ai vec_splat(vector signed char __a,
+                                                 unsigned const int __b) {
+  return vec_perm(__a, __a, (vector unsigned char)(__b & 0x0F));
+}
+
+static vector unsigned char __ATTRS_o_ai vec_splat(vector unsigned char __a,
+                                                   unsigned const int __b) {
+  return vec_perm(__a, __a, (vector unsigned char)(__b & 0x0F));
+}
+
+static vector bool char __ATTRS_o_ai vec_splat(vector bool char __a,
+                                               unsigned const int __b) {
+  return vec_perm(__a, __a, (vector unsigned char)(__b & 0x0F));
+}
+
+static vector signed short __ATTRS_o_ai vec_splat(vector signed short __a,
+                                                  unsigned const int __b) {
+  unsigned char b0 = (__b & 0x07) * 2;
+  unsigned char b1 = b0 + 1;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(b0, b1, b0, b1, b0, b1, b0, b1,
+                                         b0, b1, b0, b1, b0, b1, b0, b1));
+}
+
+static vector unsigned short __ATTRS_o_ai vec_splat(vector unsigned short __a,
+                                                    unsigned const int __b) {
+  unsigned char b0 = (__b & 0x07) * 2;
+  unsigned char b1 = b0 + 1;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(b0, b1, b0, b1, b0, b1, b0, b1,
+                                         b0, b1, b0, b1, b0, b1, b0, b1));
+}
+
+static vector bool short __ATTRS_o_ai vec_splat(vector bool short __a,
+                                                unsigned const int __b) {
+  unsigned char b0 = (__b & 0x07) * 2;
+  unsigned char b1 = b0 + 1;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(b0, b1, b0, b1, b0, b1, b0, b1,
+                                         b0, b1, b0, b1, b0, b1, b0, b1));
+}
+
+static vector pixel __ATTRS_o_ai vec_splat(vector pixel __a,
+                                           unsigned const int __b) {
+  unsigned char b0 = (__b & 0x07) * 2;
+  unsigned char b1 = b0 + 1;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(b0, b1, b0, b1, b0, b1, b0, b1,
+                                         b0, b1, b0, b1, b0, b1, b0, b1));
+}
+
+static vector signed int __ATTRS_o_ai vec_splat(vector signed int __a,
+                                                unsigned const int __b) {
+  unsigned char b0 = (__b & 0x03) * 4;
+  unsigned char b1 = b0 + 1, b2 = b0 + 2, b3 = b0 + 3;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(b0, b1, b2, b3, b0, b1, b2, b3, b0,
+                                         b1, b2, b3, b0, b1, b2, b3));
+}
+
+static vector unsigned int __ATTRS_o_ai vec_splat(vector unsigned int __a,
+                                                  unsigned const int __b) {
+  unsigned char b0 = (__b & 0x03) * 4;
+  unsigned char b1 = b0 + 1, b2 = b0 + 2, b3 = b0 + 3;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(b0, b1, b2, b3, b0, b1, b2, b3, b0,
+                                         b1, b2, b3, b0, b1, b2, b3));
+}
+
+static vector bool int __ATTRS_o_ai vec_splat(vector bool int __a,
+                                              unsigned const int __b) {
+  unsigned char b0 = (__b & 0x03) * 4;
+  unsigned char b1 = b0 + 1, b2 = b0 + 2, b3 = b0 + 3;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(b0, b1, b2, b3, b0, b1, b2, b3, b0,
+                                         b1, b2, b3, b0, b1, b2, b3));
+}
+
+static vector float __ATTRS_o_ai vec_splat(vector float __a,
+                                           unsigned const int __b) {
+  unsigned char b0 = (__b & 0x03) * 4;
+  unsigned char b1 = b0 + 1, b2 = b0 + 2, b3 = b0 + 3;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(b0, b1, b2, b3, b0, b1, b2, b3, b0,
+                                         b1, b2, b3, b0, b1, b2, b3));
+}
+
+#ifdef __VSX__
+static vector double __ATTRS_o_ai vec_splat(vector double __a,
+                                            unsigned const int __b) {
+  unsigned char b0 = (__b & 0x01) * 8;
+  unsigned char b1 = b0 + 1, b2 = b0 + 2, b3 = b0 + 3, b4 = b0 + 4,
+                b5 = b0 + 5, b6 = b0 + 6, b7 = b0 + 7;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(b0, b1, b2, b3, b4, b5, b6, b7,
+                                         b0, b1, b2, b3, b4, b5, b6, b7));
+}
+static vector bool long long __ATTRS_o_ai vec_splat(vector bool long long __a,
+                                                    unsigned const int __b) {
+  unsigned char b0 = (__b & 0x01) * 8;
+  unsigned char b1 = b0 + 1, b2 = b0 + 2, b3 = b0 + 3, b4 = b0 + 4,
+                b5 = b0 + 5, b6 = b0 + 6, b7 = b0 + 7;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(b0, b1, b2, b3, b4, b5, b6, b7,
+                                         b0, b1, b2, b3, b4, b5, b6, b7));
+}
+static vector signed long long __ATTRS_o_ai
+vec_splat(vector signed long long __a, unsigned const int __b) {
+  unsigned char b0 = (__b & 0x01) * 8;
+  unsigned char b1 = b0 + 1, b2 = b0 + 2, b3 = b0 + 3, b4 = b0 + 4,
+                b5 = b0 + 5, b6 = b0 + 6, b7 = b0 + 7;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(b0, b1, b2, b3, b4, b5, b6, b7,
+                                         b0, b1, b2, b3, b4, b5, b6, b7));
+}
+static vector unsigned long long __ATTRS_o_ai
+vec_splat(vector unsigned long long __a, unsigned const int __b) {
+  unsigned char b0 = (__b & 0x01) * 8;
+  unsigned char b1 = b0 + 1, b2 = b0 + 2, b3 = b0 + 3, b4 = b0 + 4,
+                b5 = b0 + 5, b6 = b0 + 6, b7 = b0 + 7;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(b0, b1, b2, b3, b4, b5, b6, b7,
+                                         b0, b1, b2, b3, b4, b5, b6, b7));
+}
+#endif
+
+/* vec_vspltb */
+
+#define __builtin_altivec_vspltb vec_vspltb
+
+static vector signed char __ATTRS_o_ai vec_vspltb(vector signed char __a,
+                                                  unsigned char __b) {
+  return vec_perm(__a, __a, (vector unsigned char)(__b));
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vspltb(vector unsigned char __a,
+                                                    unsigned char __b) {
+  return vec_perm(__a, __a, (vector unsigned char)(__b));
+}
+
+static vector bool char __ATTRS_o_ai vec_vspltb(vector bool char __a,
+                                                unsigned char __b) {
+  return vec_perm(__a, __a, (vector unsigned char)(__b));
+}
+
+/* vec_vsplth */
+
+#define __builtin_altivec_vsplth vec_vsplth
+
+static vector short __ATTRS_o_ai vec_vsplth(vector short __a,
+                                            unsigned char __b) {
+  __b *= 2;
+  unsigned char b1 = __b + 1;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(__b, b1, __b, b1, __b, b1, __b, b1,
+                                         __b, b1, __b, b1, __b, b1, __b, b1));
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vsplth(vector unsigned short __a,
+                                                     unsigned char __b) {
+  __b *= 2;
+  unsigned char b1 = __b + 1;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(__b, b1, __b, b1, __b, b1, __b, b1,
+                                         __b, b1, __b, b1, __b, b1, __b, b1));
+}
+
+static vector bool short __ATTRS_o_ai vec_vsplth(vector bool short __a,
+                                                 unsigned char __b) {
+  __b *= 2;
+  unsigned char b1 = __b + 1;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(__b, b1, __b, b1, __b, b1, __b, b1,
+                                         __b, b1, __b, b1, __b, b1, __b, b1));
+}
+
+static vector pixel __ATTRS_o_ai vec_vsplth(vector pixel __a,
+                                            unsigned char __b) {
+  __b *= 2;
+  unsigned char b1 = __b + 1;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(__b, b1, __b, b1, __b, b1, __b, b1,
+                                         __b, b1, __b, b1, __b, b1, __b, b1));
+}
+
+/* vec_vspltw */
+
+#define __builtin_altivec_vspltw vec_vspltw
+
+static vector int __ATTRS_o_ai vec_vspltw(vector int __a, unsigned char __b) {
+  __b *= 4;
+  unsigned char b1 = __b + 1, b2 = __b + 2, b3 = __b + 3;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(__b, b1, b2, b3, __b, b1, b2, b3, __b,
+                                         b1, b2, b3, __b, b1, b2, b3));
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vspltw(vector unsigned int __a,
+                                                   unsigned char __b) {
+  __b *= 4;
+  unsigned char b1 = __b + 1, b2 = __b + 2, b3 = __b + 3;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(__b, b1, b2, b3, __b, b1, b2, b3, __b,
+                                         b1, b2, b3, __b, b1, b2, b3));
+}
+
+static vector bool int __ATTRS_o_ai vec_vspltw(vector bool int __a,
+                                               unsigned char __b) {
+  __b *= 4;
+  unsigned char b1 = __b + 1, b2 = __b + 2, b3 = __b + 3;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(__b, b1, b2, b3, __b, b1, b2, b3, __b,
+                                         b1, b2, b3, __b, b1, b2, b3));
+}
+
+static vector float __ATTRS_o_ai vec_vspltw(vector float __a,
+                                            unsigned char __b) {
+  __b *= 4;
+  unsigned char b1 = __b + 1, b2 = __b + 2, b3 = __b + 3;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(__b, b1, b2, b3, __b, b1, b2, b3, __b,
+                                         b1, b2, b3, __b, b1, b2, b3));
+}
+
+/* vec_splat_s8 */
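+/* vec_splat_s8/s16/s32 and vec_splat_u8/u16/u32 splat a small literal into
+   every element (vspltisb/vspltish/vspltisw). As the FIXMEs note, the
+   argument is not yet validated as a 5-bit signed literal. */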
+
+#define __builtin_altivec_vspltisb vec_splat_s8
+
+// FIXME: parameter should be treated as 5-bit signed literal
+static vector signed char __ATTRS_o_ai vec_splat_s8(signed char __a) {
+  return (vector signed char)(__a);
+}
+
+/* vec_vspltisb */
+
+// FIXME: parameter should be treated as 5-bit signed literal
+static vector signed char __ATTRS_o_ai vec_vspltisb(signed char __a) {
+  return (vector signed char)(__a);
+}
+
+/* vec_splat_s16 */
+
+#define __builtin_altivec_vspltish vec_splat_s16
+
+// FIXME: parameter should be treated as 5-bit signed literal
+static vector short __ATTRS_o_ai vec_splat_s16(signed char __a) {
+  return (vector short)(__a);
+}
+
+/* vec_vspltish */
+
+// FIXME: parameter should be treated as 5-bit signed literal
+static vector short __ATTRS_o_ai vec_vspltish(signed char __a) {
+  return (vector short)(__a);
+}
+
+/* vec_splat_s32 */
+
+#define __builtin_altivec_vspltisw vec_splat_s32
+
+// FIXME: parameter should be treated as 5-bit signed literal
+static vector int __ATTRS_o_ai vec_splat_s32(signed char __a) {
+  return (vector int)(__a);
+}
+
+/* vec_vspltisw */
+
+// FIXME: parameter should be treated as 5-bit signed literal
+static vector int __ATTRS_o_ai vec_vspltisw(signed char __a) {
+  return (vector int)(__a);
+}
+
+/* vec_splat_u8 */
+
+// FIXME: parameter should be treated as 5-bit signed literal
+static vector unsigned char __ATTRS_o_ai vec_splat_u8(unsigned char __a) {
+  return (vector unsigned char)(__a);
+}
+
+/* vec_splat_u16 */
+
+// FIXME: parameter should be treated as 5-bit signed literal
+static vector unsigned short __ATTRS_o_ai vec_splat_u16(signed char __a) {
+  return (vector unsigned short)(__a);
+}
+
+/* vec_splat_u32 */
+
+// FIXME: parameter should be treated as 5-bit signed literal
+static vector unsigned int __ATTRS_o_ai vec_splat_u32(signed char __a) {
+  return (vector unsigned int)(__a);
+}
+
+/* vec_sr */
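+/* Per-element logical (zero-fill) shift right. Signed inputs are cast to
+   unsigned so the >> is logical, then cast back to the signed type. */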
+
+static vector signed char __ATTRS_o_ai vec_sr(vector signed char __a,
+                                              vector unsigned char __b) {
+  vector unsigned char __res = (vector unsigned char)__a >> __b;
+  return (vector signed char)__res;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_sr(vector unsigned char __a,
+                                                vector unsigned char __b) {
+  return __a >> __b;
+}
+
+static vector signed short __ATTRS_o_ai vec_sr(vector signed short __a,
+                                               vector unsigned short __b) {
+  vector unsigned short __res = (vector unsigned short)__a >> __b;
+  return (vector signed short)__res;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_sr(vector unsigned short __a,
+                                                 vector unsigned short __b) {
+  return __a >> __b;
+}
+
+static vector signed int __ATTRS_o_ai vec_sr(vector signed int __a,
+                                             vector unsigned int __b) {
+  vector unsigned int __res = (vector unsigned int)__a >> __b;
+  return (vector signed int)__res;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_sr(vector unsigned int __a,
+                                               vector unsigned int __b) {
+  return __a >> __b;
+}
+
+#ifdef __POWER8_VECTOR__
+static vector signed long long __ATTRS_o_ai
+vec_sr(vector signed long long __a, vector unsigned long long __b) {
+  vector unsigned long long __res = (vector unsigned long long)__a >> __b;
+  return (vector signed long long)__res;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_sr(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a >> __b;
+}
+#endif
+
+/* vec_vsrb */
+
+#define __builtin_altivec_vsrb vec_vsrb
+
+static vector signed char __ATTRS_o_ai vec_vsrb(vector signed char __a,
+                                                vector unsigned char __b) {
+  return __a >> (vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vsrb(vector unsigned char __a,
+                                                  vector unsigned char __b) {
+  return __a >> __b;
+}
+
+/* vec_vsrh */
+
+#define __builtin_altivec_vsrh vec_vsrh
+
+static vector short __ATTRS_o_ai vec_vsrh(vector short __a,
+                                          vector unsigned short __b) {
+  return __a >> (vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vsrh(vector unsigned short __a,
+                                                   vector unsigned short __b) {
+  return __a >> __b;
+}
+
+/* vec_vsrw */
+
+#define __builtin_altivec_vsrw vec_vsrw
+
+static vector int __ATTRS_o_ai vec_vsrw(vector int __a,
+                                        vector unsigned int __b) {
+  return __a >> (vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vsrw(vector unsigned int __a,
+                                                 vector unsigned int __b) {
+  return __a >> __b;
+}
+
+/* vec_sra */
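+/* Per-element arithmetic (sign-extending) shift right via vsrab/vsrah/
+   vsraw; the POWER8 doubleword forms shift as signed values directly. */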
+
+static vector signed char __ATTRS_o_ai vec_sra(vector signed char __a,
+                                               vector unsigned char __b) {
+  return (vector signed char)__builtin_altivec_vsrab((vector char)__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_sra(vector unsigned char __a,
+                                                 vector unsigned char __b) {
+  return (vector unsigned char)__builtin_altivec_vsrab((vector char)__a, __b);
+}
+
+static vector short __ATTRS_o_ai vec_sra(vector short __a,
+                                         vector unsigned short __b) {
+  return __builtin_altivec_vsrah(__a, (vector unsigned short)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_sra(vector unsigned short __a,
+                                                  vector unsigned short __b) {
+  return (vector unsigned short)__builtin_altivec_vsrah((vector short)__a, __b);
+}
+
+static vector int __ATTRS_o_ai vec_sra(vector int __a,
+                                       vector unsigned int __b) {
+  return __builtin_altivec_vsraw(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_sra(vector unsigned int __a,
+                                                vector unsigned int __b) {
+  return (vector unsigned int)__builtin_altivec_vsraw((vector int)__a, __b);
+}
+
+#ifdef __POWER8_VECTOR__
+static vector signed long long __ATTRS_o_ai
+vec_sra(vector signed long long __a, vector unsigned long long __b) {
+  return __a >> __b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_sra(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector unsigned long long)((vector signed long long)__a >> __b);
+}
+#endif
+
+/* vec_vsrab */
+
+static vector signed char __ATTRS_o_ai vec_vsrab(vector signed char __a,
+                                                 vector unsigned char __b) {
+  return (vector signed char)__builtin_altivec_vsrab((vector char)__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vsrab(vector unsigned char __a,
+                                                   vector unsigned char __b) {
+  return (vector unsigned char)__builtin_altivec_vsrab((vector char)__a, __b);
+}
+
+/* vec_vsrah */
+
+static vector short __ATTRS_o_ai vec_vsrah(vector short __a,
+                                           vector unsigned short __b) {
+  return __builtin_altivec_vsrah(__a, (vector unsigned short)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vsrah(vector unsigned short __a,
+                                                    vector unsigned short __b) {
+  return (vector unsigned short)__builtin_altivec_vsrah((vector short)__a, __b);
+}
+
+/* vec_vsraw */
+
+static vector int __ATTRS_o_ai vec_vsraw(vector int __a,
+                                         vector unsigned int __b) {
+  return __builtin_altivec_vsraw(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vsraw(vector unsigned int __a,
+                                                  vector unsigned int __b) {
+  return (vector unsigned int)__builtin_altivec_vsraw((vector int)__a, __b);
+}
+
+/* vec_srl */
+
+static vector signed char __ATTRS_o_ai vec_srl(vector signed char __a,
+                                               vector unsigned char __b) {
+  return (vector signed char)__builtin_altivec_vsr((vector int)__a,
+                                                   (vector int)__b);
+}
+
+static vector signed char __ATTRS_o_ai vec_srl(vector signed char __a,
+                                               vector unsigned short __b) {
+  return (vector signed char)__builtin_altivec_vsr((vector int)__a,
+                                                   (vector int)__b);
+}
+
+static vector signed char __ATTRS_o_ai vec_srl(vector signed char __a,
+                                               vector unsigned int __b) {
+  return (vector signed char)__builtin_altivec_vsr((vector int)__a,
+                                                   (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_srl(vector unsigned char __a,
+                                                 vector unsigned char __b) {
+  return (vector unsigned char)__builtin_altivec_vsr((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_srl(vector unsigned char __a,
+                                                 vector unsigned short __b) {
+  return (vector unsigned char)__builtin_altivec_vsr((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_srl(vector unsigned char __a,
+                                                 vector unsigned int __b) {
+  return (vector unsigned char)__builtin_altivec_vsr((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static vector bool char __ATTRS_o_ai vec_srl(vector bool char __a,
+                                             vector unsigned char __b) {
+  return (vector bool char)__builtin_altivec_vsr((vector int)__a,
+                                                 (vector int)__b);
+}
+
+static vector bool char __ATTRS_o_ai vec_srl(vector bool char __a,
+                                             vector unsigned short __b) {
+  return (vector bool char)__builtin_altivec_vsr((vector int)__a,
+                                                 (vector int)__b);
+}
+
+static vector bool char __ATTRS_o_ai vec_srl(vector bool char __a,
+                                             vector unsigned int __b) {
+  return (vector bool char)__builtin_altivec_vsr((vector int)__a,
+                                                 (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai vec_srl(vector short __a,
+                                         vector unsigned char __b) {
+  return (vector short)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai vec_srl(vector short __a,
+                                         vector unsigned short __b) {
+  return (vector short)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai vec_srl(vector short __a,
+                                         vector unsigned int __b) {
+  return (vector short)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_srl(vector unsigned short __a,
+                                                  vector unsigned char __b) {
+  return (vector unsigned short)__builtin_altivec_vsr((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_srl(vector unsigned short __a,
+                                                  vector unsigned short __b) {
+  return (vector unsigned short)__builtin_altivec_vsr((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_srl(vector unsigned short __a,
+                                                  vector unsigned int __b) {
+  return (vector unsigned short)__builtin_altivec_vsr((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static vector bool short __ATTRS_o_ai vec_srl(vector bool short __a,
+                                              vector unsigned char __b) {
+  return (vector bool short)__builtin_altivec_vsr((vector int)__a,
+                                                  (vector int)__b);
+}
+
+static vector bool short __ATTRS_o_ai vec_srl(vector bool short __a,
+                                              vector unsigned short __b) {
+  return (vector bool short)__builtin_altivec_vsr((vector int)__a,
+                                                  (vector int)__b);
+}
+
+static vector bool short __ATTRS_o_ai vec_srl(vector bool short __a,
+                                              vector unsigned int __b) {
+  return (vector bool short)__builtin_altivec_vsr((vector int)__a,
+                                                  (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai vec_srl(vector pixel __a,
+                                         vector unsigned char __b) {
+  return (vector pixel)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai vec_srl(vector pixel __a,
+                                         vector unsigned short __b) {
+  return (vector pixel)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai vec_srl(vector pixel __a,
+                                         vector unsigned int __b) {
+  return (vector pixel)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai vec_srl(vector int __a,
+                                       vector unsigned char __b) {
+  return (vector int)__builtin_altivec_vsr(__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai vec_srl(vector int __a,
+                                       vector unsigned short __b) {
+  return (vector int)__builtin_altivec_vsr(__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai vec_srl(vector int __a,
+                                       vector unsigned int __b) {
+  return (vector int)__builtin_altivec_vsr(__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_srl(vector unsigned int __a,
+                                                vector unsigned char __b) {
+  return (vector unsigned int)__builtin_altivec_vsr((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_srl(vector unsigned int __a,
+                                                vector unsigned short __b) {
+  return (vector unsigned int)__builtin_altivec_vsr((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_srl(vector unsigned int __a,
+                                                vector unsigned int __b) {
+  return (vector unsigned int)__builtin_altivec_vsr((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static vector bool int __ATTRS_o_ai vec_srl(vector bool int __a,
+                                            vector unsigned char __b) {
+  return (vector bool int)__builtin_altivec_vsr((vector int)__a,
+                                                (vector int)__b);
+}
+
+static vector bool int __ATTRS_o_ai vec_srl(vector bool int __a,
+                                            vector unsigned short __b) {
+  return (vector bool int)__builtin_altivec_vsr((vector int)__a,
+                                                (vector int)__b);
+}
+
+static vector bool int __ATTRS_o_ai vec_srl(vector bool int __a,
+                                            vector unsigned int __b) {
+  return (vector bool int)__builtin_altivec_vsr((vector int)__a,
+                                                (vector int)__b);
+}
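+
+/* Editorial note, not part of the upstream header: unlike the element-wise
+ * shifts above, vec_srl (the vsr instruction) shifts the entire 128-bit value
+ * of __a right by 0-7 bits; the bit count comes from the low-order bits of
+ * __b, which is normally built with something like vec_splat_u8 so that every
+ * byte carries the same count.  The many overloads differ only in operand
+ * typing and all funnel into the same builtin. */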
+
+/* vec_vsr */
+
+static vector signed char __ATTRS_o_ai vec_vsr(vector signed char __a,
+                                               vector unsigned char __b) {
+  return (vector signed char)__builtin_altivec_vsr((vector int)__a,
+                                                   (vector int)__b);
+}
+
+static vector signed char __ATTRS_o_ai vec_vsr(vector signed char __a,
+                                               vector unsigned short __b) {
+  return (vector signed char)__builtin_altivec_vsr((vector int)__a,
+                                                   (vector int)__b);
+}
+
+static vector signed char __ATTRS_o_ai vec_vsr(vector signed char __a,
+                                               vector unsigned int __b) {
+  return (vector signed char)__builtin_altivec_vsr((vector int)__a,
+                                                   (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vsr(vector unsigned char __a,
+                                                 vector unsigned char __b) {
+  return (vector unsigned char)__builtin_altivec_vsr((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vsr(vector unsigned char __a,
+                                                 vector unsigned short __b) {
+  return (vector unsigned char)__builtin_altivec_vsr((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vsr(vector unsigned char __a,
+                                                 vector unsigned int __b) {
+  return (vector unsigned char)__builtin_altivec_vsr((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static vector bool char __ATTRS_o_ai vec_vsr(vector bool char __a,
+                                             vector unsigned char __b) {
+  return (vector bool char)__builtin_altivec_vsr((vector int)__a,
+                                                 (vector int)__b);
+}
+
+static vector bool char __ATTRS_o_ai vec_vsr(vector bool char __a,
+                                             vector unsigned short __b) {
+  return (vector bool char)__builtin_altivec_vsr((vector int)__a,
+                                                 (vector int)__b);
+}
+
+static vector bool char __ATTRS_o_ai vec_vsr(vector bool char __a,
+                                             vector unsigned int __b) {
+  return (vector bool char)__builtin_altivec_vsr((vector int)__a,
+                                                 (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai vec_vsr(vector short __a,
+                                         vector unsigned char __b) {
+  return (vector short)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai vec_vsr(vector short __a,
+                                         vector unsigned short __b) {
+  return (vector short)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai vec_vsr(vector short __a,
+                                         vector unsigned int __b) {
+  return (vector short)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vsr(vector unsigned short __a,
+                                                  vector unsigned char __b) {
+  return (vector unsigned short)__builtin_altivec_vsr((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vsr(vector unsigned short __a,
+                                                  vector unsigned short __b) {
+  return (vector unsigned short)__builtin_altivec_vsr((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vsr(vector unsigned short __a,
+                                                  vector unsigned int __b) {
+  return (vector unsigned short)__builtin_altivec_vsr((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static vector bool short __ATTRS_o_ai vec_vsr(vector bool short __a,
+                                              vector unsigned char __b) {
+  return (vector bool short)__builtin_altivec_vsr((vector int)__a,
+                                                  (vector int)__b);
+}
+
+static vector bool short __ATTRS_o_ai vec_vsr(vector bool short __a,
+                                              vector unsigned short __b) {
+  return (vector bool short)__builtin_altivec_vsr((vector int)__a,
+                                                  (vector int)__b);
+}
+
+static vector bool short __ATTRS_o_ai vec_vsr(vector bool short __a,
+                                              vector unsigned int __b) {
+  return (vector bool short)__builtin_altivec_vsr((vector int)__a,
+                                                  (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai vec_vsr(vector pixel __a,
+                                         vector unsigned char __b) {
+  return (vector pixel)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai vec_vsr(vector pixel __a,
+                                         vector unsigned short __b) {
+  return (vector pixel)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai vec_vsr(vector pixel __a,
+                                         vector unsigned int __b) {
+  return (vector pixel)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai vec_vsr(vector int __a,
+                                       vector unsigned char __b) {
+  return (vector int)__builtin_altivec_vsr(__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai vec_vsr(vector int __a,
+                                       vector unsigned short __b) {
+  return (vector int)__builtin_altivec_vsr(__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai vec_vsr(vector int __a,
+                                       vector unsigned int __b) {
+  return (vector int)__builtin_altivec_vsr(__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vsr(vector unsigned int __a,
+                                                vector unsigned char __b) {
+  return (vector unsigned int)__builtin_altivec_vsr((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vsr(vector unsigned int __a,
+                                                vector unsigned short __b) {
+  return (vector unsigned int)__builtin_altivec_vsr((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vsr(vector unsigned int __a,
+                                                vector unsigned int __b) {
+  return (vector unsigned int)__builtin_altivec_vsr((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static vector bool int __ATTRS_o_ai vec_vsr(vector bool int __a,
+                                            vector unsigned char __b) {
+  return (vector bool int)__builtin_altivec_vsr((vector int)__a,
+                                                (vector int)__b);
+}
+
+static vector bool int __ATTRS_o_ai vec_vsr(vector bool int __a,
+                                            vector unsigned short __b) {
+  return (vector bool int)__builtin_altivec_vsr((vector int)__a,
+                                                (vector int)__b);
+}
+
+static vector bool int __ATTRS_o_ai vec_vsr(vector bool int __a,
+                                            vector unsigned int __b) {
+  return (vector bool int)__builtin_altivec_vsr((vector int)__a,
+                                                (vector int)__b);
+}
+
+/* vec_sro */
+
+static vector signed char __ATTRS_o_ai vec_sro(vector signed char __a,
+                                               vector signed char __b) {
+  return (vector signed char)__builtin_altivec_vsro((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static vector signed char __ATTRS_o_ai vec_sro(vector signed char __a,
+                                               vector unsigned char __b) {
+  return (vector signed char)__builtin_altivec_vsro((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_sro(vector unsigned char __a,
+                                                 vector signed char __b) {
+  return (vector unsigned char)__builtin_altivec_vsro((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_sro(vector unsigned char __a,
+                                                 vector unsigned char __b) {
+  return (vector unsigned char)__builtin_altivec_vsro((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai vec_sro(vector short __a,
+                                         vector signed char __b) {
+  return (vector short)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai vec_sro(vector short __a,
+                                         vector unsigned char __b) {
+  return (vector short)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_sro(vector unsigned short __a,
+                                                  vector signed char __b) {
+  return (vector unsigned short)__builtin_altivec_vsro((vector int)__a,
+                                                       (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_sro(vector unsigned short __a,
+                                                  vector unsigned char __b) {
+  return (vector unsigned short)__builtin_altivec_vsro((vector int)__a,
+                                                       (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai vec_sro(vector pixel __a,
+                                         vector signed char __b) {
+  return (vector pixel)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai vec_sro(vector pixel __a,
+                                         vector unsigned char __b) {
+  return (vector pixel)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai vec_sro(vector int __a, vector signed char __b) {
+  return (vector int)__builtin_altivec_vsro(__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai vec_sro(vector int __a,
+                                       vector unsigned char __b) {
+  return (vector int)__builtin_altivec_vsro(__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_sro(vector unsigned int __a,
+                                                vector signed char __b) {
+  return (vector unsigned int)__builtin_altivec_vsro((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_sro(vector unsigned int __a,
+                                                vector unsigned char __b) {
+  return (vector unsigned int)__builtin_altivec_vsro((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static vector float __ATTRS_o_ai vec_sro(vector float __a,
+                                         vector signed char __b) {
+  return (vector float)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector float __ATTRS_o_ai vec_sro(vector float __a,
+                                         vector unsigned char __b) {
+  return (vector float)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
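+
+/* Editorial note, not part of the upstream header: vec_sro (the vsro
+ * instruction) shifts the whole quadword right by complete bytes (0-15), with
+ * the byte count taken from __b; it is typically paired with vec_srl to
+ * realize an arbitrary bit shift of the full 128-bit vector. */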
+
+/* vec_vsro */
+
+static vector signed char __ATTRS_o_ai vec_vsro(vector signed char __a,
+                                                vector signed char __b) {
+  return (vector signed char)__builtin_altivec_vsro((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static vector signed char __ATTRS_o_ai vec_vsro(vector signed char __a,
+                                                vector unsigned char __b) {
+  return (vector signed char)__builtin_altivec_vsro((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vsro(vector unsigned char __a,
+                                                  vector signed char __b) {
+  return (vector unsigned char)__builtin_altivec_vsro((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vsro(vector unsigned char __a,
+                                                  vector unsigned char __b) {
+  return (vector unsigned char)__builtin_altivec_vsro((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai vec_vsro(vector short __a,
+                                          vector signed char __b) {
+  return (vector short)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai vec_vsro(vector short __a,
+                                          vector unsigned char __b) {
+  return (vector short)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vsro(vector unsigned short __a,
+                                                   vector signed char __b) {
+  return (vector unsigned short)__builtin_altivec_vsro((vector int)__a,
+                                                       (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vsro(vector unsigned short __a,
+                                                   vector unsigned char __b) {
+  return (vector unsigned short)__builtin_altivec_vsro((vector int)__a,
+                                                       (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai vec_vsro(vector pixel __a,
+                                          vector signed char __b) {
+  return (vector pixel)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai vec_vsro(vector pixel __a,
+                                          vector unsigned char __b) {
+  return (vector pixel)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai vec_vsro(vector int __a,
+                                        vector signed char __b) {
+  return (vector int)__builtin_altivec_vsro(__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai vec_vsro(vector int __a,
+                                        vector unsigned char __b) {
+  return (vector int)__builtin_altivec_vsro(__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vsro(vector unsigned int __a,
+                                                 vector signed char __b) {
+  return (vector unsigned int)__builtin_altivec_vsro((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vsro(vector unsigned int __a,
+                                                 vector unsigned char __b) {
+  return (vector unsigned int)__builtin_altivec_vsro((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static vector float __ATTRS_o_ai vec_vsro(vector float __a,
+                                          vector signed char __b) {
+  return (vector float)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector float __ATTRS_o_ai vec_vsro(vector float __a,
+                                          vector unsigned char __b) {
+  return (vector float)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+/* vec_st */
+
+static void __ATTRS_o_ai vec_st(vector signed char __a, int __b,
+                                vector signed char *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_st(vector signed char __a, int __b,
+                                signed char *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_st(vector unsigned char __a, int __b,
+                                vector unsigned char *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_st(vector unsigned char __a, int __b,
+                                unsigned char *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_st(vector bool char __a, int __b,
+                                signed char *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_st(vector bool char __a, int __b,
+                                unsigned char *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_st(vector bool char __a, int __b,
+                                vector bool char *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_st(vector short __a, int __b, vector short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_st(vector short __a, int __b, short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_st(vector unsigned short __a, int __b,
+                                vector unsigned short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_st(vector unsigned short __a, int __b,
+                                unsigned short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_st(vector bool short __a, int __b, short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_st(vector bool short __a, int __b,
+                                unsigned short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_st(vector bool short __a, int __b,
+                                vector bool short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_st(vector pixel __a, int __b, short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_st(vector pixel __a, int __b,
+                                unsigned short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_st(vector pixel __a, int __b, vector pixel *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_st(vector int __a, int __b, vector int *__c) {
+  __builtin_altivec_stvx(__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_st(vector int __a, int __b, int *__c) {
+  __builtin_altivec_stvx(__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_st(vector unsigned int __a, int __b,
+                                vector unsigned int *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_st(vector unsigned int __a, int __b,
+                                unsigned int *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_st(vector bool int __a, int __b, int *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_st(vector bool int __a, int __b,
+                                unsigned int *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_st(vector bool int __a, int __b,
+                                vector bool int *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_st(vector float __a, int __b, vector float *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_st(vector float __a, int __b, float *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
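+
+/* Editorial note, not part of the upstream header: vec_st (stvx) stores the
+ * 16-byte vector __a at the effective address __b + __c with the low four
+ * address bits ignored, so the store always targets a 16-byte aligned block.
+ * A minimal illustrative use (hypothetical names):
+ *
+ *   float buf[4] __attribute__((aligned(16)));
+ *   vec_st(some_float_vector, 0, buf);   // writes all four lanes to buf
+ */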
+
+/* vec_stvx */
+
+static void __ATTRS_o_ai vec_stvx(vector signed char __a, int __b,
+                                  vector signed char *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvx(vector signed char __a, int __b,
+                                  signed char *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvx(vector unsigned char __a, int __b,
+                                  vector unsigned char *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvx(vector unsigned char __a, int __b,
+                                  unsigned char *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvx(vector bool char __a, int __b,
+                                  signed char *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvx(vector bool char __a, int __b,
+                                  unsigned char *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvx(vector bool char __a, int __b,
+                                  vector bool char *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvx(vector short __a, int __b,
+                                  vector short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvx(vector short __a, int __b, short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvx(vector unsigned short __a, int __b,
+                                  vector unsigned short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvx(vector unsigned short __a, int __b,
+                                  unsigned short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvx(vector bool short __a, int __b, short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvx(vector bool short __a, int __b,
+                                  unsigned short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvx(vector bool short __a, int __b,
+                                  vector bool short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvx(vector pixel __a, int __b, short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvx(vector pixel __a, int __b,
+                                  unsigned short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvx(vector pixel __a, int __b,
+                                  vector pixel *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvx(vector int __a, int __b, vector int *__c) {
+  __builtin_altivec_stvx(__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvx(vector int __a, int __b, int *__c) {
+  __builtin_altivec_stvx(__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvx(vector unsigned int __a, int __b,
+                                  vector unsigned int *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvx(vector unsigned int __a, int __b,
+                                  unsigned int *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvx(vector bool int __a, int __b, int *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvx(vector bool int __a, int __b,
+                                  unsigned int *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvx(vector bool int __a, int __b,
+                                  vector bool int *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvx(vector float __a, int __b,
+                                  vector float *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvx(vector float __a, int __b, float *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+/* vec_ste */
+
+static void __ATTRS_o_ai vec_ste(vector signed char __a, int __b,
+                                 signed char *__c) {
+  __builtin_altivec_stvebx((vector char)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_ste(vector unsigned char __a, int __b,
+                                 unsigned char *__c) {
+  __builtin_altivec_stvebx((vector char)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_ste(vector bool char __a, int __b,
+                                 signed char *__c) {
+  __builtin_altivec_stvebx((vector char)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_ste(vector bool char __a, int __b,
+                                 unsigned char *__c) {
+  __builtin_altivec_stvebx((vector char)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_ste(vector short __a, int __b, short *__c) {
+  __builtin_altivec_stvehx(__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_ste(vector unsigned short __a, int __b,
+                                 unsigned short *__c) {
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_ste(vector bool short __a, int __b, short *__c) {
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_ste(vector bool short __a, int __b,
+                                 unsigned short *__c) {
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_ste(vector pixel __a, int __b, short *__c) {
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_ste(vector pixel __a, int __b,
+                                 unsigned short *__c) {
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_ste(vector int __a, int __b, int *__c) {
+  __builtin_altivec_stvewx(__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_ste(vector unsigned int __a, int __b,
+                                 unsigned int *__c) {
+  __builtin_altivec_stvewx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_ste(vector bool int __a, int __b, int *__c) {
+  __builtin_altivec_stvewx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_ste(vector bool int __a, int __b,
+                                 unsigned int *__c) {
+  __builtin_altivec_stvewx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_ste(vector float __a, int __b, float *__c) {
+  __builtin_altivec_stvewx((vector int)__a, __b, __c);
+}
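+
+/* Editorial note, not part of the upstream header: vec_ste stores a single
+ * element rather than the whole vector; which lane is written is determined
+ * by the effective address __b + __c (its offset within the containing
+ * 16-byte block selects the element), and the builtin used
+ * (stvebx/stvehx/stvewx) matches the element width. */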
+
+/* vec_stvebx */
+
+static void __ATTRS_o_ai vec_stvebx(vector signed char __a, int __b,
+                                    signed char *__c) {
+  __builtin_altivec_stvebx((vector char)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvebx(vector unsigned char __a, int __b,
+                                    unsigned char *__c) {
+  __builtin_altivec_stvebx((vector char)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvebx(vector bool char __a, int __b,
+                                    signed char *__c) {
+  __builtin_altivec_stvebx((vector char)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvebx(vector bool char __a, int __b,
+                                    unsigned char *__c) {
+  __builtin_altivec_stvebx((vector char)__a, __b, __c);
+}
+
+/* vec_stvehx */
+
+static void __ATTRS_o_ai vec_stvehx(vector short __a, int __b, short *__c) {
+  __builtin_altivec_stvehx(__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvehx(vector unsigned short __a, int __b,
+                                    unsigned short *__c) {
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvehx(vector bool short __a, int __b,
+                                    short *__c) {
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvehx(vector bool short __a, int __b,
+                                    unsigned short *__c) {
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvehx(vector pixel __a, int __b, short *__c) {
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvehx(vector pixel __a, int __b,
+                                    unsigned short *__c) {
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+/* vec_stvewx */
+
+static void __ATTRS_o_ai vec_stvewx(vector int __a, int __b, int *__c) {
+  __builtin_altivec_stvewx(__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvewx(vector unsigned int __a, int __b,
+                                    unsigned int *__c) {
+  __builtin_altivec_stvewx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvewx(vector bool int __a, int __b, int *__c) {
+  __builtin_altivec_stvewx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvewx(vector bool int __a, int __b,
+                                    unsigned int *__c) {
+  __builtin_altivec_stvewx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvewx(vector float __a, int __b, float *__c) {
+  __builtin_altivec_stvewx((vector int)__a, __b, __c);
+}
+
+/* vec_stl */
+
+static void __ATTRS_o_ai vec_stl(vector signed char __a, int __b,
+                                 vector signed char *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stl(vector signed char __a, int __b,
+                                 signed char *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stl(vector unsigned char __a, int __b,
+                                 vector unsigned char *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stl(vector unsigned char __a, int __b,
+                                 unsigned char *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stl(vector bool char __a, int __b,
+                                 signed char *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stl(vector bool char __a, int __b,
+                                 unsigned char *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stl(vector bool char __a, int __b,
+                                 vector bool char *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stl(vector short __a, int __b, vector short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stl(vector short __a, int __b, short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stl(vector unsigned short __a, int __b,
+                                 vector unsigned short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stl(vector unsigned short __a, int __b,
+                                 unsigned short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stl(vector bool short __a, int __b, short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stl(vector bool short __a, int __b,
+                                 unsigned short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stl(vector bool short __a, int __b,
+                                 vector bool short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stl(vector pixel __a, int __b, short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stl(vector pixel __a, int __b,
+                                 unsigned short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stl(vector pixel __a, int __b, vector pixel *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stl(vector int __a, int __b, vector int *__c) {
+  __builtin_altivec_stvxl(__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stl(vector int __a, int __b, int *__c) {
+  __builtin_altivec_stvxl(__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stl(vector unsigned int __a, int __b,
+                                 vector unsigned int *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stl(vector unsigned int __a, int __b,
+                                 unsigned int *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stl(vector bool int __a, int __b, int *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stl(vector bool int __a, int __b,
+                                 unsigned int *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stl(vector bool int __a, int __b,
+                                 vector bool int *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stl(vector float __a, int __b, vector float *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stl(vector float __a, int __b, float *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
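+
+/* Editorial note, not part of the upstream header: vec_stl (stvxl) behaves
+ * like vec_st but marks the touched cache block as least-recently-used,
+ * hinting that the stored data is transient and need not stay resident. */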
+
+/* vec_stvxl */
+
+static void __ATTRS_o_ai vec_stvxl(vector signed char __a, int __b,
+                                   vector signed char *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvxl(vector signed char __a, int __b,
+                                   signed char *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvxl(vector unsigned char __a, int __b,
+                                   vector unsigned char *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvxl(vector unsigned char __a, int __b,
+                                   unsigned char *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvxl(vector bool char __a, int __b,
+                                   signed char *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvxl(vector bool char __a, int __b,
+                                   unsigned char *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvxl(vector bool char __a, int __b,
+                                   vector bool char *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvxl(vector short __a, int __b,
+                                   vector short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvxl(vector short __a, int __b, short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvxl(vector unsigned short __a, int __b,
+                                   vector unsigned short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvxl(vector unsigned short __a, int __b,
+                                   unsigned short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvxl(vector bool short __a, int __b, short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvxl(vector bool short __a, int __b,
+                                   unsigned short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvxl(vector bool short __a, int __b,
+                                   vector bool short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvxl(vector pixel __a, int __b, short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvxl(vector pixel __a, int __b,
+                                   unsigned short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvxl(vector pixel __a, int __b,
+                                   vector pixel *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvxl(vector int __a, int __b, vector int *__c) {
+  __builtin_altivec_stvxl(__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvxl(vector int __a, int __b, int *__c) {
+  __builtin_altivec_stvxl(__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvxl(vector unsigned int __a, int __b,
+                                   vector unsigned int *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvxl(vector unsigned int __a, int __b,
+                                   unsigned int *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvxl(vector bool int __a, int __b, int *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvxl(vector bool int __a, int __b,
+                                   unsigned int *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvxl(vector bool int __a, int __b,
+                                   vector bool int *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvxl(vector float __a, int __b,
+                                   vector float *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvxl(vector float __a, int __b, float *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+/* vec_sub */
+
+static vector signed char __ATTRS_o_ai vec_sub(vector signed char __a,
+                                               vector signed char __b) {
+  return __a - __b;
+}
+
+static vector signed char __ATTRS_o_ai vec_sub(vector bool char __a,
+                                               vector signed char __b) {
+  return (vector signed char)__a - __b;
+}
+
+static vector signed char __ATTRS_o_ai vec_sub(vector signed char __a,
+                                               vector bool char __b) {
+  return __a - (vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_sub(vector unsigned char __a,
+                                                 vector unsigned char __b) {
+  return __a - __b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_sub(vector bool char __a,
+                                                 vector unsigned char __b) {
+  return (vector unsigned char)__a - __b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_sub(vector unsigned char __a,
+                                                 vector bool char __b) {
+  return __a - (vector unsigned char)__b;
+}
+
+static vector short __ATTRS_o_ai vec_sub(vector short __a, vector short __b) {
+  return __a - __b;
+}
+
+static vector short __ATTRS_o_ai vec_sub(vector bool short __a,
+                                         vector short __b) {
+  return (vector short)__a - __b;
+}
+
+static vector short __ATTRS_o_ai vec_sub(vector short __a,
+                                         vector bool short __b) {
+  return __a - (vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_sub(vector unsigned short __a,
+                                                  vector unsigned short __b) {
+  return __a - __b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_sub(vector bool short __a,
+                                                  vector unsigned short __b) {
+  return (vector unsigned short)__a - __b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_sub(vector unsigned short __a,
+                                                  vector bool short __b) {
+  return __a - (vector unsigned short)__b;
+}
+
+static vector int __ATTRS_o_ai vec_sub(vector int __a, vector int __b) {
+  return __a - __b;
+}
+
+static vector int __ATTRS_o_ai vec_sub(vector bool int __a, vector int __b) {
+  return (vector int)__a - __b;
+}
+
+static vector int __ATTRS_o_ai vec_sub(vector int __a, vector bool int __b) {
+  return __a - (vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_sub(vector unsigned int __a,
+                                                vector unsigned int __b) {
+  return __a - __b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_sub(vector bool int __a,
+                                                vector unsigned int __b) {
+  return (vector unsigned int)__a - __b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_sub(vector unsigned int __a,
+                                                vector bool int __b) {
+  return __a - (vector unsigned int)__b;
+}
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static vector signed __int128 __ATTRS_o_ai vec_sub(vector signed __int128 __a,
+                                                   vector signed __int128 __b) {
+  return __a - __b;
+}
+
+static vector unsigned __int128 __ATTRS_o_ai
+vec_sub(vector unsigned __int128 __a, vector unsigned __int128 __b) {
+  return __a - __b;
+}
+#endif // defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+
+#ifdef __VSX__
+static vector signed long long __ATTRS_o_ai
+vec_sub(vector signed long long __a, vector signed long long __b) {
+  return __a - __b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_sub(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a - __b;
+}
+
+static vector double __ATTRS_o_ai
+vec_sub(vector double __a, vector double __b) {
+  return __a - __b;
+}
+#endif
+
+static vector float __ATTRS_o_ai vec_sub(vector float __a, vector float __b) {
+  return __a - __b;
+}
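+
+/* Editorial note, not part of the upstream header: vec_sub is plain
+ * element-wise modular (wrap-around) subtraction; bool operands are simply
+ * reinterpreted as the arithmetic type of the other operand.  A minimal
+ * illustrative use (hypothetical values):
+ *
+ *   vector unsigned char a = vec_splat_u8(0);
+ *   vector unsigned char b = vec_splat_u8(1);
+ *   vector unsigned char r = vec_sub(a, b);   // every byte wraps to 0xFF
+ */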
+
+/* vec_vsububm */
+
+#define __builtin_altivec_vsububm vec_vsububm
+
+static vector signed char __ATTRS_o_ai vec_vsububm(vector signed char __a,
+                                                   vector signed char __b) {
+  return __a - __b;
+}
+
+static vector signed char __ATTRS_o_ai vec_vsububm(vector bool char __a,
+                                                   vector signed char __b) {
+  return (vector signed char)__a - __b;
+}
+
+static vector signed char __ATTRS_o_ai vec_vsububm(vector signed char __a,
+                                                   vector bool char __b) {
+  return __a - (vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vsububm(vector unsigned char __a,
+                                                     vector unsigned char __b) {
+  return __a - __b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vsububm(vector bool char __a,
+                                                     vector unsigned char __b) {
+  return (vector unsigned char)__a - __b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vsububm(vector unsigned char __a,
+                                                     vector bool char __b) {
+  return __a - (vector unsigned char)__b;
+}
+
+/* vec_vsubuhm */
+
+#define __builtin_altivec_vsubuhm vec_vsubuhm
+
+static vector short __ATTRS_o_ai vec_vsubuhm(vector short __a,
+                                             vector short __b) {
+  return __a - __b;
+}
+
+static vector short __ATTRS_o_ai vec_vsubuhm(vector bool short __a,
+                                             vector short __b) {
+  return (vector short)__a - __b;
+}
+
+static vector short __ATTRS_o_ai vec_vsubuhm(vector short __a,
+                                             vector bool short __b) {
+  return __a - (vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vsubuhm(vector unsigned short __a, vector unsigned short __b) {
+  return __a - __b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vsubuhm(vector bool short __a, vector unsigned short __b) {
+  return (vector unsigned short)__a - __b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vsubuhm(vector unsigned short __a,
+                                                      vector bool short __b) {
+  return __a - (vector unsigned short)__b;
+}
+
+/* vec_vsubuwm */
+
+#define __builtin_altivec_vsubuwm vec_vsubuwm
+
+static vector int __ATTRS_o_ai vec_vsubuwm(vector int __a, vector int __b) {
+  return __a - __b;
+}
+
+static vector int __ATTRS_o_ai vec_vsubuwm(vector bool int __a,
+                                           vector int __b) {
+  return (vector int)__a - __b;
+}
+
+static vector int __ATTRS_o_ai vec_vsubuwm(vector int __a,
+                                           vector bool int __b) {
+  return __a - (vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vsubuwm(vector unsigned int __a,
+                                                    vector unsigned int __b) {
+  return __a - __b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vsubuwm(vector bool int __a,
+                                                    vector unsigned int __b) {
+  return (vector unsigned int)__a - __b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vsubuwm(vector unsigned int __a,
+                                                    vector bool int __b) {
+  return __a - (vector unsigned int)__b;
+}
+
+/* vec_vsubfp */
+
+#define __builtin_altivec_vsubfp vec_vsubfp
+
+static vector float __attribute__((__always_inline__))
+vec_vsubfp(vector float __a, vector float __b) {
+  return __a - __b;
+}
+
+/* vec_subc */
+
+static vector unsigned int __ATTRS_o_ai vec_subc(vector unsigned int __a,
+                                                 vector unsigned int __b) {
+  return __builtin_altivec_vsubcuw(__a, __b);
+}
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static vector unsigned __int128 __ATTRS_o_ai
+vec_subc(vector unsigned __int128 __a, vector unsigned __int128 __b) {
+  return __builtin_altivec_vsubcuq(__a, __b);
+}
+
+static vector signed __int128 __ATTRS_o_ai
+vec_subc(vector signed __int128 __a, vector signed __int128 __b) {
+  return __builtin_altivec_vsubcuq(__a, __b);
+}
+#endif // defined(__POWER8_VECTOR__) && defined(__powerpc64__)
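+
+/* Editorial note, not part of the upstream header: vec_subc does not
+ * subtract; per element it returns the carry out of evaluating __a - __b as
+ * __a + ~__b + 1, so an element is 1 when __a >= __b (no borrow) and 0
+ * otherwise.  It is intended for building multi-precision subtraction
+ * sequences. */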
+
+/* vec_vsubcuw */
+
+static vector unsigned int __attribute__((__always_inline__))
+vec_vsubcuw(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_altivec_vsubcuw(__a, __b);
+}
+
+/* vec_subs */
+
+static vector signed char __ATTRS_o_ai vec_subs(vector signed char __a,
+                                                vector signed char __b) {
+  return __builtin_altivec_vsubsbs(__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai vec_subs(vector bool char __a,
+                                                vector signed char __b) {
+  return __builtin_altivec_vsubsbs((vector signed char)__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai vec_subs(vector signed char __a,
+                                                vector bool char __b) {
+  return __builtin_altivec_vsubsbs(__a, (vector signed char)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_subs(vector unsigned char __a,
+                                                  vector unsigned char __b) {
+  return __builtin_altivec_vsububs(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_subs(vector bool char __a,
+                                                  vector unsigned char __b) {
+  return __builtin_altivec_vsububs((vector unsigned char)__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_subs(vector unsigned char __a,
+                                                  vector bool char __b) {
+  return __builtin_altivec_vsububs(__a, (vector unsigned char)__b);
+}
+
+static vector short __ATTRS_o_ai vec_subs(vector short __a, vector short __b) {
+  return __builtin_altivec_vsubshs(__a, __b);
+}
+
+static vector short __ATTRS_o_ai vec_subs(vector bool short __a,
+                                          vector short __b) {
+  return __builtin_altivec_vsubshs((vector short)__a, __b);
+}
+
+static vector short __ATTRS_o_ai vec_subs(vector short __a,
+                                          vector bool short __b) {
+  return __builtin_altivec_vsubshs(__a, (vector short)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_subs(vector unsigned short __a,
+                                                   vector unsigned short __b) {
+  return __builtin_altivec_vsubuhs(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_subs(vector bool short __a,
+                                                   vector unsigned short __b) {
+  return __builtin_altivec_vsubuhs((vector unsigned short)__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_subs(vector unsigned short __a,
+                                                   vector bool short __b) {
+  return __builtin_altivec_vsubuhs(__a, (vector unsigned short)__b);
+}
+
+static vector int __ATTRS_o_ai vec_subs(vector int __a, vector int __b) {
+  return __builtin_altivec_vsubsws(__a, __b);
+}
+
+static vector int __ATTRS_o_ai vec_subs(vector bool int __a, vector int __b) {
+  return __builtin_altivec_vsubsws((vector int)__a, __b);
+}
+
+static vector int __ATTRS_o_ai vec_subs(vector int __a, vector bool int __b) {
+  return __builtin_altivec_vsubsws(__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_subs(vector unsigned int __a,
+                                                 vector unsigned int __b) {
+  return __builtin_altivec_vsubuws(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_subs(vector bool int __a,
+                                                 vector unsigned int __b) {
+  return __builtin_altivec_vsubuws((vector unsigned int)__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_subs(vector unsigned int __a,
+                                                 vector bool int __b) {
+  return __builtin_altivec_vsubuws(__a, (vector unsigned int)__b);
+}
+
+/* vec_vsubsbs */
+
+static vector signed char __ATTRS_o_ai vec_vsubsbs(vector signed char __a,
+                                                   vector signed char __b) {
+  return __builtin_altivec_vsubsbs(__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai vec_vsubsbs(vector bool char __a,
+                                                   vector signed char __b) {
+  return __builtin_altivec_vsubsbs((vector signed char)__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai vec_vsubsbs(vector signed char __a,
+                                                   vector bool char __b) {
+  return __builtin_altivec_vsubsbs(__a, (vector signed char)__b);
+}
+
+/* vec_vsububs */
+
+static vector unsigned char __ATTRS_o_ai vec_vsububs(vector unsigned char __a,
+                                                     vector unsigned char __b) {
+  return __builtin_altivec_vsububs(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vsububs(vector bool char __a,
+                                                     vector unsigned char __b) {
+  return __builtin_altivec_vsububs((vector unsigned char)__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vsububs(vector unsigned char __a,
+                                                     vector bool char __b) {
+  return __builtin_altivec_vsububs(__a, (vector unsigned char)__b);
+}
+
+/* vec_vsubshs */
+
+static vector short __ATTRS_o_ai vec_vsubshs(vector short __a,
+                                             vector short __b) {
+  return __builtin_altivec_vsubshs(__a, __b);
+}
+
+static vector short __ATTRS_o_ai vec_vsubshs(vector bool short __a,
+                                             vector short __b) {
+  return __builtin_altivec_vsubshs((vector short)__a, __b);
+}
+
+static vector short __ATTRS_o_ai vec_vsubshs(vector short __a,
+                                             vector bool short __b) {
+  return __builtin_altivec_vsubshs(__a, (vector short)__b);
+}
+
+/* vec_vsubuhs */
+
+static vector unsigned short __ATTRS_o_ai
+vec_vsubuhs(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_altivec_vsubuhs(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vsubuhs(vector bool short __a, vector unsigned short __b) {
+  return __builtin_altivec_vsubuhs((vector unsigned short)__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vsubuhs(vector unsigned short __a,
+                                                      vector bool short __b) {
+  return __builtin_altivec_vsubuhs(__a, (vector unsigned short)__b);
+}
+
+/* vec_vsubsws */
+
+static vector int __ATTRS_o_ai vec_vsubsws(vector int __a, vector int __b) {
+  return __builtin_altivec_vsubsws(__a, __b);
+}
+
+static vector int __ATTRS_o_ai vec_vsubsws(vector bool int __a,
+                                           vector int __b) {
+  return __builtin_altivec_vsubsws((vector int)__a, __b);
+}
+
+static vector int __ATTRS_o_ai vec_vsubsws(vector int __a,
+                                           vector bool int __b) {
+  return __builtin_altivec_vsubsws(__a, (vector int)__b);
+}
+
+/* vec_vsubuws */
+
+static vector unsigned int __ATTRS_o_ai vec_vsubuws(vector unsigned int __a,
+                                                    vector unsigned int __b) {
+  return __builtin_altivec_vsubuws(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vsubuws(vector bool int __a,
+                                                    vector unsigned int __b) {
+  return __builtin_altivec_vsubuws((vector unsigned int)__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vsubuws(vector unsigned int __a,
+                                                    vector bool int __b) {
+  return __builtin_altivec_vsubuws(__a, (vector unsigned int)__b);
+}
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+/* vec_vsubuqm */
+
+static vector signed __int128 __ATTRS_o_ai
+vec_vsubuqm(vector signed __int128 __a, vector signed __int128 __b) {
+  return __a - __b;
+}
+
+static vector unsigned __int128 __ATTRS_o_ai
+vec_vsubuqm(vector unsigned __int128 __a, vector unsigned __int128 __b) {
+  return __a - __b;
+}
+
+/* vec_vsubeuqm */
+
+static vector signed __int128 __ATTRS_o_ai
+vec_vsubeuqm(vector signed __int128 __a, vector signed __int128 __b,
+             vector signed __int128 __c) {
+  return __builtin_altivec_vsubeuqm(__a, __b, __c);
+}
+
+static vector unsigned __int128 __ATTRS_o_ai
+vec_vsubeuqm(vector unsigned __int128 __a, vector unsigned __int128 __b,
+             vector unsigned __int128 __c) {
+  return __builtin_altivec_vsubeuqm(__a, __b, __c);
+}
+
+/* vec_vsubcuq */
+
+static vector signed __int128 __ATTRS_o_ai
+vec_vsubcuq(vector signed __int128 __a, vector signed __int128 __b) {
+  return __builtin_altivec_vsubcuq(__a, __b);
+}
+
+static vector unsigned __int128 __ATTRS_o_ai
+vec_vsubcuq(vector unsigned __int128 __a, vector unsigned __int128 __b) {
+  return __builtin_altivec_vsubcuq(__a, __b);
+}
+
+/* vec_vsubecuq */
+
+static vector signed __int128 __ATTRS_o_ai
+vec_vsubecuq(vector signed __int128 __a, vector signed __int128 __b,
+             vector signed __int128 __c) {
+  return __builtin_altivec_vsubecuq(__a, __b, __c);
+}
+
+static vector unsigned __int128 __ATTRS_o_ai
+vec_vsubecuq(vector unsigned __int128 __a, vector unsigned __int128 __b,
+             vector unsigned __int128 __c) {
+  return __builtin_altivec_vsubecuq(__a, __b, __c);
+}
+#endif // defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+
+/* vec_sum4s */
+
+static vector int __ATTRS_o_ai vec_sum4s(vector signed char __a,
+                                         vector int __b) {
+  return __builtin_altivec_vsum4sbs(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_sum4s(vector unsigned char __a,
+                                                  vector unsigned int __b) {
+  return __builtin_altivec_vsum4ubs(__a, __b);
+}
+
+static vector int __ATTRS_o_ai vec_sum4s(vector signed short __a,
+                                         vector int __b) {
+  return __builtin_altivec_vsum4shs(__a, __b);
+}
+
+/* vec_vsum4sbs */
+
+static vector int __attribute__((__always_inline__))
+vec_vsum4sbs(vector signed char __a, vector int __b) {
+  return __builtin_altivec_vsum4sbs(__a, __b);
+}
+
+/* vec_vsum4ubs */
+
+static vector unsigned int __attribute__((__always_inline__))
+vec_vsum4ubs(vector unsigned char __a, vector unsigned int __b) {
+  return __builtin_altivec_vsum4ubs(__a, __b);
+}
+
+/* vec_vsum4shs */
+
+static vector int __attribute__((__always_inline__))
+vec_vsum4shs(vector signed short __a, vector int __b) {
+  return __builtin_altivec_vsum4shs(__a, __b);
+}
+
+/* vec_sum2s */
+
+/* The vsum2sws instruction has a big-endian bias, so that the second
+   input vector and the result always reference big-endian elements
+   1 and 3 (little-endian elements 0 and 2).  For ease of porting, the
+   programmer wants elements 1 and 3 in both cases, so for little
+   endian we must perform some permutes.  */
+
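+/* For example (sketching the vsum2sws semantics in big-endian element
+   order): vec_sum2s(__a, __b) yields
+     result[1] = saturate(__a[0] + __a[1] + __b[1])
+     result[3] = saturate(__a[2] + __a[3] + __b[3])
+   with result[0] and result[2] zeroed; the little-endian permutes below
+   keep the sums in elements 1 and 3 of the value the caller sees.  */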
+static vector signed int __attribute__((__always_inline__))
+vec_sum2s(vector int __a, vector int __b) {
+#ifdef __LITTLE_ENDIAN__
+  vector int __c = (vector signed int)vec_perm(
+      __b, __b, (vector unsigned char)(4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15,
+                                       8, 9, 10, 11));
+  __c = __builtin_altivec_vsum2sws(__a, __c);
+  return (vector signed int)vec_perm(
+      __c, __c, (vector unsigned char)(4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15,
+                                       8, 9, 10, 11));
+#else
+  return __builtin_altivec_vsum2sws(__a, __b);
+#endif
+}
+
+/* vec_vsum2sws */
+
+static vector signed int __attribute__((__always_inline__))
+vec_vsum2sws(vector int __a, vector int __b) {
+#ifdef __LITTLE_ENDIAN__
+  vector int __c = (vector signed int)vec_perm(
+      __b, __b, (vector unsigned char)(4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15,
+                                       8, 9, 10, 11));
+  __c = __builtin_altivec_vsum2sws(__a, __c);
+  return (vector signed int)vec_perm(
+      __c, __c, (vector unsigned char)(4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15,
+                                       8, 9, 10, 11));
+#else
+  return __builtin_altivec_vsum2sws(__a, __b);
+#endif
+}
+
+/* vec_sums */
+
+/* The vsumsws instruction has a big-endian bias, so that the second
+   input vector and the result always reference big-endian element 3
+   (little-endian element 0).  For ease of porting, the programmer
+   wants element 3 in both cases, so for little endian we must perform
+   some permutes.  */
+
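+/* For example (sketching the vsumsws semantics): vec_sums(__a, __b)
+   yields a vector whose element 3 is the saturated sum
+   __a[0] + __a[1] + __a[2] + __a[3] + __b[3], with elements 0..2 zeroed;
+   the little-endian splat and rebuild below present the same element-3
+   result to the caller.  */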
+static vector signed int __attribute__((__always_inline__))
+vec_sums(vector signed int __a, vector signed int __b) {
+#ifdef __LITTLE_ENDIAN__
+  __b = (vector signed int)vec_splat(__b, 3);
+  __b = __builtin_altivec_vsumsws(__a, __b);
+  return (vector signed int)(0, 0, 0, __b[0]);
+#else
+  return __builtin_altivec_vsumsws(__a, __b);
+#endif
+}
+
+/* vec_vsumsws */
+
+static vector signed int __attribute__((__always_inline__))
+vec_vsumsws(vector signed int __a, vector signed int __b) {
+#ifdef __LITTLE_ENDIAN__
+  __b = (vector signed int)vec_splat(__b, 3);
+  __b = __builtin_altivec_vsumsws(__a, __b);
+  return (vector signed int)(0, 0, 0, __b[0]);
+#else
+  return __builtin_altivec_vsumsws(__a, __b);
+#endif
+}
+
+/* vec_trunc */
+
+static vector float __ATTRS_o_ai
+vec_trunc(vector float __a) {
+#ifdef __VSX__
+  return __builtin_vsx_xvrspiz(__a);
+#else
+  return __builtin_altivec_vrfiz(__a);
+#endif
+}
+
+#ifdef __VSX__
+static vector double __ATTRS_o_ai vec_trunc(vector double __a) {
+  return __builtin_vsx_xvrdpiz(__a);
+}
+#endif
+
+/* vec_vrfiz */
+
+static vector float __attribute__((__always_inline__))
+vec_vrfiz(vector float __a) {
+  return __builtin_altivec_vrfiz(__a);
+}
+
+/* vec_unpackh */
+
+/* The vector unpack instructions all have a big-endian bias, so for
+   little endian we must reverse the meanings of "high" and "low."  */
+
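+/* For example, vec_unpackh on a vector signed char sign-extends elements
+   0..7 of the argument into a vector short, whichever the endianness; on
+   little-endian targets those elements sit in the "low" half of the
+   register, which is why the vupkl* forms are used there.  */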
+static vector short __ATTRS_o_ai vec_unpackh(vector signed char __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupklsb((vector char)__a);
+#else
+  return __builtin_altivec_vupkhsb((vector char)__a);
+#endif
+}
+
+static vector bool short __ATTRS_o_ai vec_unpackh(vector bool char __a) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool short)__builtin_altivec_vupklsb((vector char)__a);
+#else
+  return (vector bool short)__builtin_altivec_vupkhsb((vector char)__a);
+#endif
+}
+
+static vector int __ATTRS_o_ai vec_unpackh(vector short __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupklsh(__a);
+#else
+  return __builtin_altivec_vupkhsh(__a);
+#endif
+}
+
+static vector bool int __ATTRS_o_ai vec_unpackh(vector bool short __a) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool int)__builtin_altivec_vupklsh((vector short)__a);
+#else
+  return (vector bool int)__builtin_altivec_vupkhsh((vector short)__a);
+#endif
+}
+
+static vector unsigned int __ATTRS_o_ai vec_unpackh(vector pixel __a) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector unsigned int)__builtin_altivec_vupklpx((vector short)__a);
+#else
+  return (vector unsigned int)__builtin_altivec_vupkhpx((vector short)__a);
+#endif
+}
+
+#ifdef __POWER8_VECTOR__
+static vector long long __ATTRS_o_ai vec_unpackh(vector int __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupklsw(__a);
+#else
+  return __builtin_altivec_vupkhsw(__a);
+#endif
+}
+
+static vector bool long long __ATTRS_o_ai vec_unpackh(vector bool int __a) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool long long)__builtin_altivec_vupklsw((vector int)__a);
+#else
+  return (vector bool long long)__builtin_altivec_vupkhsw((vector int)__a);
+#endif
+}
+#endif
+
+/* vec_vupkhsb */
+
+static vector short __ATTRS_o_ai vec_vupkhsb(vector signed char __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupklsb((vector char)__a);
+#else
+  return __builtin_altivec_vupkhsb((vector char)__a);
+#endif
+}
+
+static vector bool short __ATTRS_o_ai vec_vupkhsb(vector bool char __a) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool short)__builtin_altivec_vupklsb((vector char)__a);
+#else
+  return (vector bool short)__builtin_altivec_vupkhsb((vector char)__a);
+#endif
+}
+
+/* vec_vupkhsh */
+
+static vector int __ATTRS_o_ai vec_vupkhsh(vector short __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupklsh(__a);
+#else
+  return __builtin_altivec_vupkhsh(__a);
+#endif
+}
+
+static vector bool int __ATTRS_o_ai vec_vupkhsh(vector bool short __a) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool int)__builtin_altivec_vupklsh((vector short)__a);
+#else
+  return (vector bool int)__builtin_altivec_vupkhsh((vector short)__a);
+#endif
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vupkhsh(vector pixel __a) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector unsigned int)__builtin_altivec_vupklpx((vector short)__a);
+#else
+  return (vector unsigned int)__builtin_altivec_vupkhpx((vector short)__a);
+#endif
+}
+
+/* vec_vupkhsw */
+
+#ifdef __POWER8_VECTOR__
+static vector long long __ATTRS_o_ai vec_vupkhsw(vector int __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupklsw(__a);
+#else
+  return __builtin_altivec_vupkhsw(__a);
+#endif
+}
+
+static vector bool long long __ATTRS_o_ai vec_vupkhsw(vector bool int __a) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool long long)__builtin_altivec_vupklsw((vector int)__a);
+#else
+  return (vector bool long long)__builtin_altivec_vupkhsw((vector int)__a);
+#endif
+}
+#endif
+
+/* vec_unpackl */
+
+static vector short __ATTRS_o_ai vec_unpackl(vector signed char __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupkhsb((vector char)__a);
+#else
+  return __builtin_altivec_vupklsb((vector char)__a);
+#endif
+}
+
+static vector bool short __ATTRS_o_ai vec_unpackl(vector bool char __a) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool short)__builtin_altivec_vupkhsb((vector char)__a);
+#else
+  return (vector bool short)__builtin_altivec_vupklsb((vector char)__a);
+#endif
+}
+
+static vector int __ATTRS_o_ai vec_unpackl(vector short __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupkhsh(__a);
+#else
+  return __builtin_altivec_vupklsh(__a);
+#endif
+}
+
+static vector bool int __ATTRS_o_ai vec_unpackl(vector bool short __a) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool int)__builtin_altivec_vupkhsh((vector short)__a);
+#else
+  return (vector bool int)__builtin_altivec_vupklsh((vector short)__a);
+#endif
+}
+
+static vector unsigned int __ATTRS_o_ai vec_unpackl(vector pixel __a) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector unsigned int)__builtin_altivec_vupkhpx((vector short)__a);
+#else
+  return (vector unsigned int)__builtin_altivec_vupklpx((vector short)__a);
+#endif
+}
+
+#ifdef __POWER8_VECTOR__
+static vector long long __ATTRS_o_ai vec_unpackl(vector int __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupkhsw(__a);
+#else
+  return __builtin_altivec_vupklsw(__a);
+#endif
+}
+
+static vector bool long long __ATTRS_o_ai vec_unpackl(vector bool int __a) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool long long)__builtin_altivec_vupkhsw((vector int)__a);
+#else
+  return (vector bool long long)__builtin_altivec_vupklsw((vector int)__a);
+#endif
+}
+#endif
+
+/* vec_vupklsb */
+
+static vector short __ATTRS_o_ai vec_vupklsb(vector signed char __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupkhsb((vector char)__a);
+#else
+  return __builtin_altivec_vupklsb((vector char)__a);
+#endif
+}
+
+static vector bool short __ATTRS_o_ai vec_vupklsb(vector bool char __a) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool short)__builtin_altivec_vupkhsb((vector char)__a);
+#else
+  return (vector bool short)__builtin_altivec_vupklsb((vector char)__a);
+#endif
+}
+
+/* vec_vupklsh */
+
+static vector int __ATTRS_o_ai vec_vupklsh(vector short __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupkhsh(__a);
+#else
+  return __builtin_altivec_vupklsh(__a);
+#endif
+}
+
+static vector bool int __ATTRS_o_ai vec_vupklsh(vector bool short __a) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool int)__builtin_altivec_vupkhsh((vector short)__a);
+#else
+  return (vector bool int)__builtin_altivec_vupklsh((vector short)__a);
+#endif
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vupklsh(vector pixel __a) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector unsigned int)__builtin_altivec_vupkhpx((vector short)__a);
+#else
+  return (vector unsigned int)__builtin_altivec_vupklpx((vector short)__a);
+#endif
+}
+
+/* vec_vupklsw */
+
+#ifdef __POWER8_VECTOR__
+static vector long long __ATTRS_o_ai vec_vupklsw(vector int __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupkhsw(__a);
+#else
+  return __builtin_altivec_vupklsw(__a);
+#endif
+}
+
+static vector bool long long __ATTRS_o_ai vec_vupklsw(vector bool int __a) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool long long)__builtin_altivec_vupkhsw((vector int)__a);
+#else
+  return (vector bool long long)__builtin_altivec_vupklsw((vector int)__a);
+#endif
+}
+#endif
+
+/* vec_vsx_ld */
+
+#ifdef __VSX__
+
+static vector signed int __ATTRS_o_ai vec_vsx_ld(int __a,
+                                                 const vector signed int *__b) {
+  return (vector signed int)__builtin_vsx_lxvw4x(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vsx_ld(int __a, const vector unsigned int *__b) {
+  return (vector unsigned int)__builtin_vsx_lxvw4x(__a, __b);
+}
+
+static vector float __ATTRS_o_ai vec_vsx_ld(int __a, const vector float *__b) {
+  return (vector float)__builtin_vsx_lxvw4x(__a, __b);
+}
+
+static vector signed long long __ATTRS_o_ai
+vec_vsx_ld(int __a, const vector signed long long *__b) {
+  return (vector signed long long)__builtin_vsx_lxvd2x(__a, __b);
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_vsx_ld(int __a, const vector unsigned long long *__b) {
+  return (vector unsigned long long)__builtin_vsx_lxvd2x(__a, __b);
+}
+
+static vector double __ATTRS_o_ai vec_vsx_ld(int __a,
+                                             const vector double *__b) {
+  return (vector double)__builtin_vsx_lxvd2x(__a, __b);
+}
+
+#endif
+
+/* vec_vsx_st */
+
+#ifdef __VSX__
+
+static void __ATTRS_o_ai vec_vsx_st(vector signed int __a, int __b,
+                                    vector signed int *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_vsx_st(vector unsigned int __a, int __b,
+                                    vector unsigned int *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_vsx_st(vector float __a, int __b,
+                                    vector float *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_vsx_st(vector signed long long __a, int __b,
+                                    vector signed long long *__c) {
+  __builtin_vsx_stxvd2x((vector double)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_vsx_st(vector unsigned long long __a, int __b,
+                                    vector unsigned long long *__c) {
+  __builtin_vsx_stxvd2x((vector double)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_vsx_st(vector double __a, int __b,
+                                    vector double *__c) {
+  __builtin_vsx_stxvd2x((vector double)__a, __b, __c);
+}
+
+#endif
+
+/* vec_xor */
+
+#define __builtin_altivec_vxor vec_xor
+
+static vector signed char __ATTRS_o_ai vec_xor(vector signed char __a,
+                                               vector signed char __b) {
+  return __a ^ __b;
+}
+
+static vector signed char __ATTRS_o_ai vec_xor(vector bool char __a,
+                                               vector signed char __b) {
+  return (vector signed char)__a ^ __b;
+}
+
+static vector signed char __ATTRS_o_ai vec_xor(vector signed char __a,
+                                               vector bool char __b) {
+  return __a ^ (vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_xor(vector unsigned char __a,
+                                                 vector unsigned char __b) {
+  return __a ^ __b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_xor(vector bool char __a,
+                                                 vector unsigned char __b) {
+  return (vector unsigned char)__a ^ __b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_xor(vector unsigned char __a,
+                                                 vector bool char __b) {
+  return __a ^ (vector unsigned char)__b;
+}
+
+static vector bool char __ATTRS_o_ai vec_xor(vector bool char __a,
+                                             vector bool char __b) {
+  return __a ^ __b;
+}
+
+static vector short __ATTRS_o_ai vec_xor(vector short __a, vector short __b) {
+  return __a ^ __b;
+}
+
+static vector short __ATTRS_o_ai vec_xor(vector bool short __a,
+                                         vector short __b) {
+  return (vector short)__a ^ __b;
+}
+
+static vector short __ATTRS_o_ai vec_xor(vector short __a,
+                                         vector bool short __b) {
+  return __a ^ (vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_xor(vector unsigned short __a,
+                                                  vector unsigned short __b) {
+  return __a ^ __b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_xor(vector bool short __a,
+                                                  vector unsigned short __b) {
+  return (vector unsigned short)__a ^ __b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_xor(vector unsigned short __a,
+                                                  vector bool short __b) {
+  return __a ^ (vector unsigned short)__b;
+}
+
+static vector bool short __ATTRS_o_ai vec_xor(vector bool short __a,
+                                              vector bool short __b) {
+  return __a ^ __b;
+}
+
+static vector int __ATTRS_o_ai vec_xor(vector int __a, vector int __b) {
+  return __a ^ __b;
+}
+
+static vector int __ATTRS_o_ai vec_xor(vector bool int __a, vector int __b) {
+  return (vector int)__a ^ __b;
+}
+
+static vector int __ATTRS_o_ai vec_xor(vector int __a, vector bool int __b) {
+  return __a ^ (vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_xor(vector unsigned int __a,
+                                                vector unsigned int __b) {
+  return __a ^ __b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_xor(vector bool int __a,
+                                                vector unsigned int __b) {
+  return (vector unsigned int)__a ^ __b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_xor(vector unsigned int __a,
+                                                vector bool int __b) {
+  return __a ^ (vector unsigned int)__b;
+}
+
+static vector bool int __ATTRS_o_ai vec_xor(vector bool int __a,
+                                            vector bool int __b) {
+  return __a ^ __b;
+}
+
+static vector float __ATTRS_o_ai vec_xor(vector float __a, vector float __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a ^ (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai vec_xor(vector bool int __a,
+                                         vector float __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a ^ (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai vec_xor(vector float __a,
+                                         vector bool int __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a ^ (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+#ifdef __VSX__
+static vector signed long long __ATTRS_o_ai
+vec_xor(vector signed long long __a, vector signed long long __b) {
+  return __a ^ __b;
+}
+
+static vector signed long long __ATTRS_o_ai
+vec_xor(vector bool long long __a, vector signed long long __b) {
+  return (vector signed long long)__a ^ __b;
+}
+
+static vector signed long long __ATTRS_o_ai vec_xor(vector signed long long __a,
+                                                    vector bool long long __b) {
+  return __a ^ (vector signed long long)__b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_xor(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a ^ __b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_xor(vector bool long long __a, vector unsigned long long __b) {
+  return (vector unsigned long long)__a ^ __b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_xor(vector unsigned long long __a, vector bool long long __b) {
+  return __a ^ (vector unsigned long long)__b;
+}
+
+static vector bool long long __ATTRS_o_ai vec_xor(vector bool long long __a,
+                                                  vector bool long long __b) {
+  return __a ^ __b;
+}
+
+static vector double __ATTRS_o_ai
+vec_xor(vector double __a, vector double __b) {
+  return (vector double)((vector unsigned long long)__a ^
+                         (vector unsigned long long)__b);
+}
+
+static vector double __ATTRS_o_ai
+vec_xor(vector double __a, vector bool long long __b) {
+  return (vector double)((vector unsigned long long)__a ^
+                         (vector unsigned long long)__b);
+}
+
+static vector double __ATTRS_o_ai
+vec_xor(vector bool long long __a, vector double __b) {
+  return (vector double)((vector unsigned long long)__a ^
+                         (vector unsigned long long)__b);
+}
+#endif
+
+/* vec_vxor */
+
+static vector signed char __ATTRS_o_ai vec_vxor(vector signed char __a,
+                                                vector signed char __b) {
+  return __a ^ __b;
+}
+
+static vector signed char __ATTRS_o_ai vec_vxor(vector bool char __a,
+                                                vector signed char __b) {
+  return (vector signed char)__a ^ __b;
+}
+
+static vector signed char __ATTRS_o_ai vec_vxor(vector signed char __a,
+                                                vector bool char __b) {
+  return __a ^ (vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vxor(vector unsigned char __a,
+                                                  vector unsigned char __b) {
+  return __a ^ __b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vxor(vector bool char __a,
+                                                  vector unsigned char __b) {
+  return (vector unsigned char)__a ^ __b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vxor(vector unsigned char __a,
+                                                  vector bool char __b) {
+  return __a ^ (vector unsigned char)__b;
+}
+
+static vector bool char __ATTRS_o_ai vec_vxor(vector bool char __a,
+                                              vector bool char __b) {
+  return __a ^ __b;
+}
+
+static vector short __ATTRS_o_ai vec_vxor(vector short __a, vector short __b) {
+  return __a ^ __b;
+}
+
+static vector short __ATTRS_o_ai vec_vxor(vector bool short __a,
+                                          vector short __b) {
+  return (vector short)__a ^ __b;
+}
+
+static vector short __ATTRS_o_ai vec_vxor(vector short __a,
+                                          vector bool short __b) {
+  return __a ^ (vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vxor(vector unsigned short __a,
+                                                   vector unsigned short __b) {
+  return __a ^ __b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vxor(vector bool short __a,
+                                                   vector unsigned short __b) {
+  return (vector unsigned short)__a ^ __b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vxor(vector unsigned short __a,
+                                                   vector bool short __b) {
+  return __a ^ (vector unsigned short)__b;
+}
+
+static vector bool short __ATTRS_o_ai vec_vxor(vector bool short __a,
+                                               vector bool short __b) {
+  return __a ^ __b;
+}
+
+static vector int __ATTRS_o_ai vec_vxor(vector int __a, vector int __b) {
+  return __a ^ __b;
+}
+
+static vector int __ATTRS_o_ai vec_vxor(vector bool int __a, vector int __b) {
+  return (vector int)__a ^ __b;
+}
+
+static vector int __ATTRS_o_ai vec_vxor(vector int __a, vector bool int __b) {
+  return __a ^ (vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vxor(vector unsigned int __a,
+                                                 vector unsigned int __b) {
+  return __a ^ __b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vxor(vector bool int __a,
+                                                 vector unsigned int __b) {
+  return (vector unsigned int)__a ^ __b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vxor(vector unsigned int __a,
+                                                 vector bool int __b) {
+  return __a ^ (vector unsigned int)__b;
+}
+
+static vector bool int __ATTRS_o_ai vec_vxor(vector bool int __a,
+                                             vector bool int __b) {
+  return __a ^ __b;
+}
+
+static vector float __ATTRS_o_ai vec_vxor(vector float __a, vector float __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a ^ (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai vec_vxor(vector bool int __a,
+                                          vector float __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a ^ (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai vec_vxor(vector float __a,
+                                          vector bool int __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a ^ (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+#ifdef __VSX__
+static vector signed long long __ATTRS_o_ai
+vec_vxor(vector signed long long __a, vector signed long long __b) {
+  return __a ^ __b;
+}
+
+static vector signed long long __ATTRS_o_ai
+vec_vxor(vector bool long long __a, vector signed long long __b) {
+  return (vector signed long long)__a ^ __b;
+}
+
+static vector signed long long __ATTRS_o_ai
+vec_vxor(vector signed long long __a, vector bool long long __b) {
+  return __a ^ (vector signed long long)__b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_vxor(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a ^ __b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_vxor(vector bool long long __a, vector unsigned long long __b) {
+  return (vector unsigned long long)__a ^ __b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_vxor(vector unsigned long long __a, vector bool long long __b) {
+  return __a ^ (vector unsigned long long)__b;
+}
+
+static vector bool long long __ATTRS_o_ai vec_vxor(vector bool long long __a,
+                                                   vector bool long long __b) {
+  return __a ^ __b;
+}
+#endif
+
+/* ------------------------ extensions for CBEA ----------------------------- */
+
+/* vec_extract */
+
+static signed char __ATTRS_o_ai vec_extract(vector signed char __a, int __b) {
+  return __a[__b];
+}
+
+static unsigned char __ATTRS_o_ai vec_extract(vector unsigned char __a,
+                                              int __b) {
+  return __a[__b];
+}
+
+static unsigned char __ATTRS_o_ai vec_extract(vector bool char __a,
+                                              int __b) {
+  return __a[__b];
+}
+
+static signed short __ATTRS_o_ai vec_extract(vector signed short __a, int __b) {
+  return __a[__b];
+}
+
+static unsigned short __ATTRS_o_ai vec_extract(vector unsigned short __a,
+                                               int __b) {
+  return __a[__b];
+}
+
+static unsigned short __ATTRS_o_ai vec_extract(vector bool short __a,
+                                               int __b) {
+  return __a[__b];
+}
+
+static signed int __ATTRS_o_ai vec_extract(vector signed int __a, int __b) {
+  return __a[__b];
+}
+
+static unsigned int __ATTRS_o_ai vec_extract(vector unsigned int __a, int __b) {
+  return __a[__b];
+}
+
+static unsigned int __ATTRS_o_ai vec_extract(vector bool int __a, int __b) {
+  return __a[__b];
+}
+
+#ifdef __VSX__
+static signed long long __ATTRS_o_ai vec_extract(vector signed long long __a,
+                                                 int __b) {
+  return __a[__b];
+}
+
+static unsigned long long __ATTRS_o_ai
+vec_extract(vector unsigned long long __a, int __b) {
+  return __a[__b];
+}
+
+static unsigned long long __ATTRS_o_ai vec_extract(vector bool long long __a,
+                                                   int __b) {
+  return __a[__b];
+}
+
+static double __ATTRS_o_ai vec_extract(vector double __a, int __b) {
+  return __a[__b];
+}
+#endif
+
+static float __ATTRS_o_ai vec_extract(vector float __a, int __b) {
+  return __a[__b];
+}
+
+/* vec_insert */
+
+static vector signed char __ATTRS_o_ai vec_insert(signed char __a,
+                                                  vector signed char __b,
+                                                  int __c) {
+  __b[__c] = __a;
+  return __b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_insert(unsigned char __a,
+                                                    vector unsigned char __b,
+                                                    int __c) {
+  __b[__c] = __a;
+  return __b;
+}
+
+static vector bool char __ATTRS_o_ai vec_insert(unsigned char __a,
+                                                vector bool char __b,
+                                                int __c) {
+  __b[__c] = __a;
+  return __b;
+}
+
+static vector signed short __ATTRS_o_ai vec_insert(signed short __a,
+                                                   vector signed short __b,
+                                                   int __c) {
+  __b[__c] = __a;
+  return __b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_insert(unsigned short __a,
+                                                     vector unsigned short __b,
+                                                     int __c) {
+  __b[__c] = __a;
+  return __b;
+}
+
+static vector bool short __ATTRS_o_ai vec_insert(unsigned short __a,
+                                                 vector bool short __b,
+                                                 int __c) {
+  __b[__c] = __a;
+  return __b;
+}
+
+static vector signed int __ATTRS_o_ai vec_insert(signed int __a,
+                                                 vector signed int __b,
+                                                 int __c) {
+  __b[__c] = __a;
+  return __b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_insert(unsigned int __a,
+                                                   vector unsigned int __b,
+                                                   int __c) {
+  __b[__c] = __a;
+  return __b;
+}
+
+static vector bool int __ATTRS_o_ai vec_insert(unsigned int __a,
+                                               vector bool int __b,
+                                               int __c) {
+  __b[__c] = __a;
+  return __b;
+}
+
+#ifdef __VSX__
+static vector signed long long __ATTRS_o_ai
+vec_insert(signed long long __a, vector signed long long __b, int __c) {
+  __b[__c] = __a;
+  return __b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_insert(unsigned long long __a, vector unsigned long long __b, int __c) {
+  __b[__c] = __a;
+  return __b;
+}
+
+static vector bool long long __ATTRS_o_ai
+vec_insert(unsigned long long __a, vector bool long long __b, int __c) {
+  __b[__c] = __a;
+  return __b;
+}
+
+static vector double __ATTRS_o_ai vec_insert(double __a, vector double __b,
+                                             int __c) {
+  __b[__c] = __a;
+  return __b;
+}
+#endif
+
+static vector float __ATTRS_o_ai vec_insert(float __a, vector float __b,
+                                            int __c) {
+  __b[__c] = __a;
+  return __b;
+}
+
+/* vec_lvlx */
+
+static vector signed char __ATTRS_o_ai vec_lvlx(int __a,
+                                                const signed char *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector signed char)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static vector signed char __ATTRS_o_ai vec_lvlx(int __a,
+                                                const vector signed char *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector signed char)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector unsigned char __ATTRS_o_ai vec_lvlx(int __a,
+                                                  const unsigned char *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector unsigned char)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvlx(int __a, const vector unsigned char *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector unsigned char)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector bool char __ATTRS_o_ai vec_lvlx(int __a,
+                                              const vector bool char *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector bool char)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector short __ATTRS_o_ai vec_lvlx(int __a, const short *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector short)(0), vec_lvsl(__a, __b));
+}
+
+static vector short __ATTRS_o_ai vec_lvlx(int __a, const vector short *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector short)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector unsigned short __ATTRS_o_ai vec_lvlx(int __a,
+                                                   const unsigned short *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector unsigned short)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_lvlx(int __a, const vector unsigned short *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector unsigned short)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector bool short __ATTRS_o_ai vec_lvlx(int __a,
+                                               const vector bool short *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector bool short)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector pixel __ATTRS_o_ai vec_lvlx(int __a, const vector pixel *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector pixel)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector int __ATTRS_o_ai vec_lvlx(int __a, const int *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector int)(0), vec_lvsl(__a, __b));
+}
+
+static vector int __ATTRS_o_ai vec_lvlx(int __a, const vector int *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector int)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector unsigned int __ATTRS_o_ai vec_lvlx(int __a,
+                                                 const unsigned int *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector unsigned int)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_lvlx(int __a, const vector unsigned int *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector unsigned int)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector bool int __ATTRS_o_ai vec_lvlx(int __a,
+                                             const vector bool int *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector bool int)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector float __ATTRS_o_ai vec_lvlx(int __a, const float *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector float)(0), vec_lvsl(__a, __b));
+}
+
+static vector float __ATTRS_o_ai vec_lvlx(int __a, const vector float *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector float)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+/* vec_lvlxl */
+
+static vector signed char __ATTRS_o_ai vec_lvlxl(int __a,
+                                                 const signed char *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector signed char)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static vector signed char __ATTRS_o_ai
+vec_lvlxl(int __a, const vector signed char *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector signed char)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector unsigned char __ATTRS_o_ai vec_lvlxl(int __a,
+                                                   const unsigned char *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector unsigned char)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvlxl(int __a, const vector unsigned char *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector unsigned char)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector bool char __ATTRS_o_ai vec_lvlxl(int __a,
+                                               const vector bool char *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector bool char)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector short __ATTRS_o_ai vec_lvlxl(int __a, const short *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector short)(0), vec_lvsl(__a, __b));
+}
+
+static vector short __ATTRS_o_ai vec_lvlxl(int __a, const vector short *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector short)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector unsigned short __ATTRS_o_ai vec_lvlxl(int __a,
+                                                    const unsigned short *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector unsigned short)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_lvlxl(int __a, const vector unsigned short *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector unsigned short)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector bool short __ATTRS_o_ai vec_lvlxl(int __a,
+                                                const vector bool short *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector bool short)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector pixel __ATTRS_o_ai vec_lvlxl(int __a, const vector pixel *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector pixel)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector int __ATTRS_o_ai vec_lvlxl(int __a, const int *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector int)(0), vec_lvsl(__a, __b));
+}
+
+static vector int __ATTRS_o_ai vec_lvlxl(int __a, const vector int *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector int)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector unsigned int __ATTRS_o_ai vec_lvlxl(int __a,
+                                                  const unsigned int *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector unsigned int)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_lvlxl(int __a, const vector unsigned int *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector unsigned int)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector bool int __ATTRS_o_ai vec_lvlxl(int __a,
+                                              const vector bool int *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector bool int)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector float __ATTRS_o_ai vec_lvlxl(int __a, const float *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector float)(0), vec_lvsl(__a, __b));
+}
+
+static vector float __ATTRS_o_ai vec_lvlxl(int __a, const vector float *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector float)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+/* vec_lvrx */
+
+static vector signed char __ATTRS_o_ai vec_lvrx(int __a,
+                                                const signed char *__b) {
+  return vec_perm((vector signed char)(0), vec_ld(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static vector signed char __ATTRS_o_ai vec_lvrx(int __a,
+                                                const vector signed char *__b) {
+  return vec_perm((vector signed char)(0), vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector unsigned char __ATTRS_o_ai vec_lvrx(int __a,
+                                                  const unsigned char *__b) {
+  return vec_perm((vector unsigned char)(0), vec_ld(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvrx(int __a, const vector unsigned char *__b) {
+  return vec_perm((vector unsigned char)(0), vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector bool char __ATTRS_o_ai vec_lvrx(int __a,
+                                              const vector bool char *__b) {
+  return vec_perm((vector bool char)(0), vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector short __ATTRS_o_ai vec_lvrx(int __a, const short *__b) {
+  return vec_perm((vector short)(0), vec_ld(__a, __b), vec_lvsl(__a, __b));
+}
+
+static vector short __ATTRS_o_ai vec_lvrx(int __a, const vector short *__b) {
+  return vec_perm((vector short)(0), vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector unsigned short __ATTRS_o_ai vec_lvrx(int __a,
+                                                   const unsigned short *__b) {
+  return vec_perm((vector unsigned short)(0), vec_ld(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_lvrx(int __a, const vector unsigned short *__b) {
+  return vec_perm((vector unsigned short)(0), vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector bool short __ATTRS_o_ai vec_lvrx(int __a,
+                                               const vector bool short *__b) {
+  return vec_perm((vector bool short)(0), vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector pixel __ATTRS_o_ai vec_lvrx(int __a, const vector pixel *__b) {
+  return vec_perm((vector pixel)(0), vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector int __ATTRS_o_ai vec_lvrx(int __a, const int *__b) {
+  return vec_perm((vector int)(0), vec_ld(__a, __b), vec_lvsl(__a, __b));
+}
+
+static vector int __ATTRS_o_ai vec_lvrx(int __a, const vector int *__b) {
+  return vec_perm((vector int)(0), vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector unsigned int __ATTRS_o_ai vec_lvrx(int __a,
+                                                 const unsigned int *__b) {
+  return vec_perm((vector unsigned int)(0), vec_ld(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_lvrx(int __a, const vector unsigned int *__b) {
+  return vec_perm((vector unsigned int)(0), vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector bool int __ATTRS_o_ai vec_lvrx(int __a,
+                                             const vector bool int *__b) {
+  return vec_perm((vector bool int)(0), vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector float __ATTRS_o_ai vec_lvrx(int __a, const float *__b) {
+  return vec_perm((vector float)(0), vec_ld(__a, __b), vec_lvsl(__a, __b));
+}
+
+static vector float __ATTRS_o_ai vec_lvrx(int __a, const vector float *__b) {
+  return vec_perm((vector float)(0), vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+/* vec_lvrxl */
+
+static vector signed char __ATTRS_o_ai vec_lvrxl(int __a,
+                                                 const signed char *__b) {
+  return vec_perm((vector signed char)(0), vec_ldl(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static vector signed char __ATTRS_o_ai
+vec_lvrxl(int __a, const vector signed char *__b) {
+  return vec_perm((vector signed char)(0), vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector unsigned char __ATTRS_o_ai vec_lvrxl(int __a,
+                                                   const unsigned char *__b) {
+  return vec_perm((vector unsigned char)(0), vec_ldl(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvrxl(int __a, const vector unsigned char *__b) {
+  return vec_perm((vector unsigned char)(0), vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector bool char __ATTRS_o_ai vec_lvrxl(int __a,
+                                               const vector bool char *__b) {
+  return vec_perm((vector bool char)(0), vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector short __ATTRS_o_ai vec_lvrxl(int __a, const short *__b) {
+  return vec_perm((vector short)(0), vec_ldl(__a, __b), vec_lvsl(__a, __b));
+}
+
+static vector short __ATTRS_o_ai vec_lvrxl(int __a, const vector short *__b) {
+  return vec_perm((vector short)(0), vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector unsigned short __ATTRS_o_ai vec_lvrxl(int __a,
+                                                    const unsigned short *__b) {
+  return vec_perm((vector unsigned short)(0), vec_ldl(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_lvrxl(int __a, const vector unsigned short *__b) {
+  return vec_perm((vector unsigned short)(0), vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector bool short __ATTRS_o_ai vec_lvrxl(int __a,
+                                                const vector bool short *__b) {
+  return vec_perm((vector bool short)(0), vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector pixel __ATTRS_o_ai vec_lvrxl(int __a, const vector pixel *__b) {
+  return vec_perm((vector pixel)(0), vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector int __ATTRS_o_ai vec_lvrxl(int __a, const int *__b) {
+  return vec_perm((vector int)(0), vec_ldl(__a, __b), vec_lvsl(__a, __b));
+}
+
+static vector int __ATTRS_o_ai vec_lvrxl(int __a, const vector int *__b) {
+  return vec_perm((vector int)(0), vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector unsigned int __ATTRS_o_ai vec_lvrxl(int __a,
+                                                  const unsigned int *__b) {
+  return vec_perm((vector unsigned int)(0), vec_ldl(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_lvrxl(int __a, const vector unsigned int *__b) {
+  return vec_perm((vector unsigned int)(0), vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector bool int __ATTRS_o_ai vec_lvrxl(int __a,
+                                              const vector bool int *__b) {
+  return vec_perm((vector bool int)(0), vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector float __ATTRS_o_ai vec_lvrxl(int __a, const float *__b) {
+  return vec_perm((vector float)(0), vec_ldl(__a, __b), vec_lvsl(__a, __b));
+}
+
+static vector float __ATTRS_o_ai vec_lvrxl(int __a, const vector float *__b) {
+  return vec_perm((vector float)(0), vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+/* vec_stvlx */
+
+static void __ATTRS_o_ai vec_stvlx(vector signed char __a, int __b,
+                                   signed char *__c) {
+  return vec_st(vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, __c)), __b,
+                __c);
+}
+
+static void __ATTRS_o_ai vec_stvlx(vector signed char __a, int __b,
+                                   vector signed char *__c) {
+  return vec_st(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvlx(vector unsigned char __a, int __b,
+                                   unsigned char *__c) {
+  return vec_st(vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, __c)), __b,
+                __c);
+}
+
+static void __ATTRS_o_ai vec_stvlx(vector unsigned char __a, int __b,
+                                   vector unsigned char *__c) {
+  return vec_st(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvlx(vector bool char __a, int __b,
+                                   vector bool char *__c) {
+  return vec_st(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvlx(vector short __a, int __b, short *__c) {
+  return vec_st(vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, __c)), __b,
+                __c);
+}
+
+static void __ATTRS_o_ai vec_stvlx(vector short __a, int __b,
+                                   vector short *__c) {
+  return vec_st(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvlx(vector unsigned short __a, int __b,
+                                   unsigned short *__c) {
+  return vec_st(vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, __c)), __b,
+                __c);
+}
+
+static void __ATTRS_o_ai vec_stvlx(vector unsigned short __a, int __b,
+                                   vector unsigned short *__c) {
+  return vec_st(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvlx(vector bool short __a, int __b,
+                                   vector bool short *__c) {
+  return vec_st(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvlx(vector pixel __a, int __b,
+                                   vector pixel *__c) {
+  return vec_st(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvlx(vector int __a, int __b, int *__c) {
+  return vec_st(vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, __c)), __b,
+                __c);
+}
+
+static void __ATTRS_o_ai vec_stvlx(vector int __a, int __b, vector int *__c) {
+  return vec_st(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvlx(vector unsigned int __a, int __b,
+                                   unsigned int *__c) {
+  return vec_st(vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, __c)), __b,
+                __c);
+}
+
+static void __ATTRS_o_ai vec_stvlx(vector unsigned int __a, int __b,
+                                   vector unsigned int *__c) {
+  return vec_st(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvlx(vector bool int __a, int __b,
+                                   vector bool int *__c) {
+  return vec_st(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvlx(vector float __a, int __b,
+                                   vector float *__c) {
+  return vec_st(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+/* vec_stvlxl */
+
+static void __ATTRS_o_ai vec_stvlxl(vector signed char __a, int __b,
+                                    signed char *__c) {
+  return vec_stl(vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, __c)), __b,
+                 __c);
+}
+
+static void __ATTRS_o_ai vec_stvlxl(vector signed char __a, int __b,
+                                    vector signed char *__c) {
+  return vec_stl(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvlxl(vector unsigned char __a, int __b,
+                                    unsigned char *__c) {
+  return vec_stl(vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, __c)), __b,
+                 __c);
+}
+
+static void __ATTRS_o_ai vec_stvlxl(vector unsigned char __a, int __b,
+                                    vector unsigned char *__c) {
+  return vec_stl(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvlxl(vector bool char __a, int __b,
+                                    vector bool char *__c) {
+  return vec_stl(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvlxl(vector short __a, int __b, short *__c) {
+  return vec_stl(vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, __c)), __b,
+                 __c);
+}
+
+static void __ATTRS_o_ai vec_stvlxl(vector short __a, int __b,
+                                    vector short *__c) {
+  return vec_stl(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvlxl(vector unsigned short __a, int __b,
+                                    unsigned short *__c) {
+  return vec_stl(vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, __c)), __b,
+                 __c);
+}
+
+static void __ATTRS_o_ai vec_stvlxl(vector unsigned short __a, int __b,
+                                    vector unsigned short *__c) {
+  return vec_stl(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvlxl(vector bool short __a, int __b,
+                                    vector bool short *__c) {
+  return vec_stl(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvlxl(vector pixel __a, int __b,
+                                    vector pixel *__c) {
+  return vec_stl(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvlxl(vector int __a, int __b, int *__c) {
+  return vec_stl(vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, __c)), __b,
+                 __c);
+}
+
+static void __ATTRS_o_ai vec_stvlxl(vector int __a, int __b, vector int *__c) {
+  return vec_stl(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvlxl(vector unsigned int __a, int __b,
+                                    unsigned int *__c) {
+  return vec_stl(vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, __c)), __b,
+                 __c);
+}
+
+static void __ATTRS_o_ai vec_stvlxl(vector unsigned int __a, int __b,
+                                    vector unsigned int *__c) {
+  return vec_stl(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvlxl(vector bool int __a, int __b,
+                                    vector bool int *__c) {
+  return vec_stl(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvlxl(vector float __a, int __b,
+                                    vector float *__c) {
+  return vec_stl(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+/* vec_stvrx */
+
+static void __ATTRS_o_ai vec_stvrx(vector signed char __a, int __b,
+                                   signed char *__c) {
+  return vec_st(vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, __c)), __b,
+                __c);
+}
+
+static void __ATTRS_o_ai vec_stvrx(vector signed char __a, int __b,
+                                   vector signed char *__c) {
+  return vec_st(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvrx(vector unsigned char __a, int __b,
+                                   unsigned char *__c) {
+  return vec_st(vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, __c)), __b,
+                __c);
+}
+
+static void __ATTRS_o_ai vec_stvrx(vector unsigned char __a, int __b,
+                                   vector unsigned char *__c) {
+  return vec_st(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvrx(vector bool char __a, int __b,
+                                   vector bool char *__c) {
+  return vec_st(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvrx(vector short __a, int __b, short *__c) {
+  return vec_st(vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, __c)), __b,
+                __c);
+}
+
+static void __ATTRS_o_ai vec_stvrx(vector short __a, int __b,
+                                   vector short *__c) {
+  return vec_st(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvrx(vector unsigned short __a, int __b,
+                                   unsigned short *__c) {
+  return vec_st(vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, __c)), __b,
+                __c);
+}
+
+static void __ATTRS_o_ai vec_stvrx(vector unsigned short __a, int __b,
+                                   vector unsigned short *__c) {
+  return vec_st(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvrx(vector bool short __a, int __b,
+                                   vector bool short *__c) {
+  return vec_st(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvrx(vector pixel __a, int __b,
+                                   vector pixel *__c) {
+  return vec_st(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvrx(vector int __a, int __b, int *__c) {
+  return vec_st(vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, __c)), __b,
+                __c);
+}
+
+static void __ATTRS_o_ai vec_stvrx(vector int __a, int __b, vector int *__c) {
+  return vec_st(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvrx(vector unsigned int __a, int __b,
+                                   unsigned int *__c) {
+  return vec_st(vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, __c)), __b,
+                __c);
+}
+
+static void __ATTRS_o_ai vec_stvrx(vector unsigned int __a, int __b,
+                                   vector unsigned int *__c) {
+  return vec_st(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvrx(vector bool int __a, int __b,
+                                   vector bool int *__c) {
+  return vec_st(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvrx(vector float __a, int __b,
+                                   vector float *__c) {
+  return vec_st(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+/* vec_stvrxl */
+
+static void __ATTRS_o_ai vec_stvrxl(vector signed char __a, int __b,
+                                    signed char *__c) {
+  return vec_stl(vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, __c)), __b,
+                 __c);
+}
+
+static void __ATTRS_o_ai vec_stvrxl(vector signed char __a, int __b,
+                                    vector signed char *__c) {
+  return vec_stl(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvrxl(vector unsigned char __a, int __b,
+                                    unsigned char *__c) {
+  return vec_stl(vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, __c)), __b,
+                 __c);
+}
+
+static void __ATTRS_o_ai vec_stvrxl(vector unsigned char __a, int __b,
+                                    vector unsigned char *__c) {
+  return vec_stl(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvrxl(vector bool char __a, int __b,
+                                    vector bool char *__c) {
+  return vec_stl(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvrxl(vector short __a, int __b, short *__c) {
+  return vec_stl(vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, __c)), __b,
+                 __c);
+}
+
+static void __ATTRS_o_ai vec_stvrxl(vector short __a, int __b,
+                                    vector short *__c) {
+  return vec_stl(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvrxl(vector unsigned short __a, int __b,
+                                    unsigned short *__c) {
+  return vec_stl(vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, __c)), __b,
+                 __c);
+}
+
+static void __ATTRS_o_ai vec_stvrxl(vector unsigned short __a, int __b,
+                                    vector unsigned short *__c) {
+  return vec_stl(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvrxl(vector bool short __a, int __b,
+                                    vector bool short *__c) {
+  return vec_stl(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvrxl(vector pixel __a, int __b,
+                                    vector pixel *__c) {
+  return vec_stl(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvrxl(vector int __a, int __b, int *__c) {
+  return vec_stl(vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, __c)), __b,
+                 __c);
+}
+
+static void __ATTRS_o_ai vec_stvrxl(vector int __a, int __b, vector int *__c) {
+  return vec_stl(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvrxl(vector unsigned int __a, int __b,
+                                    unsigned int *__c) {
+  return vec_stl(vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, __c)), __b,
+                 __c);
+}
+
+static void __ATTRS_o_ai vec_stvrxl(vector unsigned int __a, int __b,
+                                    vector unsigned int *__c) {
+  return vec_stl(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvrxl(vector bool int __a, int __b,
+                                    vector bool int *__c) {
+  return vec_stl(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvrxl(vector float __a, int __b,
+                                    vector float *__c) {
+  return vec_stl(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
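+/* Illustrative sketch: vec_stvlx/vec_stvrx are the store-side counterparts
+ * of vec_lvlx/vec_lvrx. As the definitions above show, each loads the
+ * destination, permutes in the part of __a that falls on its side of the
+ * 16-byte boundary, and stores the merge, so the pair performs an unaligned
+ * vector store without clobbering neighbouring bytes; `buf` and `off` below
+ * are hypothetical.
+ *
+ *   int buf[32];
+ *   int off = 4;                   // need not be 16-aligned
+ *   vector int v = vec_splats(7);
+ *   vec_stvlx(v, off, buf);        // left part, up to the block boundary
+ *   vec_stvrx(v, off + 16, buf);   // right part, past the block boundary
+ */
+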
+/* vec_promote */
+
+static vector signed char __ATTRS_o_ai vec_promote(signed char __a, int __b) {
+  vector signed char __res = (vector signed char)(0);
+  __res[__b] = __a;
+  return __res;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_promote(unsigned char __a,
+                                                     int __b) {
+  vector unsigned char __res = (vector unsigned char)(0);
+  __res[__b] = __a;
+  return __res;
+}
+
+static vector short __ATTRS_o_ai vec_promote(short __a, int __b) {
+  vector short __res = (vector short)(0);
+  __res[__b] = __a;
+  return __res;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_promote(unsigned short __a,
+                                                      int __b) {
+  vector unsigned short __res = (vector unsigned short)(0);
+  __res[__b] = __a;
+  return __res;
+}
+
+static vector int __ATTRS_o_ai vec_promote(int __a, int __b) {
+  vector int __res = (vector int)(0);
+  __res[__b] = __a;
+  return __res;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_promote(unsigned int __a, int __b) {
+  vector unsigned int __res = (vector unsigned int)(0);
+  __res[__b] = __a;
+  return __res;
+}
+
+static vector float __ATTRS_o_ai vec_promote(float __a, int __b) {
+  vector float __res = (vector float)(0);
+  __res[__b] = __a;
+  return __res;
+}
+
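+/* Illustrative sketch: vec_promote writes a scalar into element __b of an
+ * otherwise zeroed vector, which is convenient for building a vector one
+ * lane at a time.
+ *
+ *   vector int v = vec_promote(42, 3);   // lanes: {0, 0, 0, 42}
+ */
+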
+/* vec_splats */
+
+static vector signed char __ATTRS_o_ai vec_splats(signed char __a) {
+  return (vector signed char)(__a);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_splats(unsigned char __a) {
+  return (vector unsigned char)(__a);
+}
+
+static vector short __ATTRS_o_ai vec_splats(short __a) {
+  return (vector short)(__a);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_splats(unsigned short __a) {
+  return (vector unsigned short)(__a);
+}
+
+static vector int __ATTRS_o_ai vec_splats(int __a) { return (vector int)(__a); }
+
+static vector unsigned int __ATTRS_o_ai vec_splats(unsigned int __a) {
+  return (vector unsigned int)(__a);
+}
+
+#ifdef __VSX__
+static vector signed long long __ATTRS_o_ai vec_splats(signed long long __a) {
+  return (vector signed long long)(__a);
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_splats(unsigned long long __a) {
+  return (vector unsigned long long)(__a);
+}
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static vector signed __int128 __ATTRS_o_ai vec_splats(signed __int128 __a) {
+  return (vector signed __int128)(__a);
+}
+
+static vector unsigned __int128 __ATTRS_o_ai
+vec_splats(unsigned __int128 __a) {
+  return (vector unsigned __int128)(__a);
+}
+
+#endif
+
+static vector double __ATTRS_o_ai vec_splats(double __a) {
+  return (vector double)(__a);
+}
+#endif
+
+static vector float __ATTRS_o_ai vec_splats(float __a) {
+  return (vector float)(__a);
+}
+
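+/* Illustrative sketch: vec_splats broadcasts a scalar to every lane using
+ * the AltiVec vector-literal syntax, in which a single initializer value is
+ * replicated across all elements.
+ *
+ *   vector float ones = vec_splats(1.0f);      // {1.0f, 1.0f, 1.0f, 1.0f}
+ *   vector short  neg = vec_splats((short)-1); // all eight lanes are -1
+ */
+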
+/* ----------------------------- predicates --------------------------------- */
+
+/* vec_all_eq */
+
+static int __ATTRS_o_ai vec_all_eq(vector signed char __a,
+                                   vector signed char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_LT, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static int __ATTRS_o_ai vec_all_eq(vector signed char __a,
+                                   vector bool char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_LT, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static int __ATTRS_o_ai vec_all_eq(vector unsigned char __a,
+                                   vector unsigned char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_LT, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static int __ATTRS_o_ai vec_all_eq(vector unsigned char __a,
+                                   vector bool char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_LT, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static int __ATTRS_o_ai vec_all_eq(vector bool char __a,
+                                   vector signed char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_LT, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static int __ATTRS_o_ai vec_all_eq(vector bool char __a,
+                                   vector unsigned char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_LT, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static int __ATTRS_o_ai vec_all_eq(vector bool char __a, vector bool char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_LT, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static int __ATTRS_o_ai vec_all_eq(vector short __a, vector short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_LT, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_all_eq(vector short __a, vector bool short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_LT, __a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai vec_all_eq(vector unsigned short __a,
+                                   vector unsigned short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_LT, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai vec_all_eq(vector unsigned short __a,
+                                   vector bool short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_LT, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai vec_all_eq(vector bool short __a, vector short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_LT, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai vec_all_eq(vector bool short __a,
+                                   vector unsigned short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_LT, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai vec_all_eq(vector bool short __a,
+                                   vector bool short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_LT, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai vec_all_eq(vector pixel __a, vector pixel __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_LT, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai vec_all_eq(vector int __a, vector int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_LT, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_all_eq(vector int __a, vector bool int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_LT, __a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai vec_all_eq(vector unsigned int __a,
+                                   vector unsigned int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_LT, (vector int)__a,
+                                      (vector int)__b);
+}
+
+static int __ATTRS_o_ai vec_all_eq(vector unsigned int __a,
+                                   vector bool int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_LT, (vector int)__a,
+                                      (vector int)__b);
+}
+
+static int __ATTRS_o_ai vec_all_eq(vector bool int __a, vector int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_LT, (vector int)__a,
+                                      (vector int)__b);
+}
+
+static int __ATTRS_o_ai vec_all_eq(vector bool int __a,
+                                   vector unsigned int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_LT, (vector int)__a,
+                                      (vector int)__b);
+}
+
+static int __ATTRS_o_ai vec_all_eq(vector bool int __a, vector bool int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_LT, (vector int)__a,
+                                      (vector int)__b);
+}
+
+#ifdef __POWER8_VECTOR__
+static int __ATTRS_o_ai vec_all_eq(vector signed long long __a,
+                                   vector signed long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_LT, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_all_eq(vector long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_LT, __a, (vector long long)__b);
+}
+
+static int __ATTRS_o_ai vec_all_eq(vector unsigned long long __a,
+                                   vector unsigned long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_LT, (vector long long)__a,
+                                      (vector long long)__b);
+}
+
+static int __ATTRS_o_ai vec_all_eq(vector unsigned long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_LT, (vector long long)__a,
+                                      (vector long long)__b);
+}
+
+static int __ATTRS_o_ai vec_all_eq(vector bool long long __a,
+                                   vector long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_LT, (vector long long)__a,
+                                      (vector long long)__b);
+}
+
+static int __ATTRS_o_ai vec_all_eq(vector bool long long __a,
+                                   vector unsigned long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_LT, (vector long long)__a,
+                                      (vector long long)__b);
+}
+
+static int __ATTRS_o_ai vec_all_eq(vector bool long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_LT, (vector long long)__a,
+                                      (vector long long)__b);
+}
+#endif
+
+static int __ATTRS_o_ai vec_all_eq(vector float __a, vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpeqsp_p(__CR6_LT, __a, __b);
+#else
+  return __builtin_altivec_vcmpeqfp_p(__CR6_LT, __a, __b);
+#endif
+}
+
+#ifdef __VSX__
+static int __ATTRS_o_ai vec_all_eq(vector double __a, vector double __b) {
+  return __builtin_vsx_xvcmpeqdp_p(__CR6_LT, __a, __b);
+}
+#endif
+
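+/* Illustrative sketch: the vec_all_* predicates lower to the record-form
+ * compare builtins and test CR6; vec_all_eq returns nonzero only when every
+ * lane compares equal.
+ *
+ *   vector int x = vec_splats(5);
+ *   vector int y = vec_splats(5);
+ *   int all_equal = vec_all_eq(x, y);   // 1: all four lanes match
+ */
+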
+/* vec_all_ge */
+
+static int __ATTRS_o_ai vec_all_ge(vector signed char __a,
+                                   vector signed char __b) {
+  return __builtin_altivec_vcmpgtsb_p(__CR6_EQ, __b, __a);
+}
+
+static int __ATTRS_o_ai vec_all_ge(vector signed char __a,
+                                   vector bool char __b) {
+  return __builtin_altivec_vcmpgtsb_p(__CR6_EQ, (vector signed char)__b, __a);
+}
+
+static int __ATTRS_o_ai vec_all_ge(vector unsigned char __a,
+                                   vector unsigned char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ, __b, __a);
+}
+
+static int __ATTRS_o_ai vec_all_ge(vector unsigned char __a,
+                                   vector bool char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ, (vector unsigned char)__b, __a);
+}
+
+static int __ATTRS_o_ai vec_all_ge(vector bool char __a,
+                                   vector signed char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ, (vector unsigned char)__b,
+                                      (vector unsigned char)__a);
+}
+
+static int __ATTRS_o_ai vec_all_ge(vector bool char __a,
+                                   vector unsigned char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ, __b, (vector unsigned char)__a);
+}
+
+static int __ATTRS_o_ai vec_all_ge(vector bool char __a, vector bool char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ, (vector unsigned char)__b,
+                                      (vector unsigned char)__a);
+}
+
+static int __ATTRS_o_ai vec_all_ge(vector short __a, vector short __b) {
+  return __builtin_altivec_vcmpgtsh_p(__CR6_EQ, __b, __a);
+}
+
+static int __ATTRS_o_ai vec_all_ge(vector short __a, vector bool short __b) {
+  return __builtin_altivec_vcmpgtsh_p(__CR6_EQ, (vector short)__b, __a);
+}
+
+static int __ATTRS_o_ai vec_all_ge(vector unsigned short __a,
+                                   vector unsigned short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ, __b, __a);
+}
+
+static int __ATTRS_o_ai vec_all_ge(vector unsigned short __a,
+                                   vector bool short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ, (vector unsigned short)__b,
+                                      __a);
+}
+
+static int __ATTRS_o_ai vec_all_ge(vector bool short __a, vector short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ, (vector unsigned short)__b,
+                                      (vector unsigned short)__a);
+}
+
+static int __ATTRS_o_ai vec_all_ge(vector bool short __a,
+                                   vector unsigned short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ, __b,
+                                      (vector unsigned short)__a);
+}
+
+static int __ATTRS_o_ai vec_all_ge(vector bool short __a,
+                                   vector bool short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ, (vector unsigned short)__b,
+                                      (vector unsigned short)__a);
+}
+
+static int __ATTRS_o_ai vec_all_ge(vector int __a, vector int __b) {
+  return __builtin_altivec_vcmpgtsw_p(__CR6_EQ, __b, __a);
+}
+
+static int __ATTRS_o_ai vec_all_ge(vector int __a, vector bool int __b) {
+  return __builtin_altivec_vcmpgtsw_p(__CR6_EQ, (vector int)__b, __a);
+}
+
+static int __ATTRS_o_ai vec_all_ge(vector unsigned int __a,
+                                   vector unsigned int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ, __b, __a);
+}
+
+static int __ATTRS_o_ai vec_all_ge(vector unsigned int __a,
+                                   vector bool int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ, (vector unsigned int)__b, __a);
+}
+
+static int __ATTRS_o_ai vec_all_ge(vector bool int __a, vector int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ, (vector unsigned int)__b,
+                                      (vector unsigned int)__a);
+}
+
+static int __ATTRS_o_ai vec_all_ge(vector bool int __a,
+                                   vector unsigned int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ, __b, (vector unsigned int)__a);
+}
+
+static int __ATTRS_o_ai vec_all_ge(vector bool int __a, vector bool int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ, (vector unsigned int)__b,
+                                      (vector unsigned int)__a);
+}
+
+#ifdef __POWER8_VECTOR__
+static int __ATTRS_o_ai vec_all_ge(vector signed long long __a,
+                                   vector signed long long __b) {
+  return __builtin_altivec_vcmpgtsd_p(__CR6_EQ, __b, __a);
+}
+static int __ATTRS_o_ai vec_all_ge(vector signed long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpgtsd_p(__CR6_EQ, (vector signed long long)__b,
+                                      __a);
+}
+
+static int __ATTRS_o_ai vec_all_ge(vector unsigned long long __a,
+                                   vector unsigned long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ, __b, __a);
+}
+
+static int __ATTRS_o_ai vec_all_ge(vector unsigned long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ, (vector unsigned long long)__b,
+                                      __a);
+}
+
+static int __ATTRS_o_ai vec_all_ge(vector bool long long __a,
+                                   vector signed long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ, (vector unsigned long long)__b,
+                                      (vector unsigned long long)__a);
+}
+
+static int __ATTRS_o_ai vec_all_ge(vector bool long long __a,
+                                   vector unsigned long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ, __b,
+                                      (vector unsigned long long)__a);
+}
+
+static int __ATTRS_o_ai vec_all_ge(vector bool long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ, (vector unsigned long long)__b,
+                                      (vector unsigned long long)__a);
+}
+#endif
+
+static int __ATTRS_o_ai vec_all_ge(vector float __a, vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpgesp_p(__CR6_LT, __a, __b);
+#else
+  return __builtin_altivec_vcmpgefp_p(__CR6_LT, __a, __b);
+#endif
+}
+
+#ifdef __VSX__
+static int __ATTRS_o_ai vec_all_ge(vector double __a, vector double __b) {
+  return __builtin_vsx_xvcmpgedp_p(__CR6_LT, __a, __b);
+}
+#endif
+
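+/* Illustrative sketch: there is no dedicated "greater-or-equal" compare, so
+ * vec_all_ge above is phrased as "greater-than is false in every lane" with
+ * the operands swapped (__CR6_EQ on vcmpgt*(__b, __a)), while vec_all_gt
+ * below uses the compare directly (__CR6_LT).
+ *
+ *   vector unsigned int a = vec_splats(3u);
+ *   vector unsigned int b = vec_splats(2u);
+ *   int ge = vec_all_ge(a, b);   // 1: a >= b in every lane
+ *   int gt = vec_all_gt(a, b);   // 1: a >  b in every lane
+ */
+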
+/* vec_all_gt */
+
+static int __ATTRS_o_ai vec_all_gt(vector signed char __a,
+                                   vector signed char __b) {
+  return __builtin_altivec_vcmpgtsb_p(__CR6_LT, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_all_gt(vector signed char __a,
+                                   vector bool char __b) {
+  return __builtin_altivec_vcmpgtsb_p(__CR6_LT, __a, (vector signed char)__b);
+}
+
+static int __ATTRS_o_ai vec_all_gt(vector unsigned char __a,
+                                   vector unsigned char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_all_gt(vector unsigned char __a,
+                                   vector bool char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT, __a, (vector unsigned char)__b);
+}
+
+static int __ATTRS_o_ai vec_all_gt(vector bool char __a,
+                                   vector signed char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT, (vector unsigned char)__a,
+                                      (vector unsigned char)__b);
+}
+
+static int __ATTRS_o_ai vec_all_gt(vector bool char __a,
+                                   vector unsigned char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT, (vector unsigned char)__a, __b);
+}
+
+static int __ATTRS_o_ai vec_all_gt(vector bool char __a, vector bool char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT, (vector unsigned char)__a,
+                                      (vector unsigned char)__b);
+}
+
+static int __ATTRS_o_ai vec_all_gt(vector short __a, vector short __b) {
+  return __builtin_altivec_vcmpgtsh_p(__CR6_LT, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_all_gt(vector short __a, vector bool short __b) {
+  return __builtin_altivec_vcmpgtsh_p(__CR6_LT, __a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai vec_all_gt(vector unsigned short __a,
+                                   vector unsigned short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_all_gt(vector unsigned short __a,
+                                   vector bool short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT, __a,
+                                      (vector unsigned short)__b);
+}
+
+static int __ATTRS_o_ai vec_all_gt(vector bool short __a, vector short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT, (vector unsigned short)__a,
+                                      (vector unsigned short)__b);
+}
+
+static int __ATTRS_o_ai vec_all_gt(vector bool short __a,
+                                   vector unsigned short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT, (vector unsigned short)__a,
+                                      __b);
+}
+
+static int __ATTRS_o_ai vec_all_gt(vector bool short __a,
+                                   vector bool short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT, (vector unsigned short)__a,
+                                      (vector unsigned short)__b);
+}
+
+static int __ATTRS_o_ai vec_all_gt(vector int __a, vector int __b) {
+  return __builtin_altivec_vcmpgtsw_p(__CR6_LT, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_all_gt(vector int __a, vector bool int __b) {
+  return __builtin_altivec_vcmpgtsw_p(__CR6_LT, __a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai vec_all_gt(vector unsigned int __a,
+                                   vector unsigned int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_all_gt(vector unsigned int __a,
+                                   vector bool int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT, __a, (vector unsigned int)__b);
+}
+
+static int __ATTRS_o_ai vec_all_gt(vector bool int __a, vector int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT, (vector unsigned int)__a,
+                                      (vector unsigned int)__b);
+}
+
+static int __ATTRS_o_ai vec_all_gt(vector bool int __a,
+                                   vector unsigned int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT, (vector unsigned int)__a, __b);
+}
+
+static int __ATTRS_o_ai vec_all_gt(vector bool int __a, vector bool int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT, (vector unsigned int)__a,
+                                      (vector unsigned int)__b);
+}
+
+#ifdef __POWER8_VECTOR__
+static int __ATTRS_o_ai vec_all_gt(vector signed long long __a,
+                                   vector signed long long __b) {
+  return __builtin_altivec_vcmpgtsd_p(__CR6_LT, __a, __b);
+}
+static int __ATTRS_o_ai vec_all_gt(vector signed long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpgtsd_p(__CR6_LT, __a,
+                                      (vector signed long long)__b);
+}
+
+static int __ATTRS_o_ai vec_all_gt(vector unsigned long long __a,
+                                   vector unsigned long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_all_gt(vector unsigned long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT, __a,
+                                      (vector unsigned long long)__b);
+}
+
+static int __ATTRS_o_ai vec_all_gt(vector bool long long __a,
+                                   vector signed long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT, (vector unsigned long long)__a,
+                                      (vector unsigned long long)__b);
+}
+
+static int __ATTRS_o_ai vec_all_gt(vector bool long long __a,
+                                   vector unsigned long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT, (vector unsigned long long)__a,
+                                      __b);
+}
+
+static int __ATTRS_o_ai vec_all_gt(vector bool long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT, (vector unsigned long long)__a,
+                                      (vector unsigned long long)__b);
+}
+#endif
+
+static int __ATTRS_o_ai vec_all_gt(vector float __a, vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpgtsp_p(__CR6_LT, __a, __b);
+#else
+  return __builtin_altivec_vcmpgtfp_p(__CR6_LT, __a, __b);
+#endif
+}
+
+#ifdef __VSX__
+static int __ATTRS_o_ai vec_all_gt(vector double __a, vector double __b) {
+  return __builtin_vsx_xvcmpgtdp_p(__CR6_LT, __a, __b);
+}
+#endif
+
+/* vec_all_in */
+
+static int __attribute__((__always_inline__))
+vec_all_in(vector float __a, vector float __b) {
+  return __builtin_altivec_vcmpbfp_p(__CR6_EQ, __a, __b);
+}
+
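+/* Illustrative sketch: vec_all_in wraps vcmpbfp ("vector compare bounds
+ * float"); it returns nonzero when every lane of __a lies within the bounds
+ * given by __b, i.e. -__b[i] <= __a[i] <= __b[i].
+ *
+ *   vector float x      = vec_splats(0.5f);
+ *   vector float bounds = vec_splats(1.0f);
+ *   int inside = vec_all_in(x, bounds);   // 1: |0.5f| <= 1.0f in every lane
+ */
+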
+/* vec_all_le */
+
+static int __ATTRS_o_ai vec_all_le(vector signed char __a,
+                                   vector signed char __b) {
+  return __builtin_altivec_vcmpgtsb_p(__CR6_EQ, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_all_le(vector signed char __a,
+                                   vector bool char __b) {
+  return __builtin_altivec_vcmpgtsb_p(__CR6_EQ, __a, (vector signed char)__b);
+}
+
+static int __ATTRS_o_ai vec_all_le(vector unsigned char __a,
+                                   vector unsigned char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_all_le(vector unsigned char __a,
+                                   vector bool char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ, __a, (vector unsigned char)__b);
+}
+
+static int __ATTRS_o_ai vec_all_le(vector bool char __a,
+                                   vector signed char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ, (vector unsigned char)__a,
+                                      (vector unsigned char)__b);
+}
+
+static int __ATTRS_o_ai vec_all_le(vector bool char __a,
+                                   vector unsigned char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ, (vector unsigned char)__a, __b);
+}
+
+static int __ATTRS_o_ai vec_all_le(vector bool char __a, vector bool char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ, (vector unsigned char)__a,
+                                      (vector unsigned char)__b);
+}
+
+static int __ATTRS_o_ai vec_all_le(vector short __a, vector short __b) {
+  return __builtin_altivec_vcmpgtsh_p(__CR6_EQ, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_all_le(vector short __a, vector bool short __b) {
+  return __builtin_altivec_vcmpgtsh_p(__CR6_EQ, __a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai vec_all_le(vector unsigned short __a,
+                                   vector unsigned short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_all_le(vector unsigned short __a,
+                                   vector bool short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ, __a,
+                                      (vector unsigned short)__b);
+}
+
+static int __ATTRS_o_ai vec_all_le(vector bool short __a, vector short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ, (vector unsigned short)__a,
+                                      (vector unsigned short)__b);
+}
+
+static int __ATTRS_o_ai vec_all_le(vector bool short __a,
+                                   vector unsigned short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ, (vector unsigned short)__a,
+                                      __b);
+}
+
+static int __ATTRS_o_ai vec_all_le(vector bool short __a,
+                                   vector bool short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ, (vector unsigned short)__a,
+                                      (vector unsigned short)__b);
+}
+
+static int __ATTRS_o_ai vec_all_le(vector int __a, vector int __b) {
+  return __builtin_altivec_vcmpgtsw_p(__CR6_EQ, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_all_le(vector int __a, vector bool int __b) {
+  return __builtin_altivec_vcmpgtsw_p(__CR6_EQ, __a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai vec_all_le(vector unsigned int __a,
+                                   vector unsigned int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_all_le(vector unsigned int __a,
+                                   vector bool int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ, __a, (vector unsigned int)__b);
+}
+
+static int __ATTRS_o_ai vec_all_le(vector bool int __a, vector int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ, (vector unsigned int)__a,
+                                      (vector unsigned int)__b);
+}
+
+static int __ATTRS_o_ai vec_all_le(vector bool int __a,
+                                   vector unsigned int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ, (vector unsigned int)__a, __b);
+}
+
+static int __ATTRS_o_ai vec_all_le(vector bool int __a, vector bool int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ, (vector unsigned int)__a,
+                                      (vector unsigned int)__b);
+}
+
+#ifdef __POWER8_VECTOR__
+static int __ATTRS_o_ai vec_all_le(vector signed long long __a,
+                                   vector signed long long __b) {
+  return __builtin_altivec_vcmpgtsd_p(__CR6_EQ, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_all_le(vector unsigned long long __a,
+                                   vector unsigned long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_all_le(vector signed long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpgtsd_p(__CR6_EQ, __a,
+                                      (vector signed long long)__b);
+}
+
+static int __ATTRS_o_ai vec_all_le(vector unsigned long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ, __a,
+                                      (vector unsigned long long)__b);
+}
+
+static int __ATTRS_o_ai vec_all_le(vector bool long long __a,
+                                   vector signed long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ, (vector unsigned long long)__a,
+                                      (vector unsigned long long)__b);
+}
+
+static int __ATTRS_o_ai vec_all_le(vector bool long long __a,
+                                   vector unsigned long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ, (vector unsigned long long)__a,
+                                      __b);
+}
+
+static int __ATTRS_o_ai vec_all_le(vector bool long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ, (vector unsigned long long)__a,
+                                      (vector unsigned long long)__b);
+}
+#endif
+
+static int __ATTRS_o_ai vec_all_le(vector float __a, vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpgesp_p(__CR6_LT, __b, __a);
+#else
+  return __builtin_altivec_vcmpgefp_p(__CR6_LT, __b, __a);
+#endif
+}
+
+#ifdef __VSX__
+static int __ATTRS_o_ai vec_all_le(vector double __a, vector double __b) {
+  return __builtin_vsx_xvcmpgedp_p(__CR6_LT, __b, __a);
+}
+#endif
+
+/* vec_all_lt */
+
+static int __ATTRS_o_ai vec_all_lt(vector signed char __a,
+                                   vector signed char __b) {
+  return __builtin_altivec_vcmpgtsb_p(__CR6_LT, __b, __a);
+}
+
+static int __ATTRS_o_ai vec_all_lt(vector signed char __a,
+                                   vector bool char __b) {
+  return __builtin_altivec_vcmpgtsb_p(__CR6_LT, (vector signed char)__b, __a);
+}
+
+static int __ATTRS_o_ai vec_all_lt(vector unsigned char __a,
+                                   vector unsigned char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT, __b, __a);
+}
+
+static int __ATTRS_o_ai vec_all_lt(vector unsigned char __a,
+                                   vector bool char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT, (vector unsigned char)__b, __a);
+}
+
+static int __ATTRS_o_ai vec_all_lt(vector bool char __a,
+                                   vector signed char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT, (vector unsigned char)__b,
+                                      (vector unsigned char)__a);
+}
+
+static int __ATTRS_o_ai vec_all_lt(vector bool char __a,
+                                   vector unsigned char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT, __b, (vector unsigned char)__a);
+}
+
+static int __ATTRS_o_ai vec_all_lt(vector bool char __a, vector bool char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT, (vector unsigned char)__b,
+                                      (vector unsigned char)__a);
+}
+
+static int __ATTRS_o_ai vec_all_lt(vector short __a, vector short __b) {
+  return __builtin_altivec_vcmpgtsh_p(__CR6_LT, __b, __a);
+}
+
+static int __ATTRS_o_ai vec_all_lt(vector short __a, vector bool short __b) {
+  return __builtin_altivec_vcmpgtsh_p(__CR6_LT, (vector short)__b, __a);
+}
+
+static int __ATTRS_o_ai vec_all_lt(vector unsigned short __a,
+                                   vector unsigned short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT, __b, __a);
+}
+
+static int __ATTRS_o_ai vec_all_lt(vector unsigned short __a,
+                                   vector bool short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT, (vector unsigned short)__b,
+                                      __a);
+}
+
+static int __ATTRS_o_ai vec_all_lt(vector bool short __a, vector short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT, (vector unsigned short)__b,
+                                      (vector unsigned short)__a);
+}
+
+static int __ATTRS_o_ai vec_all_lt(vector bool short __a,
+                                   vector unsigned short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT, __b,
+                                      (vector unsigned short)__a);
+}
+
+static int __ATTRS_o_ai vec_all_lt(vector bool short __a,
+                                   vector bool short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT, (vector unsigned short)__b,
+                                      (vector unsigned short)__a);
+}
+
+static int __ATTRS_o_ai vec_all_lt(vector int __a, vector int __b) {
+  return __builtin_altivec_vcmpgtsw_p(__CR6_LT, __b, __a);
+}
+
+static int __ATTRS_o_ai vec_all_lt(vector int __a, vector bool int __b) {
+  return __builtin_altivec_vcmpgtsw_p(__CR6_LT, (vector int)__b, __a);
+}
+
+static int __ATTRS_o_ai vec_all_lt(vector unsigned int __a,
+                                   vector unsigned int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT, __b, __a);
+}
+
+static int __ATTRS_o_ai vec_all_lt(vector unsigned int __a,
+                                   vector bool int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT, (vector unsigned int)__b, __a);
+}
+
+static int __ATTRS_o_ai vec_all_lt(vector bool int __a, vector int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT, (vector unsigned int)__b,
+                                      (vector unsigned int)__a);
+}
+
+static int __ATTRS_o_ai vec_all_lt(vector bool int __a,
+                                   vector unsigned int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT, __b, (vector unsigned int)__a);
+}
+
+static int __ATTRS_o_ai vec_all_lt(vector bool int __a, vector bool int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT, (vector unsigned int)__b,
+                                      (vector unsigned int)__a);
+}
+
+#ifdef __POWER8_VECTOR__
+static int __ATTRS_o_ai vec_all_lt(vector signed long long __a,
+                                   vector signed long long __b) {
+  return __builtin_altivec_vcmpgtsd_p(__CR6_LT, __b, __a);
+}
+
+static int __ATTRS_o_ai vec_all_lt(vector unsigned long long __a,
+                                   vector unsigned long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT, __b, __a);
+}
+
+static int __ATTRS_o_ai vec_all_lt(vector signed long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpgtsd_p(__CR6_LT, (vector signed long long)__b,
+                                      __a);
+}
+
+static int __ATTRS_o_ai vec_all_lt(vector unsigned long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT, (vector unsigned long long)__b,
+                                      __a);
+}
+
+static int __ATTRS_o_ai vec_all_lt(vector bool long long __a,
+                                   vector signed long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT, (vector unsigned long long)__b,
+                                      (vector unsigned long long)__a);
+}
+
+static int __ATTRS_o_ai vec_all_lt(vector bool long long __a,
+                                   vector unsigned long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT, __b,
+                                      (vector unsigned long long)__a);
+}
+
+static int __ATTRS_o_ai vec_all_lt(vector bool long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT, (vector unsigned long long)__b,
+                                      (vector unsigned long long)__a);
+}
+#endif
+
+static int __ATTRS_o_ai vec_all_lt(vector float __a, vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpgtsp_p(__CR6_LT, __b, __a);
+#else
+  return __builtin_altivec_vcmpgtfp_p(__CR6_LT, __b, __a);
+#endif
+}
+
+#ifdef __VSX__
+static int __ATTRS_o_ai vec_all_lt(vector double __a, vector double __b) {
+  return __builtin_vsx_xvcmpgtdp_p(__CR6_LT, __b, __a);
+}
+#endif
+
+/* vec_all_nan */
+
+static int __ATTRS_o_ai vec_all_nan(vector float __a) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpeqsp_p(__CR6_EQ, __a, __a);
+#else
+  return __builtin_altivec_vcmpeqfp_p(__CR6_EQ, __a, __a);
+#endif
+}
+
+#ifdef __VSX__
+static int __ATTRS_o_ai vec_all_nan(vector double __a) {
+  return __builtin_vsx_xvcmpeqdp_p(__CR6_EQ, __a, __a);
+}
+#endif
+
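+/* Illustrative sketch: vec_all_nan compares __a against itself with
+ * __CR6_EQ; a lane compares equal to itself unless it holds NaN, so the
+ * predicate is true only when every lane is NaN (vec_all_numeric further
+ * down is the complementary check).
+ *
+ *   vector float nans = vec_splats(__builtin_nanf(""));
+ *   int all_nan = vec_all_nan(nans);   // 1: every lane is NaN
+ */
+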
+/* vec_all_ne */
+
+static int __ATTRS_o_ai vec_all_ne(vector signed char __a,
+                                   vector signed char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static int __ATTRS_o_ai vec_all_ne(vector signed char __a,
+                                   vector bool char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static int __ATTRS_o_ai vec_all_ne(vector unsigned char __a,
+                                   vector unsigned char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static int __ATTRS_o_ai vec_all_ne(vector unsigned char __a,
+                                   vector bool char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static int __ATTRS_o_ai vec_all_ne(vector bool char __a,
+                                   vector signed char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static int __ATTRS_o_ai vec_all_ne(vector bool char __a,
+                                   vector unsigned char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static int __ATTRS_o_ai vec_all_ne(vector bool char __a, vector bool char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static int __ATTRS_o_ai vec_all_ne(vector short __a, vector short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_all_ne(vector short __a, vector bool short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ, __a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai vec_all_ne(vector unsigned short __a,
+                                   vector unsigned short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai vec_all_ne(vector unsigned short __a,
+                                   vector bool short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai vec_all_ne(vector bool short __a, vector short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai vec_all_ne(vector bool short __a,
+                                   vector unsigned short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai vec_all_ne(vector bool short __a,
+                                   vector bool short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai vec_all_ne(vector pixel __a, vector pixel __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai vec_all_ne(vector int __a, vector int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_all_ne(vector int __a, vector bool int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ, __a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai vec_all_ne(vector unsigned int __a,
+                                   vector unsigned int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ, (vector int)__a,
+                                      (vector int)__b);
+}
+
+static int __ATTRS_o_ai vec_all_ne(vector unsigned int __a,
+                                   vector bool int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ, (vector int)__a,
+                                      (vector int)__b);
+}
+
+static int __ATTRS_o_ai vec_all_ne(vector bool int __a, vector int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ, (vector int)__a,
+                                      (vector int)__b);
+}
+
+static int __ATTRS_o_ai vec_all_ne(vector bool int __a,
+                                   vector unsigned int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ, (vector int)__a,
+                                      (vector int)__b);
+}
+
+static int __ATTRS_o_ai vec_all_ne(vector bool int __a, vector bool int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ, (vector int)__a,
+                                      (vector int)__b);
+}
+
+#ifdef __POWER8_VECTOR__
+static int __ATTRS_o_ai vec_all_ne(vector signed long long __a,
+                                   vector signed long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_EQ, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_all_ne(vector unsigned long long __a,
+                                   vector unsigned long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_EQ, (vector long long)__a,
+                                      (vector long long)__b);
+}
+
+static int __ATTRS_o_ai vec_all_ne(vector signed long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_EQ, __a,
+                                      (vector signed long long)__b);
+}
+
+static int __ATTRS_o_ai vec_all_ne(vector unsigned long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_EQ, (vector signed long long)__a,
+                                      (vector signed long long)__b);
+}
+
+static int __ATTRS_o_ai vec_all_ne(vector bool long long __a,
+                                   vector signed long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_EQ, (vector signed long long)__a,
+                                      (vector signed long long)__b);
+}
+
+static int __ATTRS_o_ai vec_all_ne(vector bool long long __a,
+                                   vector unsigned long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_EQ, (vector signed long long)__a,
+                                      (vector signed long long)__b);
+}
+
+static int __ATTRS_o_ai vec_all_ne(vector bool long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_EQ, (vector signed long long)__a,
+                                      (vector signed long long)__b);
+}
+#endif
+
+static int __ATTRS_o_ai vec_all_ne(vector float __a, vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpeqsp_p(__CR6_EQ, __a, __b);
+#else
+  return __builtin_altivec_vcmpeqfp_p(__CR6_EQ, __a, __b);
+#endif
+}
+
+#ifdef __VSX__
+static int __ATTRS_o_ai vec_all_ne(vector double __a, vector double __b) {
+  return __builtin_vsx_xvcmpeqdp_p(__CR6_EQ, __a, __b);
+}
+#endif
+
+/* vec_all_nge */
+
+static int __ATTRS_o_ai
+vec_all_nge(vector float __a, vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpgesp_p(__CR6_EQ, __a, __b);
+#else
+  return __builtin_altivec_vcmpgefp_p(__CR6_EQ, __a, __b);
+#endif
+}
+
+#ifdef __VSX__
+static int __ATTRS_o_ai
+vec_all_nge(vector double __a, vector double __b) {
+  return __builtin_vsx_xvcmpgedp_p(__CR6_EQ, __a, __b);
+}
+#endif
+
+/* vec_all_ngt */
+
+static int __ATTRS_o_ai
+vec_all_ngt(vector float __a, vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpgtsp_p(__CR6_EQ, __a, __b);
+#else
+  return __builtin_altivec_vcmpgtfp_p(__CR6_EQ, __a, __b);
+#endif
+}
+
+#ifdef __VSX__
+static int __ATTRS_o_ai
+vec_all_ngt(vector double __a, vector double __b) {
+  return __builtin_vsx_xvcmpgtdp_p(__CR6_EQ, __a, __b);
+}
+#endif
+
+/* vec_all_nle */
+
+static int __attribute__((__always_inline__))
+vec_all_nle(vector float __a, vector float __b) {
+  return __builtin_altivec_vcmpgefp_p(__CR6_EQ, __b, __a);
+}
+
+/* vec_all_nlt */
+
+static int __attribute__((__always_inline__))
+vec_all_nlt(vector float __a, vector float __b) {
+  return __builtin_altivec_vcmpgtfp_p(__CR6_EQ, __b, __a);
+}
+
+/* vec_all_numeric */
+
+static int __attribute__((__always_inline__))
+vec_all_numeric(vector float __a) {
+  return __builtin_altivec_vcmpeqfp_p(__CR6_LT, __a, __a);
+}
+
+/* vec_any_eq */
+
+static int __ATTRS_o_ai vec_any_eq(vector signed char __a,
+                                   vector signed char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ_REV, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static int __ATTRS_o_ai vec_any_eq(vector signed char __a,
+                                   vector bool char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ_REV, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static int __ATTRS_o_ai vec_any_eq(vector unsigned char __a,
+                                   vector unsigned char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ_REV, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static int __ATTRS_o_ai vec_any_eq(vector unsigned char __a,
+                                   vector bool char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ_REV, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static int __ATTRS_o_ai vec_any_eq(vector bool char __a,
+                                   vector signed char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ_REV, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static int __ATTRS_o_ai vec_any_eq(vector bool char __a,
+                                   vector unsigned char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ_REV, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static int __ATTRS_o_ai vec_any_eq(vector bool char __a, vector bool char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ_REV, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static int __ATTRS_o_ai vec_any_eq(vector short __a, vector short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_any_eq(vector short __a, vector bool short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ_REV, __a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai vec_any_eq(vector unsigned short __a,
+                                   vector unsigned short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ_REV, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai vec_any_eq(vector unsigned short __a,
+                                   vector bool short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ_REV, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai vec_any_eq(vector bool short __a, vector short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ_REV, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai vec_any_eq(vector bool short __a,
+                                   vector unsigned short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ_REV, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai vec_any_eq(vector bool short __a,
+                                   vector bool short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ_REV, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai vec_any_eq(vector pixel __a, vector pixel __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ_REV, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai vec_any_eq(vector int __a, vector int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_any_eq(vector int __a, vector bool int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ_REV, __a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai vec_any_eq(vector unsigned int __a,
+                                   vector unsigned int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ_REV, (vector int)__a,
+                                      (vector int)__b);
+}
+
+static int __ATTRS_o_ai vec_any_eq(vector unsigned int __a,
+                                   vector bool int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ_REV, (vector int)__a,
+                                      (vector int)__b);
+}
+
+static int __ATTRS_o_ai vec_any_eq(vector bool int __a, vector int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ_REV, (vector int)__a,
+                                      (vector int)__b);
+}
+
+static int __ATTRS_o_ai vec_any_eq(vector bool int __a,
+                                   vector unsigned int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ_REV, (vector int)__a,
+                                      (vector int)__b);
+}
+
+static int __ATTRS_o_ai vec_any_eq(vector bool int __a, vector bool int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ_REV, (vector int)__a,
+                                      (vector int)__b);
+}
+
+#ifdef __POWER8_VECTOR__
+static int __ATTRS_o_ai vec_any_eq(vector signed long long __a,
+                                   vector signed long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_EQ_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_any_eq(vector unsigned long long __a,
+                                   vector unsigned long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_EQ_REV, (vector long long)__a,
+                                      (vector long long)__b);
+}
+
+static int __ATTRS_o_ai vec_any_eq(vector signed long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_EQ_REV, __a,
+                                      (vector signed long long)__b);
+}
+
+static int __ATTRS_o_ai vec_any_eq(vector unsigned long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpequd_p(
+      __CR6_EQ_REV, (vector signed long long)__a, (vector signed long long)__b);
+}
+
+static int __ATTRS_o_ai vec_any_eq(vector bool long long __a,
+                                   vector signed long long __b) {
+  return __builtin_altivec_vcmpequd_p(
+      __CR6_EQ_REV, (vector signed long long)__a, (vector signed long long)__b);
+}
+
+static int __ATTRS_o_ai vec_any_eq(vector bool long long __a,
+                                   vector unsigned long long __b) {
+  return __builtin_altivec_vcmpequd_p(
+      __CR6_EQ_REV, (vector signed long long)__a, (vector signed long long)__b);
+}
+
+static int __ATTRS_o_ai vec_any_eq(vector bool long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpequd_p(
+      __CR6_EQ_REV, (vector signed long long)__a, (vector signed long long)__b);
+}
+#endif
+
+static int __ATTRS_o_ai vec_any_eq(vector float __a, vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpeqsp_p(__CR6_EQ_REV, __a, __b);
+#else
+  return __builtin_altivec_vcmpeqfp_p(__CR6_EQ_REV, __a, __b);
+#endif
+}
+
+#ifdef __VSX__
+static int __ATTRS_o_ai vec_any_eq(vector double __a, vector double __b) {
+  return __builtin_vsx_xvcmpeqdp_p(__CR6_EQ_REV, __a, __b);
+}
+#endif
+
+/* vec_any_ge */
+
+static int __ATTRS_o_ai vec_any_ge(vector signed char __a,
+                                   vector signed char __b) {
+  return __builtin_altivec_vcmpgtsb_p(__CR6_LT_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai vec_any_ge(vector signed char __a,
+                                   vector bool char __b) {
+  return __builtin_altivec_vcmpgtsb_p(__CR6_LT_REV, (vector signed char)__b,
+                                      __a);
+}
+
+static int __ATTRS_o_ai vec_any_ge(vector unsigned char __a,
+                                   vector unsigned char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai vec_any_ge(vector unsigned char __a,
+                                   vector bool char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV, (vector unsigned char)__b,
+                                      __a);
+}
+
+static int __ATTRS_o_ai vec_any_ge(vector bool char __a,
+                                   vector signed char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV, (vector unsigned char)__b,
+                                      (vector unsigned char)__a);
+}
+
+static int __ATTRS_o_ai vec_any_ge(vector bool char __a,
+                                   vector unsigned char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV, __b,
+                                      (vector unsigned char)__a);
+}
+
+static int __ATTRS_o_ai vec_any_ge(vector bool char __a, vector bool char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV, (vector unsigned char)__b,
+                                      (vector unsigned char)__a);
+}
+
+static int __ATTRS_o_ai vec_any_ge(vector short __a, vector short __b) {
+  return __builtin_altivec_vcmpgtsh_p(__CR6_LT_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai vec_any_ge(vector short __a, vector bool short __b) {
+  return __builtin_altivec_vcmpgtsh_p(__CR6_LT_REV, (vector short)__b, __a);
+}
+
+static int __ATTRS_o_ai vec_any_ge(vector unsigned short __a,
+                                   vector unsigned short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai vec_any_ge(vector unsigned short __a,
+                                   vector bool short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV, (vector unsigned short)__b,
+                                      __a);
+}
+
+static int __ATTRS_o_ai vec_any_ge(vector bool short __a, vector short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV, (vector unsigned short)__b,
+                                      (vector unsigned short)__a);
+}
+
+static int __ATTRS_o_ai vec_any_ge(vector bool short __a,
+                                   vector unsigned short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV, __b,
+                                      (vector unsigned short)__a);
+}
+
+static int __ATTRS_o_ai vec_any_ge(vector bool short __a,
+                                   vector bool short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV, (vector unsigned short)__b,
+                                      (vector unsigned short)__a);
+}
+
+static int __ATTRS_o_ai vec_any_ge(vector int __a, vector int __b) {
+  return __builtin_altivec_vcmpgtsw_p(__CR6_LT_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai vec_any_ge(vector int __a, vector bool int __b) {
+  return __builtin_altivec_vcmpgtsw_p(__CR6_LT_REV, (vector int)__b, __a);
+}
+
+static int __ATTRS_o_ai vec_any_ge(vector unsigned int __a,
+                                   vector unsigned int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai vec_any_ge(vector unsigned int __a,
+                                   vector bool int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV, (vector unsigned int)__b,
+                                      __a);
+}
+
+static int __ATTRS_o_ai vec_any_ge(vector bool int __a, vector int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV, (vector unsigned int)__b,
+                                      (vector unsigned int)__a);
+}
+
+static int __ATTRS_o_ai vec_any_ge(vector bool int __a,
+                                   vector unsigned int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV, __b,
+                                      (vector unsigned int)__a);
+}
+
+static int __ATTRS_o_ai vec_any_ge(vector bool int __a, vector bool int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV, (vector unsigned int)__b,
+                                      (vector unsigned int)__a);
+}
+
+#ifdef __POWER8_VECTOR__
+static int __ATTRS_o_ai vec_any_ge(vector signed long long __a,
+                                   vector signed long long __b) {
+  return __builtin_altivec_vcmpgtsd_p(__CR6_LT_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai vec_any_ge(vector unsigned long long __a,
+                                   vector unsigned long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai vec_any_ge(vector signed long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpgtsd_p(__CR6_LT_REV,
+                                      (vector signed long long)__b, __a);
+}
+
+static int __ATTRS_o_ai vec_any_ge(vector unsigned long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV,
+                                      (vector unsigned long long)__b, __a);
+}
+
+static int __ATTRS_o_ai vec_any_ge(vector bool long long __a,
+                                   vector signed long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV,
+                                      (vector unsigned long long)__b,
+                                      (vector unsigned long long)__a);
+}
+
+static int __ATTRS_o_ai vec_any_ge(vector bool long long __a,
+                                   vector unsigned long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV, __b,
+                                      (vector unsigned long long)__a);
+}
+
+static int __ATTRS_o_ai vec_any_ge(vector bool long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV,
+                                      (vector unsigned long long)__b,
+                                      (vector unsigned long long)__a);
+}
+#endif
+
+static int __ATTRS_o_ai vec_any_ge(vector float __a, vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpgesp_p(__CR6_EQ_REV, __a, __b);
+#else
+  return __builtin_altivec_vcmpgefp_p(__CR6_EQ_REV, __a, __b);
+#endif
+}
+
+#ifdef __VSX__
+static int __ATTRS_o_ai vec_any_ge(vector double __a, vector double __b) {
+  return __builtin_vsx_xvcmpgedp_p(__CR6_EQ_REV, __a, __b);
+}
+#endif
+
+/* vec_any_gt */
+
+static int __ATTRS_o_ai vec_any_gt(vector signed char __a,
+                                   vector signed char __b) {
+  return __builtin_altivec_vcmpgtsb_p(__CR6_EQ_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_any_gt(vector signed char __a,
+                                   vector bool char __b) {
+  return __builtin_altivec_vcmpgtsb_p(__CR6_EQ_REV, __a,
+                                      (vector signed char)__b);
+}
+
+static int __ATTRS_o_ai vec_any_gt(vector unsigned char __a,
+                                   vector unsigned char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_any_gt(vector unsigned char __a,
+                                   vector bool char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV, __a,
+                                      (vector unsigned char)__b);
+}
+
+static int __ATTRS_o_ai vec_any_gt(vector bool char __a,
+                                   vector signed char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV, (vector unsigned char)__a,
+                                      (vector unsigned char)__b);
+}
+
+static int __ATTRS_o_ai vec_any_gt(vector bool char __a,
+                                   vector unsigned char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV, (vector unsigned char)__a,
+                                      __b);
+}
+
+static int __ATTRS_o_ai vec_any_gt(vector bool char __a, vector bool char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV, (vector unsigned char)__a,
+                                      (vector unsigned char)__b);
+}
+
+static int __ATTRS_o_ai vec_any_gt(vector short __a, vector short __b) {
+  return __builtin_altivec_vcmpgtsh_p(__CR6_EQ_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_any_gt(vector short __a, vector bool short __b) {
+  return __builtin_altivec_vcmpgtsh_p(__CR6_EQ_REV, __a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai vec_any_gt(vector unsigned short __a,
+                                   vector unsigned short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_any_gt(vector unsigned short __a,
+                                   vector bool short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV, __a,
+                                      (vector unsigned short)__b);
+}
+
+static int __ATTRS_o_ai vec_any_gt(vector bool short __a, vector short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV, (vector unsigned short)__a,
+                                      (vector unsigned short)__b);
+}
+
+static int __ATTRS_o_ai vec_any_gt(vector bool short __a,
+                                   vector unsigned short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV, (vector unsigned short)__a,
+                                      __b);
+}
+
+static int __ATTRS_o_ai vec_any_gt(vector bool short __a,
+                                   vector bool short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV, (vector unsigned short)__a,
+                                      (vector unsigned short)__b);
+}
+
+static int __ATTRS_o_ai vec_any_gt(vector int __a, vector int __b) {
+  return __builtin_altivec_vcmpgtsw_p(__CR6_EQ_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_any_gt(vector int __a, vector bool int __b) {
+  return __builtin_altivec_vcmpgtsw_p(__CR6_EQ_REV, __a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai vec_any_gt(vector unsigned int __a,
+                                   vector unsigned int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_any_gt(vector unsigned int __a,
+                                   vector bool int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV, __a,
+                                      (vector unsigned int)__b);
+}
+
+static int __ATTRS_o_ai vec_any_gt(vector bool int __a, vector int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV, (vector unsigned int)__a,
+                                      (vector unsigned int)__b);
+}
+
+static int __ATTRS_o_ai vec_any_gt(vector bool int __a,
+                                   vector unsigned int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV, (vector unsigned int)__a,
+                                      __b);
+}
+
+static int __ATTRS_o_ai vec_any_gt(vector bool int __a, vector bool int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV, (vector unsigned int)__a,
+                                      (vector unsigned int)__b);
+}
+
+#ifdef __POWER8_VECTOR__
+static int __ATTRS_o_ai vec_any_gt(vector signed long long __a,
+                                   vector signed long long __b) {
+  return __builtin_altivec_vcmpgtsd_p(__CR6_EQ_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_any_gt(vector unsigned long long __a,
+                                   vector unsigned long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_any_gt(vector signed long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpgtsd_p(__CR6_EQ_REV, __a,
+                                      (vector signed long long)__b);
+}
+
+static int __ATTRS_o_ai vec_any_gt(vector unsigned long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV, __a,
+                                      (vector unsigned long long)__b);
+}
+
+static int __ATTRS_o_ai vec_any_gt(vector bool long long __a,
+                                   vector signed long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV,
+                                      (vector unsigned long long)__a,
+                                      (vector unsigned long long)__b);
+}
+
+static int __ATTRS_o_ai vec_any_gt(vector bool long long __a,
+                                   vector unsigned long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV,
+                                      (vector unsigned long long)__a, __b);
+}
+
+static int __ATTRS_o_ai vec_any_gt(vector bool long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV,
+                                      (vector unsigned long long)__a,
+                                      (vector unsigned long long)__b);
+}
+#endif
+
+static int __ATTRS_o_ai vec_any_gt(vector float __a, vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpgtsp_p(__CR6_EQ_REV, __a, __b);
+#else
+  return __builtin_altivec_vcmpgtfp_p(__CR6_EQ_REV, __a, __b);
+#endif
+}
+
+#ifdef __VSX__
+static int __ATTRS_o_ai vec_any_gt(vector double __a, vector double __b) {
+  return __builtin_vsx_xvcmpgtdp_p(__CR6_EQ_REV, __a, __b);
+}
+#endif
+
+/* vec_any_le */
+
+static int __ATTRS_o_ai vec_any_le(vector signed char __a,
+                                   vector signed char __b) {
+  return __builtin_altivec_vcmpgtsb_p(__CR6_LT_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_any_le(vector signed char __a,
+                                   vector bool char __b) {
+  return __builtin_altivec_vcmpgtsb_p(__CR6_LT_REV, __a,
+                                      (vector signed char)__b);
+}
+
+static int __ATTRS_o_ai vec_any_le(vector unsigned char __a,
+                                   vector unsigned char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_any_le(vector unsigned char __a,
+                                   vector bool char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV, __a,
+                                      (vector unsigned char)__b);
+}
+
+static int __ATTRS_o_ai vec_any_le(vector bool char __a,
+                                   vector signed char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV, (vector unsigned char)__a,
+                                      (vector unsigned char)__b);
+}
+
+static int __ATTRS_o_ai vec_any_le(vector bool char __a,
+                                   vector unsigned char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV, (vector unsigned char)__a,
+                                      __b);
+}
+
+static int __ATTRS_o_ai vec_any_le(vector bool char __a, vector bool char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV, (vector unsigned char)__a,
+                                      (vector unsigned char)__b);
+}
+
+static int __ATTRS_o_ai vec_any_le(vector short __a, vector short __b) {
+  return __builtin_altivec_vcmpgtsh_p(__CR6_LT_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_any_le(vector short __a, vector bool short __b) {
+  return __builtin_altivec_vcmpgtsh_p(__CR6_LT_REV, __a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai vec_any_le(vector unsigned short __a,
+                                   vector unsigned short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_any_le(vector unsigned short __a,
+                                   vector bool short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV, __a,
+                                      (vector unsigned short)__b);
+}
+
+static int __ATTRS_o_ai vec_any_le(vector bool short __a, vector short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV, (vector unsigned short)__a,
+                                      (vector unsigned short)__b);
+}
+
+static int __ATTRS_o_ai vec_any_le(vector bool short __a,
+                                   vector unsigned short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV, (vector unsigned short)__a,
+                                      __b);
+}
+
+static int __ATTRS_o_ai vec_any_le(vector bool short __a,
+                                   vector bool short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV, (vector unsigned short)__a,
+                                      (vector unsigned short)__b);
+}
+
+static int __ATTRS_o_ai vec_any_le(vector int __a, vector int __b) {
+  return __builtin_altivec_vcmpgtsw_p(__CR6_LT_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_any_le(vector int __a, vector bool int __b) {
+  return __builtin_altivec_vcmpgtsw_p(__CR6_LT_REV, __a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai vec_any_le(vector unsigned int __a,
+                                   vector unsigned int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_any_le(vector unsigned int __a,
+                                   vector bool int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV, __a,
+                                      (vector unsigned int)__b);
+}
+
+static int __ATTRS_o_ai vec_any_le(vector bool int __a, vector int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV, (vector unsigned int)__a,
+                                      (vector unsigned int)__b);
+}
+
+static int __ATTRS_o_ai vec_any_le(vector bool int __a,
+                                   vector unsigned int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV, (vector unsigned int)__a,
+                                      __b);
+}
+
+static int __ATTRS_o_ai vec_any_le(vector bool int __a, vector bool int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV, (vector unsigned int)__a,
+                                      (vector unsigned int)__b);
+}
+
+#ifdef __POWER8_VECTOR__
+static int __ATTRS_o_ai vec_any_le(vector signed long long __a,
+                                   vector signed long long __b) {
+  return __builtin_altivec_vcmpgtsd_p(__CR6_LT_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_any_le(vector unsigned long long __a,
+                                   vector unsigned long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_any_le(vector signed long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpgtsd_p(__CR6_LT_REV, __a,
+                                      (vector signed long long)__b);
+}
+
+static int __ATTRS_o_ai vec_any_le(vector unsigned long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV, __a,
+                                      (vector unsigned long long)__b);
+}
+
+static int __ATTRS_o_ai vec_any_le(vector bool long long __a,
+                                   vector signed long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV,
+                                      (vector unsigned long long)__a,
+                                      (vector unsigned long long)__b);
+}
+
+static int __ATTRS_o_ai vec_any_le(vector bool long long __a,
+                                   vector unsigned long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV,
+                                      (vector unsigned long long)__a, __b);
+}
+
+static int __ATTRS_o_ai vec_any_le(vector bool long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV,
+                                      (vector unsigned long long)__a,
+                                      (vector unsigned long long)__b);
+}
+#endif
+
+static int __ATTRS_o_ai vec_any_le(vector float __a, vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpgesp_p(__CR6_EQ_REV, __b, __a);
+#else
+  return __builtin_altivec_vcmpgefp_p(__CR6_EQ_REV, __b, __a);
+#endif
+}
+
+#ifdef __VSX__
+static int __ATTRS_o_ai vec_any_le(vector double __a, vector double __b) {
+  return __builtin_vsx_xvcmpgedp_p(__CR6_EQ_REV, __b, __a);
+}
+#endif
+
+/* vec_any_lt */
+
+static int __ATTRS_o_ai vec_any_lt(vector signed char __a,
+                                   vector signed char __b) {
+  return __builtin_altivec_vcmpgtsb_p(__CR6_EQ_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai vec_any_lt(vector signed char __a,
+                                   vector bool char __b) {
+  return __builtin_altivec_vcmpgtsb_p(__CR6_EQ_REV, (vector signed char)__b,
+                                      __a);
+}
+
+static int __ATTRS_o_ai vec_any_lt(vector unsigned char __a,
+                                   vector unsigned char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai vec_any_lt(vector unsigned char __a,
+                                   vector bool char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV, (vector unsigned char)__b,
+                                      __a);
+}
+
+static int __ATTRS_o_ai vec_any_lt(vector bool char __a,
+                                   vector signed char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV, (vector unsigned char)__b,
+                                      (vector unsigned char)__a);
+}
+
+static int __ATTRS_o_ai vec_any_lt(vector bool char __a,
+                                   vector unsigned char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV, __b,
+                                      (vector unsigned char)__a);
+}
+
+static int __ATTRS_o_ai vec_any_lt(vector bool char __a, vector bool char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV, (vector unsigned char)__b,
+                                      (vector unsigned char)__a);
+}
+
+static int __ATTRS_o_ai vec_any_lt(vector short __a, vector short __b) {
+  return __builtin_altivec_vcmpgtsh_p(__CR6_EQ_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai vec_any_lt(vector short __a, vector bool short __b) {
+  return __builtin_altivec_vcmpgtsh_p(__CR6_EQ_REV, (vector short)__b, __a);
+}
+
+static int __ATTRS_o_ai vec_any_lt(vector unsigned short __a,
+                                   vector unsigned short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai vec_any_lt(vector unsigned short __a,
+                                   vector bool short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV, (vector unsigned short)__b,
+                                      __a);
+}
+
+static int __ATTRS_o_ai vec_any_lt(vector bool short __a, vector short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV, (vector unsigned short)__b,
+                                      (vector unsigned short)__a);
+}
+
+static int __ATTRS_o_ai vec_any_lt(vector bool short __a,
+                                   vector unsigned short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV, __b,
+                                      (vector unsigned short)__a);
+}
+
+static int __ATTRS_o_ai vec_any_lt(vector bool short __a,
+                                   vector bool short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV, (vector unsigned short)__b,
+                                      (vector unsigned short)__a);
+}
+
+static int __ATTRS_o_ai vec_any_lt(vector int __a, vector int __b) {
+  return __builtin_altivec_vcmpgtsw_p(__CR6_EQ_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai vec_any_lt(vector int __a, vector bool int __b) {
+  return __builtin_altivec_vcmpgtsw_p(__CR6_EQ_REV, (vector int)__b, __a);
+}
+
+static int __ATTRS_o_ai vec_any_lt(vector unsigned int __a,
+                                   vector unsigned int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai vec_any_lt(vector unsigned int __a,
+                                   vector bool int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV, (vector unsigned int)__b,
+                                      __a);
+}
+
+static int __ATTRS_o_ai vec_any_lt(vector bool int __a, vector int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV, (vector unsigned int)__b,
+                                      (vector unsigned int)__a);
+}
+
+static int __ATTRS_o_ai vec_any_lt(vector bool int __a,
+                                   vector unsigned int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV, __b,
+                                      (vector unsigned int)__a);
+}
+
+static int __ATTRS_o_ai vec_any_lt(vector bool int __a, vector bool int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV, (vector unsigned int)__b,
+                                      (vector unsigned int)__a);
+}
+
+#ifdef __POWER8_VECTOR__
+static int __ATTRS_o_ai vec_any_lt(vector signed long long __a,
+                                   vector signed long long __b) {
+  return __builtin_altivec_vcmpgtsd_p(__CR6_EQ_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai vec_any_lt(vector unsigned long long __a,
+                                   vector unsigned long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai vec_any_lt(vector signed long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpgtsd_p(__CR6_EQ_REV,
+                                      (vector signed long long)__b, __a);
+}
+
+static int __ATTRS_o_ai vec_any_lt(vector unsigned long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV,
+                                      (vector unsigned long long)__b, __a);
+}
+
+static int __ATTRS_o_ai vec_any_lt(vector bool long long __a,
+                                   vector signed long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV,
+                                      (vector unsigned long long)__b,
+                                      (vector unsigned long long)__a);
+}
+
+static int __ATTRS_o_ai vec_any_lt(vector bool long long __a,
+                                   vector unsigned long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV, __b,
+                                      (vector unsigned long long)__a);
+}
+
+static int __ATTRS_o_ai vec_any_lt(vector bool long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV,
+                                      (vector unsigned long long)__b,
+                                      (vector unsigned long long)__a);
+}
+#endif
+
+static int __ATTRS_o_ai vec_any_lt(vector float __a, vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpgtsp_p(__CR6_EQ_REV, __b, __a);
+#else
+  return __builtin_altivec_vcmpgtfp_p(__CR6_EQ_REV, __b, __a);
+#endif
+}
+
+#ifdef __VSX__
+static int __ATTRS_o_ai vec_any_lt(vector double __a, vector double __b) {
+  return __builtin_vsx_xvcmpgtdp_p(__CR6_EQ_REV, __b, __a);
+}
+#endif
+
+/* vec_any_nan */
+
+static int __attribute__((__always_inline__)) vec_any_nan(vector float __a) {
+  return __builtin_altivec_vcmpeqfp_p(__CR6_LT_REV, __a, __a);
+}
+
+/* vec_any_ne */
+
+static int __ATTRS_o_ai vec_any_ne(vector signed char __a,
+                                   vector signed char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_LT_REV, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static int __ATTRS_o_ai vec_any_ne(vector signed char __a,
+                                   vector bool char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_LT_REV, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static int __ATTRS_o_ai vec_any_ne(vector unsigned char __a,
+                                   vector unsigned char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_LT_REV, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static int __ATTRS_o_ai vec_any_ne(vector unsigned char __a,
+                                   vector bool char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_LT_REV, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static int __ATTRS_o_ai vec_any_ne(vector bool char __a,
+                                   vector signed char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_LT_REV, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static int __ATTRS_o_ai vec_any_ne(vector bool char __a,
+                                   vector unsigned char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_LT_REV, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static int __ATTRS_o_ai vec_any_ne(vector bool char __a, vector bool char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_LT_REV, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static int __ATTRS_o_ai vec_any_ne(vector short __a, vector short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_LT_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_any_ne(vector short __a, vector bool short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_LT_REV, __a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai vec_any_ne(vector unsigned short __a,
+                                   vector unsigned short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_LT_REV, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai vec_any_ne(vector unsigned short __a,
+                                   vector bool short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_LT_REV, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai vec_any_ne(vector bool short __a, vector short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_LT_REV, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai vec_any_ne(vector bool short __a,
+                                   vector unsigned short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_LT_REV, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai vec_any_ne(vector bool short __a,
+                                   vector bool short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_LT_REV, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai vec_any_ne(vector pixel __a, vector pixel __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_LT_REV, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai vec_any_ne(vector int __a, vector int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_LT_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_any_ne(vector int __a, vector bool int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_LT_REV, __a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai vec_any_ne(vector unsigned int __a,
+                                   vector unsigned int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_LT_REV, (vector int)__a,
+                                      (vector int)__b);
+}
+
+static int __ATTRS_o_ai vec_any_ne(vector unsigned int __a,
+                                   vector bool int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_LT_REV, (vector int)__a,
+                                      (vector int)__b);
+}
+
+static int __ATTRS_o_ai vec_any_ne(vector bool int __a, vector int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_LT_REV, (vector int)__a,
+                                      (vector int)__b);
+}
+
+static int __ATTRS_o_ai vec_any_ne(vector bool int __a,
+                                   vector unsigned int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_LT_REV, (vector int)__a,
+                                      (vector int)__b);
+}
+
+static int __ATTRS_o_ai vec_any_ne(vector bool int __a, vector bool int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_LT_REV, (vector int)__a,
+                                      (vector int)__b);
+}
+
+#ifdef __POWER8_VECTOR__
+static int __ATTRS_o_ai vec_any_ne(vector signed long long __a,
+                                   vector signed long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_LT_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_any_ne(vector unsigned long long __a,
+                                   vector unsigned long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_LT_REV, (vector long long)__a,
+                                      (vector long long)__b);
+}
+
+static int __ATTRS_o_ai vec_any_ne(vector signed long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_LT_REV, __a,
+                                      (vector signed long long)__b);
+}
+
+static int __ATTRS_o_ai vec_any_ne(vector unsigned long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpequd_p(
+      __CR6_LT_REV, (vector signed long long)__a, (vector signed long long)__b);
+}
+
+static int __ATTRS_o_ai vec_any_ne(vector bool long long __a,
+                                   vector signed long long __b) {
+  return __builtin_altivec_vcmpequd_p(
+      __CR6_LT_REV, (vector signed long long)__a, (vector signed long long)__b);
+}
+
+static int __ATTRS_o_ai vec_any_ne(vector bool long long __a,
+                                   vector unsigned long long __b) {
+  return __builtin_altivec_vcmpequd_p(
+      __CR6_LT_REV, (vector signed long long)__a, (vector signed long long)__b);
+}
+
+static int __ATTRS_o_ai vec_any_ne(vector bool long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpequd_p(
+      __CR6_LT_REV, (vector signed long long)__a, (vector signed long long)__b);
+}
+#endif
+
+static int __ATTRS_o_ai vec_any_ne(vector float __a, vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpeqsp_p(__CR6_LT_REV, __a, __b);
+#else
+  return __builtin_altivec_vcmpeqfp_p(__CR6_LT_REV, __a, __b);
+#endif
+}
+
+#ifdef __VSX__
+static int __ATTRS_o_ai vec_any_ne(vector double __a, vector double __b) {
+  return __builtin_vsx_xvcmpeqdp_p(__CR6_LT_REV, __a, __b);
+}
+#endif
+
+/* vec_any_nge */
+
+static int __attribute__((__always_inline__))
+vec_any_nge(vector float __a, vector float __b) {
+  return __builtin_altivec_vcmpgefp_p(__CR6_LT_REV, __a, __b);
+}
+
+/* vec_any_ngt */
+
+static int __attribute__((__always_inline__))
+vec_any_ngt(vector float __a, vector float __b) {
+  return __builtin_altivec_vcmpgtfp_p(__CR6_LT_REV, __a, __b);
+}
+
+/* vec_any_nle */
+
+static int __attribute__((__always_inline__))
+vec_any_nle(vector float __a, vector float __b) {
+  return __builtin_altivec_vcmpgefp_p(__CR6_LT_REV, __b, __a);
+}
+
+/* vec_any_nlt */
+
+static int __attribute__((__always_inline__))
+vec_any_nlt(vector float __a, vector float __b) {
+  return __builtin_altivec_vcmpgtfp_p(__CR6_LT_REV, __b, __a);
+}
+
+/* vec_any_numeric */
+
+static int __attribute__((__always_inline__))
+vec_any_numeric(vector float __a) {
+  return __builtin_altivec_vcmpeqfp_p(__CR6_EQ_REV, __a, __a);
+}
+
+/* vec_any_out */
+
+static int __attribute__((__always_inline__))
+vec_any_out(vector float __a, vector float __b) {
+  return __builtin_altivec_vcmpbfp_p(__CR6_EQ_REV, __a, __b);
+}
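+
+/* Illustrative usage sketch (not taken from the original header; vec_all_ge is
+ * declared earlier in this file): the vec_all_* / vec_any_* predicates above
+ * reduce a full vector comparison to a single int, so they can drive ordinary
+ * control flow:
+ *
+ *   vector float x, y;                        // assume both are initialized
+ *   if (vec_all_ge(x, y) && !vec_any_nan(x)) {
+ *     // every lane of x is >= the matching lane of y and no lane is NaN
+ *   }
+ */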
+
+/* Power 8 Crypto functions
+Note: We diverge from the current GCC implementation with regard
+to cryptography and related functions as follows:
+- Only the SHA and AES instructions and builtins are disabled by -mno-crypto
+- The remaining ones are only available on Power8 and up so
+  require -mpower8-vector
+The justification for this is that export requirements mandate that
+Category:Vector.Crypto be optional (i.e. compliant hardware may not provide
+support). As a result, we need to be able to turn off support for those.
+The remaining ones (currently controlled by -mcrypto for GCC) still
+need to be provided on compliant hardware even if Vector.Crypto is not
+provided.
+*/
+#ifdef __CRYPTO__
+#define vec_sbox_be __builtin_altivec_crypto_vsbox
+#define vec_cipher_be __builtin_altivec_crypto_vcipher
+#define vec_cipherlast_be __builtin_altivec_crypto_vcipherlast
+#define vec_ncipher_be __builtin_altivec_crypto_vncipher
+#define vec_ncipherlast_be __builtin_altivec_crypto_vncipherlast
+
+static vector unsigned long long __attribute__((__always_inline__))
+__builtin_crypto_vsbox(vector unsigned long long __a) {
+  return __builtin_altivec_crypto_vsbox(__a);
+}
+
+static vector unsigned long long __attribute__((__always_inline__))
+__builtin_crypto_vcipher(vector unsigned long long __a,
+                         vector unsigned long long __b) {
+  return __builtin_altivec_crypto_vcipher(__a, __b);
+}
+
+static vector unsigned long long __attribute__((__always_inline__))
+__builtin_crypto_vcipherlast(vector unsigned long long __a,
+                             vector unsigned long long __b) {
+  return __builtin_altivec_crypto_vcipherlast(__a, __b);
+}
+
+static vector unsigned long long __attribute__((__always_inline__))
+__builtin_crypto_vncipher(vector unsigned long long __a,
+                          vector unsigned long long __b) {
+  return __builtin_altivec_crypto_vncipher(__a, __b);
+}
+
+static vector unsigned long long __attribute__((__always_inline__))
+__builtin_crypto_vncipherlast(vector unsigned long long __a,
+                              vector unsigned long long __b) {
+  return __builtin_altivec_crypto_vncipherlast(__a, __b);
+}
+
+#define __builtin_crypto_vshasigmad __builtin_altivec_crypto_vshasigmad
+#define __builtin_crypto_vshasigmaw __builtin_altivec_crypto_vshasigmaw
+
+#define vec_shasigma_be(X, Y, Z) \
+  _Generic((X), vector unsigned int: __builtin_crypto_vshasigmaw, \
+                vector unsigned long long: __builtin_crypto_vshasigmad) \
+((X), (Y), (Z))
+#endif
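+
+/* Illustrative usage sketch (not taken from the original header; assumes the
+ * target enables the crypto category so the macro above is defined):
+ * vec_shasigma_be picks the word or doubleword SHA builtin through _Generic
+ * based on the element type of its first argument:
+ *
+ *   vector unsigned int w;                              // assume initialized
+ *   vector unsigned int s = vec_shasigma_be(w, 1, 0xf); // SHA-256 sigma form
+ *   // a vector unsigned long long argument selects the SHA-512 form instead
+ */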
+
+#ifdef __POWER8_VECTOR__
+static vector unsigned char __ATTRS_o_ai
+__builtin_crypto_vpermxor(vector unsigned char __a, vector unsigned char __b,
+                          vector unsigned char __c) {
+  return __builtin_altivec_crypto_vpermxor(__a, __b, __c);
+}
+
+static vector unsigned short __ATTRS_o_ai
+__builtin_crypto_vpermxor(vector unsigned short __a, vector unsigned short __b,
+                          vector unsigned short __c) {
+  return (vector unsigned short)__builtin_altivec_crypto_vpermxor(
+      (vector unsigned char)__a, (vector unsigned char)__b,
+      (vector unsigned char)__c);
+}
+
+static vector unsigned int __ATTRS_o_ai __builtin_crypto_vpermxor(
+    vector unsigned int __a, vector unsigned int __b, vector unsigned int __c) {
+  return (vector unsigned int)__builtin_altivec_crypto_vpermxor(
+      (vector unsigned char)__a, (vector unsigned char)__b,
+      (vector unsigned char)__c);
+}
+
+static vector unsigned long long __ATTRS_o_ai __builtin_crypto_vpermxor(
+    vector unsigned long long __a, vector unsigned long long __b,
+    vector unsigned long long __c) {
+  return (vector unsigned long long)__builtin_altivec_crypto_vpermxor(
+      (vector unsigned char)__a, (vector unsigned char)__b,
+      (vector unsigned char)__c);
+}
+
+static vector unsigned char __ATTRS_o_ai
+__builtin_crypto_vpmsumb(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_altivec_crypto_vpmsumb(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+__builtin_crypto_vpmsumb(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_altivec_crypto_vpmsumh(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+__builtin_crypto_vpmsumb(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_altivec_crypto_vpmsumw(__a, __b);
+}
+
+static vector unsigned long long __ATTRS_o_ai
+__builtin_crypto_vpmsumb(vector unsigned long long __a,
+                         vector unsigned long long __b) {
+  return __builtin_altivec_crypto_vpmsumd(__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai vec_vgbbd (vector signed char __a)
+{
+  return __builtin_altivec_vgbbd((vector unsigned char) __a);
+}
+
+#define vec_pmsum_be __builtin_crypto_vpmsumb
+#define vec_gb __builtin_altivec_vgbbd
+
+static vector unsigned char __ATTRS_o_ai vec_vgbbd (vector unsigned char __a)
+{
+  return __builtin_altivec_vgbbd(__a);
+}
+
+static vector long long __ATTRS_o_ai
+vec_vbpermq (vector signed char __a, vector signed char __b)
+{
+  return __builtin_altivec_vbpermq((vector unsigned char) __a,
+                                   (vector unsigned char) __b);
+}
+
+static vector long long __ATTRS_o_ai
+vec_vbpermq (vector unsigned char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vbpermq(__a, __b);
+}
+
+#ifdef __powerpc64__
+static vector unsigned long long __attribute__((__always_inline__))
+vec_bperm (vector unsigned __int128 __a, vector unsigned char __b) {
+  return __builtin_altivec_vbpermq((vector unsigned char) __a,
+                                   (vector unsigned char) __b);
+}
+#endif
+#endif
+
+#undef __ATTRS_o_ai
+
+#endif /* __ALTIVEC_H */
diff --git a/24.0.3/clang-include/ammintrin.h b/24.0.3/clang-include/ammintrin.h
new file mode 100644
index 0000000..4880fd7
--- /dev/null
+++ b/24.0.3/clang-include/ammintrin.h
@@ -0,0 +1,209 @@
+/*===---- ammintrin.h - SSE4a intrinsics -----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __AMMINTRIN_H
+#define __AMMINTRIN_H
+
+#include <pmmintrin.h>
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4a")))
+
+/// \brief Extracts the specified bits from the lower 64 bits of the 128-bit
+///    integer vector operand at the index idx and of the length len.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_extracti_si64(__m128i x, const int len, const int idx);
+/// \endcode
+///
+/// \code
+/// This intrinsic corresponds to the \c EXTRQ instruction.
+/// \endcode
+///
+/// \param x
+///    The value from which bits are extracted.
+/// \param len
+///    Bits [5:0] specify the length; the other bits are ignored. If bits [5:0]
+///    are zero, the length is interpreted as 64.
+/// \param idx
+///    Bits [5:0] specify the index of the least significant bit; the other
+///    bits are ignored. If the sum of the index and length is greater than
+///    64, the result is undefined. If the length and index are both zero,
+///    bits [63:0] of parameter x are extracted. If the length is zero
+///    but the index is non-zero, the result is undefined.
+/// \returns A 128-bit integer vector whose lower 64 bits contain the bits
+///    extracted from the source operand.
+#define _mm_extracti_si64(x, len, idx) \
+  ((__m128i)__builtin_ia32_extrqi((__v2di)(__m128i)(x), \
+                                  (char)(len), (char)(idx)))
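+
+/* Illustrative usage sketch (not taken from the original header): extract the
+ * 8-bit field that starts at bit 4 of the low quadword:
+ *
+ *   __m128i v = _mm_set_epi64x(0, 0x1234);
+ *   __m128i f = _mm_extracti_si64(v, 8, 4);   // low 64 bits now hold 0x23
+ */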
+
+/// \brief Extracts the specified bits from the lower 64 bits of the 128-bit
+///    integer vector operand at the index and of the length specified by __y.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// This intrinsic corresponds to the \c EXTRQ instruction.
+/// \endcode
+///
+/// \param __x
+///    The value from which bits are extracted.
+/// \param __y
+///    Specifies the index of the least significant bit at [13:8]
+///    and the length at [5:0]; all other bits are ignored.
+///    If bits [5:0] are zero, the length is interpreted as 64.
+///    If the sum of the index and length is greater than 64, the result is
+///    undefined. If the length and index are both zero, bits [63:0] of
+///    parameter __x are extracted. If the length is zero but the index is
+///    non-zero, the result is undefined.
+/// \returns A 128-bit vector whose lower 64 bits contain the bits extracted
+///    from the source operand.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_extract_si64(__m128i __x, __m128i __y)
+{
+  return (__m128i)__builtin_ia32_extrq((__v2di)__x, (__v16qi)__y);
+}
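+
+/* Illustrative usage sketch (not taken from the original header): the register
+ * form packs the field length into bits [5:0] of __y and the starting bit
+ * index into bits [13:8]:
+ *
+ *   __m128i v   = _mm_set_epi64x(0, 0x1234);
+ *   __m128i ctl = _mm_set_epi64x(0, (4 << 8) | 8);   // index 4, length 8
+ *   __m128i f   = _mm_extract_si64(v, ctl);          // low 64 bits hold 0x23
+ */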
+
+/// \brief Inserts bits of a specified length from the source integer vector
+///    y into the lower 64 bits of the destination integer vector x at the
+///    index idx and of the length len.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_inserti_si64(__m128i x, __m128i y, const int len,
+/// const int idx);
+/// \endcode
+///
+/// \code
+/// This intrinsic corresponds to the \c INSERTQ instruction.
+/// \endcode
+///
+/// \param x
+///    The destination operand where bits will be inserted. The inserted bits
+///    are defined by the length len and by the index idx specifying the least
+///    significant bit.
+/// \param y
+///    The source operand containing the bits to be extracted. The extracted
+///    bits are the least significant bits of operand y of length len.
+/// \param len
+///    Bits [5:0] specify the length; the other bits are ignored. If bits [5:0]
+///    are zero, the length is interpreted as 64.
+/// \param idx
+///    Bits [5:0] specify the index of the least significant bit; the other
+///    bits are ignored. If the sum of the index and length is greater than
+///    64, the result is undefined. If the length and index are both zero,
+///    bits [63:0] of parameter y are inserted into parameter x. If the
+///    length is zero but the index is non-zero, the result is undefined.
+/// \returns A 128-bit integer vector containing the original lower 64-bits
+///    of destination operand x with the specified bitfields replaced by the
+///    lower bits of source operand y. The upper 64 bits of the return value
+///    are undefined.
+
+#define _mm_inserti_si64(x, y, len, idx) \
+  ((__m128i)__builtin_ia32_insertqi((__v2di)(__m128i)(x), \
+                                    (__v2di)(__m128i)(y), \
+                                    (char)(len), (char)(idx)))
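+
+/* Illustrative usage sketch (not taken from the original header): replace the
+ * 8-bit field at bit 4 of x with the low 8 bits of y:
+ *
+ *   __m128i x = _mm_set_epi64x(0, 0xFF00);
+ *   __m128i y = _mm_set_epi64x(0, 0x00AB);
+ *   __m128i r = _mm_inserti_si64(x, y, 8, 4);  // low 64 bits now hold 0xFAB0
+ */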
+
+/// \brief Inserts bits of a specified length from the source integer vector
+///    __y into the lower 64 bits of the destination integer vector __x at
+///    the index and of the length specified by __y.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// This intrinsic corresponds to the \c INSERTQ instruction.
+/// \endcode
+///
+/// \param __x
+///    The destination operand where bits will be inserted. The inserted bits
+///    are defined by the length and by the index of the least significant bit
+///    specified by operand __y.
+/// \param __y
+///    The source operand containing the bits to be extracted. The extracted
+///    bits are the least significant bits of operand __y with length specified
+///    by bits [69:64]. These are inserted into the destination at the index
+///    specified by bits [77:72]; all other bits are ignored.
+///    If bits [69:64] are zero, the length is interpreted as 64.
+///    If the sum of the index and length is greater than 64, the result is
+///    undefined. If the length and index are both zero, bits [63:0] of
+///    parameter __y are inserted into parameter __x. If the length
+///    is zero but the index is non-zero, the result is undefined.
+/// \returns A 128-bit integer vector containing the original lower 64 bits
+///    of destination operand __x with the specified bitfields replaced by the
+///    lower bits of source operand __y. The upper 64 bits of the return value
+///    are undefined.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_insert_si64(__m128i __x, __m128i __y)
+{
+  return (__m128i)__builtin_ia32_insertq((__v2di)__x, (__v2di)__y);
+}
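+
+/*
+ * Usage sketch (illustrative comment, not part of the upstream header).
+ * Extract 16 bits starting at bit 8 of the low quadword, then insert the
+ * low 16 bits of another vector back at the same position:
+ *
+ *   __m128i v = _mm_cvtsi64_si128(0x1122334455667788LL);
+ *   __m128i e = _mm_extracti_si64(v, 16, 8);     // low 64 bits == 0x6677
+ *   __m128i r = _mm_inserti_si64(v, e, 16, 8);   // bits [23:8] of v replaced
+ *
+ * The register forms (_mm_extract_si64/_mm_insert_si64) take the same length
+ * and index packed into the second vector operand instead of immediates.
+ */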
+
+/// \brief Stores a 64-bit double-precision value in a 64-bit memory location.
+///    To minimize caching, the data is flagged as non-temporal (unlikely to be
+///    used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// This intrinsic corresponds to the \c MOVNTSD instruction.
+/// \endcode
+///
+/// \param __p
+///    The 64-bit memory location used to store the register value.
+/// \param __a
+///    The 64-bit double-precision floating-point register value to
+///    be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_stream_sd(double *__p, __m128d __a)
+{
+  __builtin_ia32_movntsd(__p, (__v2df)__a);
+}
+
+/// \brief Stores a 32-bit single-precision floating-point value in a 32-bit
+///    memory location. To minimize caching, the data is flagged as
+///    non-temporal (unlikely to be used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// This intrinsic corresponds to the \c MOVNTSS instruction.
+/// \endcode
+///
+/// \param __p
+///    The 32-bit memory location used to store the register value.
+/// \param __a
+///    The 32-bit single-precision floating-point register value to
+///    be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_stream_ss(float *__p, __m128 __a)
+{
+  __builtin_ia32_movntss(__p, (__v4sf)__a);
+}
+
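+/*
+ * Usage sketch (illustrative comment, not part of the upstream header).
+ * The streaming stores bypass the cache hierarchy, which helps when filling
+ * large buffers that will not be read back soon:
+ *
+ *   void fill(double *dst, __m128d v, size_t n) {
+ *     for (size_t i = 0; i < n; ++i)
+ *       _mm_stream_sd(&dst[i], v);   // non-temporal scalar store
+ *     _mm_sfence();                  // order the stores before reuse
+ *   }
+ */
+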
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __AMMINTRIN_H */
diff --git a/24.0.3/clang-include/arm_acle.h b/24.0.3/clang-include/arm_acle.h
new file mode 100644
index 0000000..4be1d09
--- /dev/null
+++ b/24.0.3/clang-include/arm_acle.h
@@ -0,0 +1,308 @@
+/*===---- arm_acle.h - ARM Non-Neon intrinsics -----------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __ARM_ACLE_H
+#define __ARM_ACLE_H
+
+#ifndef __ARM_ACLE
+#error "ACLE intrinsics support not enabled."
+#endif
+
+#include <stdint.h>
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/* 8 SYNCHRONIZATION, BARRIER AND HINT INTRINSICS */
+/* 8.3 Memory barriers */
+#if !defined(_MSC_VER)
+#define __dmb(i) __builtin_arm_dmb(i)
+#define __dsb(i) __builtin_arm_dsb(i)
+#define __isb(i) __builtin_arm_isb(i)
+#endif
+
+/* 8.4 Hints */
+
+#if !defined(_MSC_VER)
+static __inline__ void __attribute__((__always_inline__, __nodebug__)) __wfi(void) {
+  __builtin_arm_wfi();
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__)) __wfe(void) {
+  __builtin_arm_wfe();
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__)) __sev(void) {
+  __builtin_arm_sev();
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__)) __sevl(void) {
+  __builtin_arm_sevl();
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__)) __yield(void) {
+  __builtin_arm_yield();
+}
+#endif
+
+#if __ARM_32BIT_STATE
+#define __dbg(t) __builtin_arm_dbg(t)
+#endif
+
+/* 8.5 Swap */
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+  __swp(uint32_t x, volatile uint32_t *p) {
+  uint32_t v;
+  do v = __builtin_arm_ldrex(p); while (__builtin_arm_strex(x, p));
+  return v;
+}
+
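+/*
+ * Usage sketch (illustrative comment, not part of the upstream header).
+ * __swp atomically exchanges a value with a memory location via an
+ * exclusive load/store loop, e.g. a minimal test-and-set lock:
+ *
+ *   volatile uint32_t lock = 0;
+ *   while (__swp(1, &lock) != 0)
+ *     __yield();                    // spin until the previous value was 0
+ *   ...critical section...
+ *   lock = 0;                       // release (add barriers as required)
+ */
+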
+/* 8.6 Memory prefetch intrinsics */
+/* 8.6.1 Data prefetch */
+#define __pld(addr) __pldx(0, 0, 0, addr)
+
+#if __ARM_32BIT_STATE
+#define __pldx(access_kind, cache_level, retention_policy, addr) \
+  __builtin_arm_prefetch(addr, access_kind, 1)
+#else
+#define __pldx(access_kind, cache_level, retention_policy, addr) \
+  __builtin_arm_prefetch(addr, access_kind, cache_level, retention_policy, 1)
+#endif
+
+/* 8.6.2 Instruction prefetch */
+#define __pli(addr) __plix(0, 0, addr)
+
+#if __ARM_32BIT_STATE
+#define __plix(cache_level, retention_policy, addr) \
+  __builtin_arm_prefetch(addr, 0, 0)
+#else
+#define __plix(cache_level, retention_policy, addr) \
+  __builtin_arm_prefetch(addr, 0, cache_level, retention_policy, 0)
+#endif
+
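+/*
+ * Usage sketch (illustrative comment, not part of the upstream header).
+ * __pld/__pli hint that data or code at an address will be needed soon; the
+ * extended forms also name the access kind, cache level and retention policy:
+ *
+ *   __pld(&buf[i + 64]);            // prefetch data for a later read
+ *   __pldx(1, 0, 0, &buf[i + 64]);  // prefetch for write into L1, temporal
+ */
+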
+/* 8.7 NOP */
+static __inline__ void __attribute__((__always_inline__, __nodebug__)) __nop(void) {
+  __builtin_arm_nop();
+}
+
+/* 9 DATA-PROCESSING INTRINSICS */
+/* 9.2 Miscellaneous data-processing intrinsics */
+/* ROR */
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+  __ror(uint32_t x, uint32_t y) {
+  y %= 32;
+  if (y == 0)  return x;
+  return (x >> y) | (x << (32 - y));
+}
+
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
+  __rorll(uint64_t x, uint32_t y) {
+  y %= 64;
+  if (y == 0)  return x;
+  return (x >> y) | (x << (64 - y));
+}
+
+static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
+  __rorl(unsigned long x, uint32_t y) {
+#if __SIZEOF_LONG__ == 4
+  return __ror(x, y);
+#else
+  return __rorll(x, y);
+#endif
+}
+
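+/*
+ * Worked examples (illustrative comment, not part of the upstream header):
+ * rotating right moves the low bits into the high positions, so
+ *   __ror(0x80000001u, 1) == 0xC0000000
+ *   __rorll(0x1ULL, 8)    == 0x0100000000000000ULL
+ * and a rotation by 0 (or by a multiple of the width) returns the input.
+ */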
+
+/* CLZ */
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+  __clz(uint32_t t) {
+  return __builtin_clz(t);
+}
+
+static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
+  __clzl(unsigned long t) {
+  return __builtin_clzl(t);
+}
+
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
+  __clzll(uint64_t t) {
+  return __builtin_clzll(t);
+}
+
+/* REV */
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+  __rev(uint32_t t) {
+  return __builtin_bswap32(t);
+}
+
+static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
+  __revl(unsigned long t) {
+#if __SIZEOF_LONG__ == 4
+  return __builtin_bswap32(t);
+#else
+  return __builtin_bswap64(t);
+#endif
+}
+
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
+  __revll(uint64_t t) {
+  return __builtin_bswap64(t);
+}
+
+/* REV16 */
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+  __rev16(uint32_t t) {
+  return __ror(__rev(t), 16);
+}
+
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
+  __rev16ll(uint64_t t) {
+  return (((uint64_t)__rev16(t >> 32)) << 32) | __rev16(t);
+}
+
+static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
+  __rev16l(unsigned long t) {
+#if __SIZEOF_LONG__ == 4
+    return __rev16(t);
+#else
+    return __rev16ll(t);
+#endif
+}
+
+/* REVSH */
+static __inline__ int16_t __attribute__((__always_inline__, __nodebug__))
+  __revsh(int16_t t) {
+  return __builtin_bswap16(t);
+}
+
+/* RBIT */
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+  __rbit(uint32_t t) {
+  return __builtin_arm_rbit(t);
+}
+
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
+  __rbitll(uint64_t t) {
+#if __ARM_32BIT_STATE
+  return (((uint64_t) __builtin_arm_rbit(t)) << 32) |
+    __builtin_arm_rbit(t >> 32);
+#else
+  return __builtin_arm_rbit64(t);
+#endif
+}
+
+static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
+  __rbitl(unsigned long t) {
+#if __SIZEOF_LONG__ == 4
+  return __rbit(t);
+#else
+  return __rbitll(t);
+#endif
+}
+
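+/*
+ * Worked examples (illustrative comment, not part of the upstream header):
+ *   __rev(0x12345678)   == 0x78563412    // byte reverse
+ *   __rev16(0x12345678) == 0x34127856    // byte reverse within halfwords
+ *   __rbit(0x00000001)  == 0x80000000    // bit reverse
+ */
+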
+/*
+ * 9.4 Saturating intrinsics
+ *
+ * FIXME: Change the guard to the corresponding __ARM_FEATURE flag when the
+ * Q flag intrinsics are implemented and the flag is enabled.
+ */
+/* 9.4.1 Width-specified saturation intrinsics */
+#if __ARM_32BIT_STATE
+#define __ssat(x, y) __builtin_arm_ssat(x, y)
+#define __usat(x, y) __builtin_arm_usat(x, y)
+#endif
+
+/* 9.4.2 Saturating addition and subtraction intrinsics */
+#if __ARM_32BIT_STATE
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+  __qadd(int32_t t, int32_t v) {
+  return __builtin_arm_qadd(t, v);
+}
+
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+  __qsub(int32_t t, int32_t v) {
+  return __builtin_arm_qsub(t, v);
+}
+
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__qdbl(int32_t t) {
+  return __builtin_arm_qadd(t, t);
+}
+#endif
+
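+/*
+ * Usage sketch (illustrative comment, not part of the upstream header).
+ * The saturating forms clamp instead of wrapping on overflow:
+ *   __qadd(INT32_MAX, 1) == INT32_MAX
+ *   __qsub(INT32_MIN, 1) == INT32_MIN
+ *   __ssat(1000, 8)      == 127          // clamp to the signed 8-bit range
+ */
+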
+/* 9.7 CRC32 intrinsics */
+#if __ARM_FEATURE_CRC32
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+  __crc32b(uint32_t a, uint8_t b) {
+  return __builtin_arm_crc32b(a, b);
+}
+
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+  __crc32h(uint32_t a, uint16_t b) {
+  return __builtin_arm_crc32h(a, b);
+}
+
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+  __crc32w(uint32_t a, uint32_t b) {
+  return __builtin_arm_crc32w(a, b);
+}
+
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+  __crc32d(uint32_t a, uint64_t b) {
+  return __builtin_arm_crc32d(a, b);
+}
+
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+  __crc32cb(uint32_t a, uint8_t b) {
+  return __builtin_arm_crc32cb(a, b);
+}
+
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+  __crc32ch(uint32_t a, uint16_t b) {
+  return __builtin_arm_crc32ch(a, b);
+}
+
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+  __crc32cw(uint32_t a, uint32_t b) {
+  return __builtin_arm_crc32cw(a, b);
+}
+
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+  __crc32cd(uint32_t a, uint64_t b) {
+  return __builtin_arm_crc32cd(a, b);
+}
+#endif
+
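+/*
+ * Usage sketch (illustrative comment, not part of the upstream header).
+ * With __ARM_FEATURE_CRC32, a CRC-32 over a word-aligned buffer can be
+ * accumulated a word at a time (initial value and final inversion follow
+ * the usual CRC-32/IEEE convention):
+ *
+ *   uint32_t crc = 0xFFFFFFFFu;
+ *   for (size_t i = 0; i < n; ++i)
+ *     crc = __crc32w(crc, words[i]);
+ *   crc = ~crc;
+ */
+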
+/* 10.1 Special register intrinsics */
+#define __arm_rsr(sysreg) __builtin_arm_rsr(sysreg)
+#define __arm_rsr64(sysreg) __builtin_arm_rsr64(sysreg)
+#define __arm_rsrp(sysreg) __builtin_arm_rsrp(sysreg)
+#define __arm_wsr(sysreg, v) __builtin_arm_wsr(sysreg, v)
+#define __arm_wsr64(sysreg, v) __builtin_arm_wsr64(sysreg, v)
+#define __arm_wsrp(sysreg, v) __builtin_arm_wsrp(sysreg, v)
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /* __ARM_ACLE_H */
diff --git a/24.0.3/clang-include/avx2intrin.h b/24.0.3/clang-include/avx2intrin.h
new file mode 100644
index 0000000..f786572
--- /dev/null
+++ b/24.0.3/clang-include/avx2intrin.h
@@ -0,0 +1,1215 @@
+/*===---- avx2intrin.h - AVX2 intrinsics -----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <avx2intrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX2INTRIN_H
+#define __AVX2INTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx2")))
+
+/* SSE4 Multiple Packed Sums of Absolute Difference.  */
+#define _mm256_mpsadbw_epu8(X, Y, M) __builtin_ia32_mpsadbw256((X), (Y), (M))
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_abs_epi8(__m256i __a)
+{
+    return (__m256i)__builtin_ia32_pabsb256((__v32qi)__a);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_abs_epi16(__m256i __a)
+{
+    return (__m256i)__builtin_ia32_pabsw256((__v16hi)__a);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_abs_epi32(__m256i __a)
+{
+    return (__m256i)__builtin_ia32_pabsd256((__v8si)__a);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_packs_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_packsswb256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_packs_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_packssdw256((__v8si)__a, (__v8si)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_packus_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_packuswb256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_packus_epi32(__m256i __V1, __m256i __V2)
+{
+  return (__m256i) __builtin_ia32_packusdw256((__v8si)__V1, (__v8si)__V2);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_add_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v32qi)__a + (__v32qi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_add_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v16hi)__a + (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_add_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v8si)__a + (__v8si)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_add_epi64(__m256i __a, __m256i __b)
+{
+  return __a + __b;
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_adds_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_paddsb256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_adds_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_paddsw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_adds_epu8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_paddusb256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_adds_epu16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_paddusw256((__v16hi)__a, (__v16hi)__b);
+}
+
+#define _mm256_alignr_epi8(a, b, n) __extension__ ({        \
+  (__m256i)__builtin_ia32_palignr256((__v32qi)(__m256i)(a), \
+                                     (__v32qi)(__m256i)(b), (n)); })
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_and_si256(__m256i __a, __m256i __b)
+{
+  return __a & __b;
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_andnot_si256(__m256i __a, __m256i __b)
+{
+  return ~__a & __b;
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_avg_epu8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pavgb256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_avg_epu16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pavgw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M)
+{
+  return (__m256i)__builtin_ia32_pblendvb256((__v32qi)__V1, (__v32qi)__V2,
+                                              (__v32qi)__M);
+}
+
+#define _mm256_blend_epi16(V1, V2, M) __extension__ ({       \
+  (__m256i)__builtin_shufflevector((__v16hi)(__m256i)(V1),   \
+                                   (__v16hi)(__m256i)(V2),   \
+                                   (((M) & 0x01) ? 16 : 0),  \
+                                   (((M) & 0x02) ? 17 : 1),  \
+                                   (((M) & 0x04) ? 18 : 2),  \
+                                   (((M) & 0x08) ? 19 : 3),  \
+                                   (((M) & 0x10) ? 20 : 4),  \
+                                   (((M) & 0x20) ? 21 : 5),  \
+                                   (((M) & 0x40) ? 22 : 6),  \
+                                   (((M) & 0x80) ? 23 : 7),  \
+                                   (((M) & 0x01) ? 24 : 8),  \
+                                   (((M) & 0x02) ? 25 : 9),  \
+                                   (((M) & 0x04) ? 26 : 10), \
+                                   (((M) & 0x08) ? 27 : 11), \
+                                   (((M) & 0x10) ? 28 : 12), \
+                                   (((M) & 0x20) ? 29 : 13), \
+                                   (((M) & 0x40) ? 30 : 14), \
+                                   (((M) & 0x80) ? 31 : 15)); })
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cmpeq_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v32qi)__a == (__v32qi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cmpeq_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v16hi)__a == (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cmpeq_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v8si)__a == (__v8si)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cmpeq_epi64(__m256i __a, __m256i __b)
+{
+  return (__m256i)(__a == __b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cmpgt_epi8(__m256i __a, __m256i __b)
+{
+  /* This function always performs a signed comparison, but __v32qi is a char
+     which may be signed or unsigned, so use __v32qs. */
+  return (__m256i)((__v32qs)__a > (__v32qs)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cmpgt_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v16hi)__a > (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cmpgt_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v8si)__a > (__v8si)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cmpgt_epi64(__m256i __a, __m256i __b)
+{
+  return (__m256i)(__a > __b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_hadd_epi16(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_phaddw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_hadd_epi32(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_hadds_epi16(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_hsub_epi16(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_phsubw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_hsub_epi32(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_hsubs_epi16(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maddubs_epi16(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_pmaddubsw256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_madd_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmaddwd256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_max_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmaxsb256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_max_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmaxsw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_max_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmaxsd256((__v8si)__a, (__v8si)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_max_epu8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmaxub256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_max_epu16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmaxuw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_max_epu32(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmaxud256((__v8si)__a, (__v8si)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_min_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pminsb256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_min_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pminsw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_min_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pminsd256((__v8si)__a, (__v8si)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_min_epu8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pminub256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_min_epu16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pminuw256 ((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_min_epu32(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pminud256((__v8si)__a, (__v8si)__b);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm256_movemask_epi8(__m256i __a)
+{
+  return __builtin_ia32_pmovmskb256((__v32qi)__a);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtepi8_epi16(__m128i __V)
+{
+  return (__m256i)__builtin_ia32_pmovsxbw256((__v16qi)__V);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtepi8_epi32(__m128i __V)
+{
+  return (__m256i)__builtin_ia32_pmovsxbd256((__v16qi)__V);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtepi8_epi64(__m128i __V)
+{
+  return (__m256i)__builtin_ia32_pmovsxbq256((__v16qi)__V);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtepi16_epi32(__m128i __V)
+{
+  return (__m256i)__builtin_ia32_pmovsxwd256((__v8hi)__V);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtepi16_epi64(__m128i __V)
+{
+  return (__m256i)__builtin_ia32_pmovsxwq256((__v8hi)__V);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtepi32_epi64(__m128i __V)
+{
+  return (__m256i)__builtin_ia32_pmovsxdq256((__v4si)__V);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtepu8_epi16(__m128i __V)
+{
+  return (__m256i)__builtin_ia32_pmovzxbw256((__v16qi)__V);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtepu8_epi32(__m128i __V)
+{
+  return (__m256i)__builtin_ia32_pmovzxbd256((__v16qi)__V);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtepu8_epi64(__m128i __V)
+{
+  return (__m256i)__builtin_ia32_pmovzxbq256((__v16qi)__V);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtepu16_epi32(__m128i __V)
+{
+  return (__m256i)__builtin_ia32_pmovzxwd256((__v8hi)__V);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtepu16_epi64(__m128i __V)
+{
+  return (__m256i)__builtin_ia32_pmovzxwq256((__v8hi)__V);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtepu32_epi64(__m128i __V)
+{
+  return (__m256i)__builtin_ia32_pmovzxdq256((__v4si)__V);
+}
+
+static __inline__  __m256i __DEFAULT_FN_ATTRS
+_mm256_mul_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmuldq256((__v8si)__a, (__v8si)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mulhrs_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmulhrsw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mulhi_epu16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmulhuw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mulhi_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmulhw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mullo_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v16hi)__a * (__v16hi)__b);
+}
+
+static __inline__  __m256i __DEFAULT_FN_ATTRS
+_mm256_mullo_epi32 (__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v8si)__a * (__v8si)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mul_epu32(__m256i __a, __m256i __b)
+{
+  return __builtin_ia32_pmuludq256((__v8si)__a, (__v8si)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_or_si256(__m256i __a, __m256i __b)
+{
+  return __a | __b;
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_sad_epu8(__m256i __a, __m256i __b)
+{
+  return __builtin_ia32_psadbw256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_shuffle_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pshufb256((__v32qi)__a, (__v32qi)__b);
+}
+
+#define _mm256_shuffle_epi32(a, imm) __extension__ ({ \
+  (__m256i)__builtin_shufflevector((__v8si)(__m256i)(a), \
+                                   (__v8si)_mm256_setzero_si256(), \
+                                   (imm) & 0x3, ((imm) & 0xc) >> 2, \
+                                   ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \
+                                   4 + (((imm) & 0x03) >> 0), \
+                                   4 + (((imm) & 0x0c) >> 2), \
+                                   4 + (((imm) & 0x30) >> 4), \
+                                   4 + (((imm) & 0xc0) >> 6)); })
+
+#define _mm256_shufflehi_epi16(a, imm) __extension__ ({ \
+  (__m256i)__builtin_shufflevector((__v16hi)(__m256i)(a), \
+                                   (__v16hi)_mm256_setzero_si256(), \
+                                   0, 1, 2, 3, \
+                                   4 + (((imm) & 0x03) >> 0), \
+                                   4 + (((imm) & 0x0c) >> 2), \
+                                   4 + (((imm) & 0x30) >> 4), \
+                                   4 + (((imm) & 0xc0) >> 6), \
+                                   8, 9, 10, 11, \
+                                   12 + (((imm) & 0x03) >> 0), \
+                                   12 + (((imm) & 0x0c) >> 2), \
+                                   12 + (((imm) & 0x30) >> 4), \
+                                   12 + (((imm) & 0xc0) >> 6)); })
+
+#define _mm256_shufflelo_epi16(a, imm) __extension__ ({ \
+  (__m256i)__builtin_shufflevector((__v16hi)(__m256i)(a), \
+                                   (__v16hi)_mm256_setzero_si256(), \
+                                   (imm) & 0x3,((imm) & 0xc) >> 2, \
+                                   ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \
+                                   4, 5, 6, 7, \
+                                   8 + (((imm) & 0x03) >> 0), \
+                                   8 + (((imm) & 0x0c) >> 2), \
+                                   8 + (((imm) & 0x30) >> 4), \
+                                   8 + (((imm) & 0xc0) >> 6), \
+                                   12, 13, 14, 15); })
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_sign_epi8(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_psignb256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_sign_epi16(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_psignw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_sign_epi32(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_psignd256((__v8si)__a, (__v8si)__b);
+}
+
+#define _mm256_slli_si256(a, count) __extension__ ({ \
+  (__m256i)__builtin_ia32_pslldqi256((__m256i)(a), (count)*8); })
+
+#define _mm256_bslli_epi128(a, count) _mm256_slli_si256((a), (count))
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_slli_epi16(__m256i __a, int __count)
+{
+  return (__m256i)__builtin_ia32_psllwi256((__v16hi)__a, __count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_sll_epi16(__m256i __a, __m128i __count)
+{
+  return (__m256i)__builtin_ia32_psllw256((__v16hi)__a, (__v8hi)__count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_slli_epi32(__m256i __a, int __count)
+{
+  return (__m256i)__builtin_ia32_pslldi256((__v8si)__a, __count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_sll_epi32(__m256i __a, __m128i __count)
+{
+  return (__m256i)__builtin_ia32_pslld256((__v8si)__a, (__v4si)__count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_slli_epi64(__m256i __a, int __count)
+{
+  return __builtin_ia32_psllqi256(__a, __count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_sll_epi64(__m256i __a, __m128i __count)
+{
+  return __builtin_ia32_psllq256(__a, __count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_srai_epi16(__m256i __a, int __count)
+{
+  return (__m256i)__builtin_ia32_psrawi256((__v16hi)__a, __count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_sra_epi16(__m256i __a, __m128i __count)
+{
+  return (__m256i)__builtin_ia32_psraw256((__v16hi)__a, (__v8hi)__count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_srai_epi32(__m256i __a, int __count)
+{
+  return (__m256i)__builtin_ia32_psradi256((__v8si)__a, __count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_sra_epi32(__m256i __a, __m128i __count)
+{
+  return (__m256i)__builtin_ia32_psrad256((__v8si)__a, (__v4si)__count);
+}
+
+#define _mm256_srli_si256(a, count) __extension__ ({ \
+  (__m256i)__builtin_ia32_psrldqi256((__m256i)(a), (count)*8); })
+
+#define _mm256_bsrli_epi128(a, count) _mm256_srli_si256((a), (count))
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_srli_epi16(__m256i __a, int __count)
+{
+  return (__m256i)__builtin_ia32_psrlwi256((__v16hi)__a, __count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_srl_epi16(__m256i __a, __m128i __count)
+{
+  return (__m256i)__builtin_ia32_psrlw256((__v16hi)__a, (__v8hi)__count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_srli_epi32(__m256i __a, int __count)
+{
+  return (__m256i)__builtin_ia32_psrldi256((__v8si)__a, __count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_srl_epi32(__m256i __a, __m128i __count)
+{
+  return (__m256i)__builtin_ia32_psrld256((__v8si)__a, (__v4si)__count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_srli_epi64(__m256i __a, int __count)
+{
+  return __builtin_ia32_psrlqi256(__a, __count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_srl_epi64(__m256i __a, __m128i __count)
+{
+  return __builtin_ia32_psrlq256(__a, __count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_sub_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v32qi)__a - (__v32qi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_sub_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v16hi)__a - (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_sub_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v8si)__a - (__v8si)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_sub_epi64(__m256i __a, __m256i __b)
+{
+  return __a - __b;
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_subs_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_psubsb256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_subs_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_psubsw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_subs_epu8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_psubusb256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_subs_epu16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_psubusw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_unpackhi_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 8, 32+8, 9, 32+9, 10, 32+10, 11, 32+11, 12, 32+12, 13, 32+13, 14, 32+14, 15, 32+15, 24, 32+24, 25, 32+25, 26, 32+26, 27, 32+27, 28, 32+28, 29, 32+29, 30, 32+30, 31, 32+31);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_unpackhi_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_unpackhi_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 2, 8+2, 3, 8+3, 6, 8+6, 7, 8+7);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_unpackhi_epi64(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_shufflevector(__a, __b, 1, 4+1, 3, 4+3);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_unpacklo_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 0, 32+0, 1, 32+1, 2, 32+2, 3, 32+3, 4, 32+4, 5, 32+5, 6, 32+6, 7, 32+7, 16, 32+16, 17, 32+17, 18, 32+18, 19, 32+19, 20, 32+20, 21, 32+21, 22, 32+22, 23, 32+23);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_unpacklo_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_unpacklo_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 0, 8+0, 1, 8+1, 4, 8+4, 5, 8+5);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_unpacklo_epi64(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_shufflevector(__a, __b, 0, 4+0, 2, 4+2);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_xor_si256(__m256i __a, __m256i __b)
+{
+  return __a ^ __b;
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_stream_load_si256(__m256i const *__V)
+{
+  return (__m256i)__builtin_ia32_movntdqa256((const __v4di *)__V);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_broadcastss_ps(__m128 __X)
+{
+  return (__m128)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_broadcastsd_pd(__m128d __a)
+{
+  return __builtin_shufflevector(__a, __a, 0, 0);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_broadcastss_ps(__m128 __X)
+{
+  return (__m256)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_broadcastsd_pd(__m128d __X)
+{
+  return (__m256d)__builtin_shufflevector((__v2df)__X, (__v2df)__X, 0, 0, 0, 0);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_broadcastsi128_si256(__m128i __X)
+{
+  return (__m256i)__builtin_shufflevector(__X, __X, 0, 1, 0, 1);
+}
+
+#define _mm_blend_epi32(V1, V2, M) __extension__ ({ \
+  (__m128i)__builtin_shufflevector((__v4si)(__m128i)(V1),  \
+                                   (__v4si)(__m128i)(V2),  \
+                                   (((M) & 0x01) ? 4 : 0), \
+                                   (((M) & 0x02) ? 5 : 1), \
+                                   (((M) & 0x04) ? 6 : 2), \
+                                   (((M) & 0x08) ? 7 : 3)); })
+
+#define _mm256_blend_epi32(V1, V2, M) __extension__ ({ \
+  (__m256i)__builtin_shufflevector((__v8si)(__m256i)(V1),   \
+                                   (__v8si)(__m256i)(V2),   \
+                                   (((M) & 0x01) ?  8 : 0), \
+                                   (((M) & 0x02) ?  9 : 1), \
+                                   (((M) & 0x04) ? 10 : 2), \
+                                   (((M) & 0x08) ? 11 : 3), \
+                                   (((M) & 0x10) ? 12 : 4), \
+                                   (((M) & 0x20) ? 13 : 5), \
+                                   (((M) & 0x40) ? 14 : 6), \
+                                   (((M) & 0x80) ? 15 : 7)); })
+
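+/*
+ * Usage sketch (illustrative comment, not part of the upstream header).
+ * Each immediate bit selects one 32-bit element from the second source:
+ *
+ *   __m256i lo = _mm256_set1_epi32(0);
+ *   __m256i hi = _mm256_set1_epi32(-1);
+ *   __m256i r  = _mm256_blend_epi32(lo, hi, 0xF0);  // low 4 lanes from lo,
+ *                                                   // high 4 lanes from hi
+ */
+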
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_broadcastb_epi8(__m128i __X)
+{
+  return (__m256i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_broadcastw_epi16(__m128i __X)
+{
+  return (__m256i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_broadcastd_epi32(__m128i __X)
+{
+  return (__m256i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_broadcastq_epi64(__m128i __X)
+{
+  return (__m256i)__builtin_shufflevector(__X, __X, 0, 0, 0, 0);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_broadcastb_epi8(__m128i __X)
+{
+  return (__m128i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_broadcastw_epi16(__m128i __X)
+{
+  return (__m128i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_broadcastd_epi32(__m128i __X)
+{
+  return (__m128i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_broadcastq_epi64(__m128i __X)
+{
+  return (__m128i)__builtin_shufflevector(__X, __X, 0, 0);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_permutevar8x32_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_permvarsi256((__v8si)__a, (__v8si)__b);
+}
+
+#define _mm256_permute4x64_pd(V, M) __extension__ ({ \
+  (__m256d)__builtin_shufflevector((__v4df)(__m256d)(V), \
+                                   (__v4df)_mm256_setzero_pd(), \
+                                   (M) & 0x3, ((M) & 0xc) >> 2, \
+                                   ((M) & 0x30) >> 4, ((M) & 0xc0) >> 6); })
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_permutevar8x32_ps(__m256 __a, __m256i __b)
+{
+  return (__m256)__builtin_ia32_permvarsf256((__v8sf)__a, (__v8si)__b);
+}
+
+#define _mm256_permute4x64_epi64(V, M) __extension__ ({ \
+  (__m256i)__builtin_shufflevector((__v4di)(__m256i)(V), \
+                                   (__v4di)_mm256_setzero_si256(), \
+                                   (M) & 0x3, ((M) & 0xc) >> 2, \
+                                   ((M) & 0x30) >> 4, ((M) & 0xc0) >> 6); })
+
+#define _mm256_permute2x128_si256(V1, V2, M) __extension__ ({ \
+  (__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (M)); })
+
+#define _mm256_extracti128_si256(V, M) __extension__ ({ \
+  (__m128i)__builtin_shufflevector((__v4di)(__m256i)(V), \
+                                   (__v4di)_mm256_setzero_si256(), \
+                                   (((M) & 1) ? 2 : 0), \
+                                   (((M) & 1) ? 3 : 1) ); })
+
+#define _mm256_inserti128_si256(V1, V2, M) __extension__ ({ \
+  (__m256i)__builtin_shufflevector((__v4di)(__m256i)(V1), \
+                                   (__v4di)_mm256_castsi128_si256((__m128i)(V2)), \
+                                   (((M) & 1) ? 0 : 4), \
+                                   (((M) & 1) ? 1 : 5), \
+                                   (((M) & 1) ? 4 : 2), \
+                                   (((M) & 1) ? 5 : 3) ); })
+
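+/*
+ * Usage sketch (illustrative comment, not part of the upstream header).
+ * The lane intrinsics move whole 128-bit halves of a 256-bit register:
+ *
+ *   __m128i hi = _mm256_extracti128_si256(v, 1);     // upper 128 bits of v
+ *   __m256i w  = _mm256_inserti128_si256(v, hi, 0);  // copy them into the
+ *                                                    // lower half as well
+ */
+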
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskload_epi32(int const *__X, __m256i __M)
+{
+  return (__m256i)__builtin_ia32_maskloadd256((const __v8si *)__X, (__v8si)__M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskload_epi64(long long const *__X, __m256i __M)
+{
+  return (__m256i)__builtin_ia32_maskloadq256((const __v4di *)__X, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskload_epi32(int const *__X, __m128i __M)
+{
+  return (__m128i)__builtin_ia32_maskloadd((const __v4si *)__X, (__v4si)__M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskload_epi64(long long const *__X, __m128i __M)
+{
+  return (__m128i)__builtin_ia32_maskloadq((const __v2di *)__X, (__v2di)__M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_maskstore_epi32(int *__X, __m256i __M, __m256i __Y)
+{
+  __builtin_ia32_maskstored256((__v8si *)__X, (__v8si)__M, (__v8si)__Y);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_maskstore_epi64(long long *__X, __m256i __M, __m256i __Y)
+{
+  __builtin_ia32_maskstoreq256((__v4di *)__X, __M, __Y);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_maskstore_epi32(int *__X, __m128i __M, __m128i __Y)
+{
+  __builtin_ia32_maskstored((__v4si *)__X, (__v4si)__M, (__v4si)__Y);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y)
+{
+  __builtin_ia32_maskstoreq(( __v2di *)__X, __M, __Y);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_sllv_epi32(__m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_psllv8si((__v8si)__X, (__v8si)__Y);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sllv_epi32(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_psllv4si((__v4si)__X, (__v4si)__Y);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_sllv_epi64(__m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_psllv4di(__X, __Y);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sllv_epi64(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_psllv2di(__X, __Y);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_srav_epi32(__m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_psrav8si((__v8si)__X, (__v8si)__Y);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_srav_epi32(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_psrav4si((__v4si)__X, (__v4si)__Y);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_srlv_epi32(__m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_psrlv8si((__v8si)__X, (__v8si)__Y);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_srlv_epi32(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_psrlv4si((__v4si)__X, (__v4si)__Y);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_srlv_epi64(__m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_psrlv4di(__X, __Y);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_srlv_epi64(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_psrlv2di(__X, __Y);
+}
+
+#define _mm_mask_i32gather_pd(a, m, i, mask, s) __extension__ ({ \
+  (__m128d)__builtin_ia32_gatherd_pd((__v2df)(__m128i)(a), \
+                                     (double const *)(m), \
+                                     (__v4si)(__m128i)(i), \
+                                     (__v2df)(__m128d)(mask), (s)); })
+
+#define _mm256_mask_i32gather_pd(a, m, i, mask, s) __extension__ ({ \
+  (__m256d)__builtin_ia32_gatherd_pd256((__v4df)(__m256d)(a), \
+                                        (double const *)(m), \
+                                        (__v4si)(__m128i)(i), \
+                                        (__v4df)(__m256d)(mask), (s)); })
+
+#define _mm_mask_i64gather_pd(a, m, i, mask, s) __extension__ ({ \
+  (__m128d)__builtin_ia32_gatherq_pd((__v2df)(__m128d)(a), \
+                                     (double const *)(m), \
+                                     (__v2di)(__m128i)(i), \
+                                     (__v2df)(__m128d)(mask), (s)); })
+
+#define _mm256_mask_i64gather_pd(a, m, i, mask, s) __extension__ ({ \
+  (__m256d)__builtin_ia32_gatherq_pd256((__v4df)(__m256d)(a), \
+                                        (double const *)(m), \
+                                        (__v4di)(__m256i)(i), \
+                                        (__v4df)(__m256d)(mask), (s)); })
+
+#define _mm_mask_i32gather_ps(a, m, i, mask, s) __extension__ ({ \
+  (__m128)__builtin_ia32_gatherd_ps((__v4sf)(__m128)(a), \
+                                    (float const *)(m), \
+                                    (__v4si)(__m128i)(i), \
+                                    (__v4sf)(__m128)(mask), (s)); })
+
+#define _mm256_mask_i32gather_ps(a, m, i, mask, s) __extension__ ({ \
+  (__m256)__builtin_ia32_gatherd_ps256((__v8sf)(__m256)(a), \
+                                       (float const *)(m), \
+                                       (__v8si)(__m256i)(i), \
+                                       (__v8sf)(__m256)(mask), (s)); })
+
+#define _mm_mask_i64gather_ps(a, m, i, mask, s) __extension__ ({ \
+  (__m128)__builtin_ia32_gatherq_ps((__v4sf)(__m128)(a), \
+                                    (float const *)(m), \
+                                    (__v2di)(__m128i)(i), \
+                                    (__v4sf)(__m128)(mask), (s)); })
+
+#define _mm256_mask_i64gather_ps(a, m, i, mask, s) __extension__ ({ \
+  (__m128)__builtin_ia32_gatherq_ps256((__v4sf)(__m128)(a), \
+                                       (float const *)(m), \
+                                       (__v4di)(__m256i)(i), \
+                                       (__v4sf)(__m128)(mask), (s)); })
+
+#define _mm_mask_i32gather_epi32(a, m, i, mask, s) __extension__ ({ \
+  (__m128i)__builtin_ia32_gatherd_d((__v4si)(__m128i)(a), \
+                                    (int const *)(m), \
+                                    (__v4si)(__m128i)(i), \
+                                    (__v4si)(__m128i)(mask), (s)); })
+
+#define _mm256_mask_i32gather_epi32(a, m, i, mask, s) __extension__ ({ \
+  (__m256i)__builtin_ia32_gatherd_d256((__v8si)(__m256i)(a), \
+                                       (int const *)(m), \
+                                       (__v8si)(__m256i)(i), \
+                                       (__v8si)(__m256i)(mask), (s)); })
+
+#define _mm_mask_i64gather_epi32(a, m, i, mask, s) __extension__ ({ \
+  (__m128i)__builtin_ia32_gatherq_d((__v4si)(__m128i)(a), \
+                                    (int const *)(m), \
+                                    (__v2di)(__m128i)(i), \
+                                    (__v4si)(__m128i)(mask), (s)); })
+
+#define _mm256_mask_i64gather_epi32(a, m, i, mask, s) __extension__ ({ \
+  (__m128i)__builtin_ia32_gatherq_d256((__v4si)(__m128i)(a), \
+                                       (int const *)(m), \
+                                       (__v4di)(__m256i)(i), \
+                                       (__v4si)(__m128i)(mask), (s)); })
+
+#define _mm_mask_i32gather_epi64(a, m, i, mask, s) __extension__ ({ \
+  (__m128i)__builtin_ia32_gatherd_q((__v2di)(__m128i)(a), \
+                                    (long long const *)(m), \
+                                    (__v4si)(__m128i)(i), \
+                                    (__v2di)(__m128i)(mask), (s)); })
+
+#define _mm256_mask_i32gather_epi64(a, m, i, mask, s) __extension__ ({ \
+  (__m256i)__builtin_ia32_gatherd_q256((__v4di)(__m256i)(a), \
+                                       (long long const *)(m), \
+                                       (__v4si)(__m128i)(i), \
+                                       (__v4di)(__m256i)(mask), (s)); })
+
+#define _mm_mask_i64gather_epi64(a, m, i, mask, s) __extension__ ({ \
+  (__m128i)__builtin_ia32_gatherq_q((__v2di)(__m128i)(a), \
+                                    (long long const *)(m), \
+                                    (__v2di)(__m128i)(i), \
+                                    (__v2di)(__m128i)(mask), (s)); })
+
+#define _mm256_mask_i64gather_epi64(a, m, i, mask, s) __extension__ ({ \
+  (__m256i)__builtin_ia32_gatherq_q256((__v4di)(__m256i)(a), \
+                                       (long long const *)(m), \
+                                       (__v4di)(__m256i)(i), \
+                                       (__v4di)(__m256i)(mask), (s)); })
+
+#define _mm_i32gather_pd(m, i, s) __extension__ ({ \
+  (__m128d)__builtin_ia32_gatherd_pd((__v2df)_mm_undefined_pd(), \
+                                     (double const *)(m), \
+                                     (__v4si)(__m128i)(i), \
+                                     (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
+                                                          _mm_setzero_pd()), \
+                                     (s)); })
+
+#define _mm256_i32gather_pd(m, i, s) __extension__ ({ \
+  (__m256d)__builtin_ia32_gatherd_pd256((__v4df)_mm256_undefined_pd(), \
+                                        (double const *)(m), \
+                                        (__v4si)(__m128i)(i), \
+                                        (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
+                                                              _mm256_setzero_pd(), \
+                                                              _CMP_EQ_OQ), \
+                                        (s)); })
+
+#define _mm_i64gather_pd(m, i, s) __extension__ ({ \
+  (__m128d)__builtin_ia32_gatherq_pd((__v2df)_mm_undefined_pd(), \
+                                     (double const *)(m), \
+                                     (__v2di)(__m128i)(i), \
+                                     (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
+                                                          _mm_setzero_pd()), \
+                                     (s)); })
+
+#define _mm256_i64gather_pd(m, i, s) __extension__ ({ \
+  (__m256d)__builtin_ia32_gatherq_pd256((__v4df)_mm256_undefined_pd(), \
+                                        (double const *)(m), \
+                                        (__v4di)(__m256i)(i), \
+                                        (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
+                                                              _mm256_setzero_pd(), \
+                                                              _CMP_EQ_OQ), \
+                                        (s)); })
+
+#define _mm_i32gather_ps(m, i, s) __extension__ ({ \
+  (__m128)__builtin_ia32_gatherd_ps((__v4sf)_mm_undefined_ps(), \
+                                    (float const *)(m), \
+                                    (__v4si)(__m128i)(i), \
+                                    (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
+                                                         _mm_setzero_ps()), \
+                                    (s)); })
+
+#define _mm256_i32gather_ps(m, i, s) __extension__ ({ \
+  (__m256)__builtin_ia32_gatherd_ps256((__v8sf)_mm256_undefined_ps(), \
+                                       (float const *)(m), \
+                                       (__v8si)(__m256i)(i), \
+                                       (__v8sf)_mm256_cmp_ps(_mm256_setzero_ps(), \
+                                                             _mm256_setzero_ps(), \
+                                                             _CMP_EQ_OQ), \
+                                       (s)); })
+
+#define _mm_i64gather_ps(m, i, s) __extension__ ({ \
+  (__m128)__builtin_ia32_gatherq_ps((__v4sf)_mm_undefined_ps(), \
+                                    (float const *)(m), \
+                                    (__v2di)(__m128i)(i), \
+                                    (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
+                                                         _mm_setzero_ps()), \
+                                    (s)); })
+
+#define _mm256_i64gather_ps(m, i, s) __extension__ ({ \
+  (__m128)__builtin_ia32_gatherq_ps256((__v4sf)_mm_undefined_ps(), \
+                                       (float const *)(m), \
+                                       (__v4di)(__m256i)(i), \
+                                       (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
+                                                            _mm_setzero_ps()), \
+                                       (s)); })
+
+#define _mm_i32gather_epi32(m, i, s) __extension__ ({ \
+  (__m128i)__builtin_ia32_gatherd_d((__v4si)_mm_undefined_si128(), \
+                                    (int const *)(m), (__v4si)(__m128i)(i), \
+                                    (__v4si)_mm_set1_epi32(-1), (s)); })
+
+#define _mm256_i32gather_epi32(m, i, s) __extension__ ({ \
+  (__m256i)__builtin_ia32_gatherd_d256((__v8si)_mm256_undefined_si256(), \
+                                       (int const *)(m), (__v8si)(__m256i)(i), \
+                                       (__v8si)_mm256_set1_epi32(-1), (s)); })
+
+#define _mm_i64gather_epi32(m, i, s) __extension__ ({ \
+  (__m128i)__builtin_ia32_gatherq_d((__v4si)_mm_undefined_si128(), \
+                                    (int const *)(m), (__v2di)(__m128i)(i), \
+                                    (__v4si)_mm_set1_epi32(-1), (s)); })
+
+#define _mm256_i64gather_epi32(m, i, s) __extension__ ({ \
+  (__m128i)__builtin_ia32_gatherq_d256((__v4si)_mm_undefined_si128(), \
+                                       (int const *)(m), (__v4di)(__m256i)(i), \
+                                       (__v4si)_mm_set1_epi32(-1), (s)); })
+
+#define _mm_i32gather_epi64(m, i, s) __extension__ ({ \
+  (__m128i)__builtin_ia32_gatherd_q((__v2di)_mm_undefined_si128(), \
+                                    (long long const *)(m), \
+                                    (__v4si)(__m128i)(i), \
+                                    (__v2di)_mm_set1_epi64x(-1), (s)); })
+
+#define _mm256_i32gather_epi64(m, i, s) __extension__ ({ \
+  (__m256i)__builtin_ia32_gatherd_q256((__v4di)_mm256_undefined_si256(), \
+                                       (long long const *)(m), \
+                                       (__v4si)(__m128i)(i), \
+                                       (__v4di)_mm256_set1_epi64x(-1), (s)); })
+
+#define _mm_i64gather_epi64(m, i, s) __extension__ ({ \
+  (__m128i)__builtin_ia32_gatherq_q((__v2di)_mm_undefined_si128(), \
+                                    (long long const *)(m), \
+                                    (__v2di)(__m128i)(i), \
+                                    (__v2di)_mm_set1_epi64x(-1), (s)); })
+
+#define _mm256_i64gather_epi64(m, i, s) __extension__ ({ \
+  (__m256i)__builtin_ia32_gatherq_q256((__v4di)_mm256_undefined_si256(), \
+                                       (long long const *)(m), \
+                                       (__v4di)(__m256i)(i), \
+                                       (__v4di)_mm256_set1_epi64x(-1), (s)); })
+
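+/*
+ * Usage sketch (illustrative comment, not part of the upstream header).
+ * A gather loads one element per lane from base + index * scale; the masked
+ * forms only touch lanes whose mask element has its most significant bit set:
+ *
+ *   __m256i idx = _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14);
+ *   __m256i v   = _mm256_i32gather_epi32(table, idx, 4);  // table[0], table[2], ...
+ */
+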
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __AVX2INTRIN_H */
diff --git a/24.0.3/clang-include/avx512bwintrin.h b/24.0.3/clang-include/avx512bwintrin.h
new file mode 100644
index 0000000..f289ed7
--- /dev/null
+++ b/24.0.3/clang-include/avx512bwintrin.h
@@ -0,0 +1,1542 @@
+/*===------------- avx512bwintrin.h - AVX512BW intrinsics ------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512bwintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512BWINTRIN_H
+#define __AVX512BWINTRIN_H
+
+typedef unsigned int __mmask32;
+typedef unsigned long long __mmask64;
+typedef char __v64qi __attribute__ ((__vector_size__ (64)));
+typedef short __v32hi __attribute__ ((__vector_size__ (64)));
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512bw")))
+
+static  __inline __v64qi __DEFAULT_FN_ATTRS
+_mm512_setzero_qi(void) {
+  return (__v64qi){ 0, 0, 0, 0, 0, 0, 0, 0,
+                       0, 0, 0, 0, 0, 0, 0, 0,
+                       0, 0, 0, 0, 0, 0, 0, 0,
+                       0, 0, 0, 0, 0, 0, 0, 0,
+                       0, 0, 0, 0, 0, 0, 0, 0,
+                       0, 0, 0, 0, 0, 0, 0, 0,
+                       0, 0, 0, 0, 0, 0, 0, 0,
+                       0, 0, 0, 0, 0, 0, 0, 0 };
+}
+
+static  __inline __v32hi __DEFAULT_FN_ATTRS
+_mm512_setzero_hi(void) {
+  return (__v32hi){ 0, 0, 0, 0, 0, 0, 0, 0,
+                       0, 0, 0, 0, 0, 0, 0, 0,
+                       0, 0, 0, 0, 0, 0, 0, 0,
+                       0, 0, 0, 0, 0, 0, 0, 0 };
+}
+
+/* Integer compare */
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_cmpeq_epi8_mask(__m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_pcmpeqb512_mask((__v64qi)__a, (__v64qi)__b,
+                                                   (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpeq_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_pcmpeqb512_mask((__v64qi)__a, (__v64qi)__b,
+                                                   __u);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_cmpeq_epu8_mask(__m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 0,
+                                                 (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpeq_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 0,
+                                                 __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_cmpeq_epi16_mask(__m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_pcmpeqw512_mask((__v32hi)__a, (__v32hi)__b,
+                                                   (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpeq_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_pcmpeqw512_mask((__v32hi)__a, (__v32hi)__b,
+                                                   __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_cmpeq_epu16_mask(__m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 0,
+                                                 (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpeq_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 0,
+                                                 __u);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_cmpge_epi8_mask(__m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 5,
+                                                (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpge_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 5,
+                                                __u);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_cmpge_epu8_mask(__m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 5,
+                                                 (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpge_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 5,
+                                                 __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_cmpge_epi16_mask(__m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 5,
+                                                (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpge_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 5,
+                                                __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_cmpge_epu16_mask(__m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 5,
+                                                 (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpge_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 5,
+                                                 __u);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_cmpgt_epi8_mask(__m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_pcmpgtb512_mask((__v64qi)__a, (__v64qi)__b,
+                                                   (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpgt_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_pcmpgtb512_mask((__v64qi)__a, (__v64qi)__b,
+                                                   __u);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_cmpgt_epu8_mask(__m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 6,
+                                                 (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpgt_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 6,
+                                                 __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_cmpgt_epi16_mask(__m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_pcmpgtw512_mask((__v32hi)__a, (__v32hi)__b,
+                                                   (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpgt_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_pcmpgtw512_mask((__v32hi)__a, (__v32hi)__b,
+                                                   __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_cmpgt_epu16_mask(__m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 6,
+                                                 (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpgt_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 6,
+                                                 __u);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_cmple_epi8_mask(__m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 2,
+                                                (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_mask_cmple_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 2,
+                                                __u);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_cmple_epu8_mask(__m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 2,
+                                                 (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_mask_cmple_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 2,
+                                                 __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_cmple_epi16_mask(__m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 2,
+                                                (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_mask_cmple_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 2,
+                                                __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_cmple_epu16_mask(__m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 2,
+                                                 (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_mask_cmple_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 2,
+                                                 __u);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_cmplt_epi8_mask(__m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 1,
+                                                (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_mask_cmplt_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 1,
+                                                __u);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_cmplt_epu8_mask(__m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 1,
+                                                 (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_mask_cmplt_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 1,
+                                                 __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_cmplt_epi16_mask(__m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 1,
+                                                (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_mask_cmplt_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 1,
+                                                __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_cmplt_epu16_mask(__m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 1,
+                                                 (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_mask_cmplt_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 1,
+                                                 __u);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_cmpneq_epi8_mask(__m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 4,
+                                                (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpneq_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 4,
+                                                __u);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_cmpneq_epu8_mask(__m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 4,
+                                                 (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpneq_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 4,
+                                                 __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_cmpneq_epi16_mask(__m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 4,
+                                                (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpneq_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 4,
+                                                __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_cmpneq_epu16_mask(__m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 4,
+                                                 (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpneq_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 4,
+                                                 __u);
+}
+
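+/*
+ * Illustrative usage sketch (editorial, not part of the upstream header):
+ * the compare intrinsics above return one mask bit per element, which can
+ * then drive the masked arithmetic intrinsics defined below.
+ *
+ *   __m512i  a  = _mm512_set1_epi8(3), b = _mm512_set1_epi8(5);
+ *   __mmask64 lt = _mm512_cmplt_epi8_mask(a, b);     // all 64 bits set here
+ *   __m512i  r  = _mm512_mask_add_epi8(a, lt, a, b); // a+b where lt, else a
+ */
+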
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_add_epi8 (__m512i __A, __m512i __B) {
+  return (__m512i) ((__v64qi) __A + (__v64qi) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_add_epi8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
+  return (__m512i) __builtin_ia32_paddb512_mask ((__v64qi) __A,
+             (__v64qi) __B,
+             (__v64qi) __W,
+             (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_add_epi8 (__mmask64 __U, __m512i __A, __m512i __B) {
+  return (__m512i) __builtin_ia32_paddb512_mask ((__v64qi) __A,
+             (__v64qi) __B,
+             (__v64qi) _mm512_setzero_qi(),
+             (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_sub_epi8 (__m512i __A, __m512i __B) {
+  return (__m512i) ((__v64qi) __A - (__v64qi) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_sub_epi8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
+  return (__m512i) __builtin_ia32_psubb512_mask ((__v64qi) __A,
+             (__v64qi) __B,
+             (__v64qi) __W,
+             (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_sub_epi8 (__mmask64 __U, __m512i __A, __m512i __B) {
+  return (__m512i) __builtin_ia32_psubb512_mask ((__v64qi) __A,
+             (__v64qi) __B,
+             (__v64qi) _mm512_setzero_qi(),
+             (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_add_epi16 (__m512i __A, __m512i __B) {
+  return (__m512i) ((__v32hi) __A + (__v32hi) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_add_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
+  return (__m512i) __builtin_ia32_paddw512_mask ((__v32hi) __A,
+             (__v32hi) __B,
+             (__v32hi) __W,
+             (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_add_epi16 (__mmask32 __U, __m512i __A, __m512i __B) {
+  return (__m512i) __builtin_ia32_paddw512_mask ((__v32hi) __A,
+             (__v32hi) __B,
+             (__v32hi) _mm512_setzero_hi(),
+             (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_sub_epi16 (__m512i __A, __m512i __B) {
+  return (__m512i) ((__v32hi) __A - (__v32hi) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_sub_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
+  return (__m512i) __builtin_ia32_psubw512_mask ((__v32hi) __A,
+             (__v32hi) __B,
+             (__v32hi) __W,
+             (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_sub_epi16 (__mmask32 __U, __m512i __A, __m512i __B) {
+  return (__m512i) __builtin_ia32_psubw512_mask ((__v32hi) __A,
+             (__v32hi) __B,
+             (__v32hi) _mm512_setzero_hi(),
+             (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mullo_epi16 (__m512i __A, __m512i __B) {
+  return (__m512i) ((__v32hi) __A * (__v32hi) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_mullo_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
+  return (__m512i) __builtin_ia32_pmullw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) __W,
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_mullo_epi16 (__mmask32 __U, __m512i __A, __m512i __B) {
+  return (__m512i) __builtin_ia32_pmullw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_blend_epi8 (__mmask64 __U, __m512i __A, __m512i __W)
+{
+  return (__m512i) __builtin_ia32_blendmb_512_mask ((__v64qi) __A,
+              (__v64qi) __W,
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_blend_epi16 (__mmask32 __U, __m512i __A, __m512i __W)
+{
+  return (__m512i) __builtin_ia32_blendmw_512_mask ((__v32hi) __A,
+              (__v32hi) __W,
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_abs_epi8 (__m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsb512_mask ((__v64qi) __A,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_abs_epi8 (__m512i __W, __mmask64 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsb512_mask ((__v64qi) __A,
+              (__v64qi) __W,
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_abs_epi8 (__mmask64 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsb512_mask ((__v64qi) __A,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_abs_epi16 (__m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsw512_mask ((__v32hi) __A,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_abs_epi16 (__m512i __W, __mmask32 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsw512_mask ((__v32hi) __A,
+              (__v32hi) __W,
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_abs_epi16 (__mmask32 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsw512_mask ((__v32hi) __A,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_packs_epi32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packssdw512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_packs_epi32 (__mmask32 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packssdw512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_packs_epi32 (__m512i __W, __mmask32 __M, __m512i __A,
+       __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packssdw512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v32hi) __W,
+              __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_packs_epi16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packsswb512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_packs_epi16 (__m512i __W, __mmask64 __M, __m512i __A,
+       __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packsswb512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v64qi) __W,
+              (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_packs_epi16 (__mmask64 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packsswb512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_packus_epi32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packusdw512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_packus_epi32 (__mmask32 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packusdw512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_packus_epi32 (__m512i __W, __mmask32 __M, __m512i __A,
+        __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packusdw512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v32hi) __W,
+              __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_packus_epi16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packuswb512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_packus_epi16 (__m512i __W, __mmask64 __M, __m512i __A,
+        __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packuswb512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v64qi) __W,
+              (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_packus_epi16 (__mmask64 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packuswb512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_adds_epi8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddsb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_adds_epi8 (__m512i __W, __mmask64 __U, __m512i __A,
+           __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddsb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) __W,
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_adds_epi8 (__mmask64 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddsb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_adds_epi16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddsw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_adds_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+      __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddsw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) __W,
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_adds_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddsw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_adds_epu8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddusb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_adds_epu8 (__m512i __W, __mmask64 __U, __m512i __A,
+           __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddusb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) __W,
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_adds_epu8 (__mmask64 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddusb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_adds_epu16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddusw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_adds_epu16 (__m512i __W, __mmask32 __U, __m512i __A,
+      __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddusw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) __W,
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_adds_epu16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddusw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_avg_epu8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pavgb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_avg_epu8 (__m512i __W, __mmask64 __U, __m512i __A,
+          __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pavgb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) __W,
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_avg_epu8 (__mmask64 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pavgb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_avg_epu16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pavgw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_avg_epu16 (__m512i __W, __mmask32 __U, __m512i __A,
+           __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pavgw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) __W,
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_avg_epu16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pavgw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_max_epi8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_max_epi8 (__mmask64 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_max_epi8 (__m512i __W, __mmask64 __M, __m512i __A,
+          __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) __W,
+              (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_max_epi16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_max_epi16 (__mmask32 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_max_epi16 (__m512i __W, __mmask32 __M, __m512i __A,
+           __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) __W,
+              (__mmask32) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_max_epu8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxub512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_max_epu8 (__mmask64 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxub512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_max_epu8 (__m512i __W, __mmask64 __M, __m512i __A,
+          __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxub512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) __W,
+              (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_max_epu16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxuw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_max_epu16 (__mmask32 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxuw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_max_epu16 (__m512i __W, __mmask32 __M, __m512i __A,
+           __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxuw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) __W,
+              (__mmask32) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_min_epi8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_min_epi8 (__mmask64 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_min_epi8 (__m512i __W, __mmask64 __M, __m512i __A,
+          __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) __W,
+              (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_min_epi16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_min_epi16 (__mmask32 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_min_epi16 (__m512i __W, __mmask32 __M, __m512i __A,
+           __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) __W,
+              (__mmask32) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_min_epu8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminub512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_min_epu8 (__mmask64 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminub512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_min_epu8 (__m512i __W, __mmask64 __M, __m512i __A,
+          __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminub512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) __W,
+              (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_min_epu16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminuw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_min_epu16 (__mmask32 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminuw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_min_epu16 (__m512i __W, __mmask32 __M, __m512i __A,
+           __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminuw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) __W,
+              (__mmask32) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_shuffle_epi8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pshufb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_shuffle_epi8 (__m512i __W, __mmask64 __U, __m512i __A,
+        __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pshufb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) __W,
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_shuffle_epi8 (__mmask64 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pshufb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_subs_epi8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubsb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_subs_epi8 (__m512i __W, __mmask64 __U, __m512i __A,
+           __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubsb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) __W,
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_subs_epi8 (__mmask64 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubsb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_subs_epi16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubsw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_subs_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+      __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubsw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) __W,
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_subs_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubsw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_subs_epu8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubusb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_subs_epu8 (__m512i __W, __mmask64 __U, __m512i __A,
+           __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubusb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) __W,
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_subs_epu8 (__mmask64 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubusb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_subs_epu16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubusw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_subs_epu16 (__m512i __W, __mmask32 __U, __m512i __A,
+      __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubusw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) __W,
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_subs_epu16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubusw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask2_permutex2var_epi16 (__m512i __A, __m512i __I,
+         __mmask32 __U, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermi2varhi512_mask ((__v32hi) __A,
+              (__v32hi) __I /* idx */ ,
+              (__v32hi) __B,
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_permutex2var_epi16 (__m512i __A, __m512i __I, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermt2varhi512_mask ((__v32hi) __I /* idx */,
+              (__v32hi) __A,
+              (__v32hi) __B,
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_permutex2var_epi16 (__m512i __A, __mmask32 __U,
+        __m512i __I, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermt2varhi512_mask ((__v32hi) __I /* idx */,
+              (__v32hi) __A,
+              (__v32hi) __B,
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_permutex2var_epi16 (__mmask32 __U, __m512i __A,
+         __m512i __I, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermt2varhi512_maskz ((__v32hi) __I
+              /* idx */ ,
+              (__v32hi) __A,
+              (__v32hi) __B,
+              (__mmask32) __U);
+}
+
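+/*
+ * Illustrative note (editorial, not part of the upstream header): the
+ * permutex2var intrinsics above select 16-bit elements from the
+ * concatenation of the two source vectors.  Index bits [4:0] pick the lane
+ * and bit 5 picks the source, so for example:
+ *
+ *   __m512i idx = _mm512_set1_epi16(32);                // lane 0 of __B
+ *   __m512i r   = _mm512_permutex2var_epi16(a, idx, b);
+ */
+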
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mulhrs_epi16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmulhrsw512_mask ((__v32hi) __A,
+                (__v32hi) __B,
+                (__v32hi) _mm512_setzero_hi(),
+                (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_mulhrs_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+        __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmulhrsw512_mask ((__v32hi) __A,
+                (__v32hi) __B,
+                (__v32hi) __W,
+                (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_mulhrs_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmulhrsw512_mask ((__v32hi) __A,
+                (__v32hi) __B,
+                (__v32hi) _mm512_setzero_hi(),
+                (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mulhi_epi16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmulhw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_mulhi_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+       __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmulhw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) __W,
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_mulhi_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmulhw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mulhi_epu16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmulhuw512_mask ((__v32hi) __A,
+               (__v32hi) __B,
+               (__v32hi) _mm512_setzero_hi(),
+               (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_mulhi_epu16 (__m512i __W, __mmask32 __U, __m512i __A,
+       __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmulhuw512_mask ((__v32hi) __A,
+               (__v32hi) __B,
+               (__v32hi) __W,
+               (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_mulhi_epu16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmulhuw512_mask ((__v32hi) __A,
+               (__v32hi) __B,
+               (__v32hi) _mm512_setzero_hi(),
+               (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maddubs_epi16 (__m512i __X, __m512i __Y) {
+  return (__m512i) __builtin_ia32_pmaddubsw512_mask ((__v64qi) __X,
+                 (__v64qi) __Y,
+                 (__v32hi) _mm512_setzero_hi(),
+                 (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_maddubs_epi16 (__m512i __W, __mmask32 __U, __m512i __X,
+         __m512i __Y) {
+  return (__m512i) __builtin_ia32_pmaddubsw512_mask ((__v64qi) __X,
+                 (__v64qi) __Y,
+                 (__v32hi) __W,
+                 (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_maddubs_epi16 (__mmask32 __U, __m512i __X, __m512i __Y) {
+  return (__m512i) __builtin_ia32_pmaddubsw512_mask ((__v64qi) __X,
+                 (__v64qi) __Y,
+                 (__v32hi) _mm512_setzero_hi(),
+                 (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_madd_epi16 (__m512i __A, __m512i __B) {
+  return (__m512i) __builtin_ia32_pmaddwd512_mask ((__v32hi) __A,
+               (__v32hi) __B,
+               (__v16si) _mm512_setzero_si512(),
+               (__mmask16) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_madd_epi16 (__m512i __W, __mmask16 __U, __m512i __A,
+      __m512i __B) {
+  return (__m512i) __builtin_ia32_pmaddwd512_mask ((__v32hi) __A,
+               (__v32hi) __B,
+               (__v16si) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_madd_epi16 (__mmask16 __U, __m512i __A, __m512i __B) {
+  return (__m512i) __builtin_ia32_pmaddwd512_mask ((__v32hi) __A,
+               (__v32hi) __B,
+               (__v16si) _mm512_setzero_si512(),
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_cvtsepi16_epi8 (__m512i __A) {
+  return (__m256i) __builtin_ia32_pmovswb512_mask ((__v32hi) __A,
+               (__v32qi)_mm256_setzero_si256(),
+               (__mmask32) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtsepi16_epi8 (__m256i __O, __mmask32 __M, __m512i __A) {
+  return (__m256i) __builtin_ia32_pmovswb512_mask ((__v32hi) __A,
+               (__v32qi)__O,
+               __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtsepi16_epi8 (__mmask32 __M, __m512i __A) {
+  return (__m256i) __builtin_ia32_pmovswb512_mask ((__v32hi) __A,
+               (__v32qi) _mm256_setzero_si256(),
+               __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_cvtusepi16_epi8 (__m512i __A) {
+  return (__m256i) __builtin_ia32_pmovuswb512_mask ((__v32hi) __A,
+                (__v32qi) _mm256_setzero_si256(),
+                (__mmask32) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtusepi16_epi8 (__m256i __O, __mmask32 __M, __m512i __A) {
+  return (__m256i) __builtin_ia32_pmovuswb512_mask ((__v32hi) __A,
+                (__v32qi) __O,
+                __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtusepi16_epi8 (__mmask32 __M, __m512i __A) {
+  return (__m256i) __builtin_ia32_pmovuswb512_mask ((__v32hi) __A,
+                (__v32qi) _mm256_setzero_si256(),
+                __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_cvtepi16_epi8 (__m512i __A) {
+  return (__m256i) __builtin_ia32_pmovwb512_mask ((__v32hi) __A,
+              (__v32qi) _mm256_setzero_si256(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi16_epi8 (__m256i __O, __mmask32 __M, __m512i __A) {
+  return (__m256i) __builtin_ia32_pmovwb512_mask ((__v32hi) __A,
+              (__v32qi) __O,
+              __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepi16_epi8 (__mmask32 __M, __m512i __A) {
+  return (__m256i) __builtin_ia32_pmovwb512_mask ((__v32hi) __A,
+              (__v32qi) _mm256_setzero_si256(),
+              __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_unpackhi_epi8 (__m512i __A, __m512i __B) {
+  return (__m512i) __builtin_ia32_punpckhbw512_mask ((__v64qi) __A,
+                 (__v64qi) __B,
+                 (__v64qi) _mm512_setzero_qi(),
+                 (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_unpackhi_epi8 (__m512i __W, __mmask64 __U, __m512i __A,
+         __m512i __B) {
+  return (__m512i) __builtin_ia32_punpckhbw512_mask ((__v64qi) __A,
+                 (__v64qi) __B,
+                 (__v64qi) __W,
+                 (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_unpackhi_epi8 (__mmask64 __U, __m512i __A, __m512i __B) {
+  return (__m512i) __builtin_ia32_punpckhbw512_mask ((__v64qi) __A,
+                 (__v64qi) __B,
+                 (__v64qi) _mm512_setzero_qi(),
+                 (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_unpackhi_epi16 (__m512i __A, __m512i __B) {
+  return (__m512i) __builtin_ia32_punpckhwd512_mask ((__v32hi) __A,
+                 (__v32hi) __B,
+                 (__v32hi) _mm512_setzero_hi(),
+                 (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_unpackhi_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+          __m512i __B) {
+  return (__m512i) __builtin_ia32_punpckhwd512_mask ((__v32hi) __A,
+                 (__v32hi) __B,
+                 (__v32hi) __W,
+                 (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_unpackhi_epi16 (__mmask32 __U, __m512i __A, __m512i __B) {
+  return (__m512i) __builtin_ia32_punpckhwd512_mask ((__v32hi) __A,
+                 (__v32hi) __B,
+                 (__v32hi) _mm512_setzero_hi(),
+                 (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_unpacklo_epi8 (__m512i __A, __m512i __B) {
+  return (__m512i) __builtin_ia32_punpcklbw512_mask ((__v64qi) __A,
+                 (__v64qi) __B,
+                 (__v64qi) _mm512_setzero_qi(),
+                 (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_unpacklo_epi8 (__m512i __W, __mmask64 __U, __m512i __A,
+         __m512i __B) {
+  return (__m512i) __builtin_ia32_punpcklbw512_mask ((__v64qi) __A,
+                 (__v64qi) __B,
+                 (__v64qi) __W,
+                 (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_unpacklo_epi8 (__mmask64 __U, __m512i __A, __m512i __B) {
+  return (__m512i) __builtin_ia32_punpcklbw512_mask ((__v64qi) __A,
+                 (__v64qi) __B,
+                 (__v64qi) _mm512_setzero_qi(),
+                 (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_unpacklo_epi16 (__m512i __A, __m512i __B) {
+  return (__m512i) __builtin_ia32_punpcklwd512_mask ((__v32hi) __A,
+                 (__v32hi) __B,
+                 (__v32hi) _mm512_setzero_hi(),
+                 (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_unpacklo_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+          __m512i __B) {
+  return (__m512i) __builtin_ia32_punpcklwd512_mask ((__v32hi) __A,
+                 (__v32hi) __B,
+                 (__v32hi) __W,
+                 (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_unpacklo_epi16 (__mmask32 __U, __m512i __A, __m512i __B) {
+  return (__m512i) __builtin_ia32_punpcklwd512_mask ((__v32hi) __A,
+                 (__v32hi) __B,
+                 (__v32hi) _mm512_setzero_hi(),
+                 (__mmask32) __U);
+}
+
+#define _mm512_cmp_epi8_mask(a, b, p) __extension__ ({ \
+  (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \
+                                         (__v64qi)(__m512i)(b), \
+                                         (p), (__mmask64)-1); })
+
+#define _mm512_mask_cmp_epi8_mask(m, a, b, p) __extension__ ({ \
+  (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \
+                                         (__v64qi)(__m512i)(b), \
+                                         (p), (__mmask64)(m)); })
+
+#define _mm512_cmp_epu8_mask(a, b, p) __extension__ ({ \
+  (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \
+                                          (__v64qi)(__m512i)(b), \
+                                          (p), (__mmask64)-1); })
+
+#define _mm512_mask_cmp_epu8_mask(m, a, b, p) __extension__ ({ \
+  (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \
+                                          (__v64qi)(__m512i)(b), \
+                                          (p), (__mmask64)(m)); })
+
+#define _mm512_cmp_epi16_mask(a, b, p) __extension__ ({ \
+  (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \
+                                         (__v32hi)(__m512i)(b), \
+                                         (p), (__mmask32)-1); })
+
+#define _mm512_mask_cmp_epi16_mask(m, a, b, p) __extension__ ({ \
+  (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \
+                                         (__v32hi)(__m512i)(b), \
+                                         (p), (__mmask32)(m)); })
+
+#define _mm512_cmp_epu16_mask(a, b, p) __extension__ ({ \
+  (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \
+                                          (__v32hi)(__m512i)(b), \
+                                          (p), (__mmask32)-1); })
+
+#define _mm512_mask_cmp_epu16_mask(m, a, b, p) __extension__ ({ \
+  (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \
+                                          (__v32hi)(__m512i)(b), \
+                                          (p), (__mmask32)(m)); })
+
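+/*
+ * Note (editorial, not part of the upstream header): the predicate `p`
+ * passed to the _mm512_cmp_* macros above uses the same encoding as the
+ * immediates used by the named compare intrinsics earlier in this file:
+ * 0 = EQ, 1 = LT, 2 = LE, 4 = NE, 5 = NLT (>=), 6 = NLE (>).  For example,
+ * _mm512_cmp_epi8_mask(a, b, 1) behaves like _mm512_cmplt_epi8_mask(a, b).
+ */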
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
diff --git a/24.0.3/clang-include/avx512cdintrin.h b/24.0.3/clang-include/avx512cdintrin.h
new file mode 100644
index 0000000..3894b29
--- /dev/null
+++ b/24.0.3/clang-include/avx512cdintrin.h
@@ -0,0 +1,131 @@
+/*===------------- avx512cdintrin.h - AVX512CD intrinsics ------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512cdintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512CDINTRIN_H
+#define __AVX512CDINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512cd")))
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_conflict_epi64 (__m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A,
+                 (__v8di) _mm512_setzero_si512 (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_conflict_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A,
+               (__v8di) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_conflict_epi64 (__mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A,
+                 (__v8di) _mm512_setzero_si512 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_conflict_epi32 (__m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A,
+                 (__v16si) _mm512_setzero_si512 (),
+                 (__mmask16) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_conflict_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A,
+               (__v16si) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_conflict_epi32 (__mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A,
+                 (__v16si) _mm512_setzero_si512 (),
+                 (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_lzcnt_epi32 (__m512i __A)
+{
+  return (__m512i) __builtin_ia32_vplzcntd_512_mask ((__v16si) __A,
+             (__v16si) _mm512_setzero_si512 (),
+             (__mmask16) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_lzcnt_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vplzcntd_512_mask ((__v16si) __A,
+                 (__v16si) __W,
+                 (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_lzcnt_epi32 (__mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vplzcntd_512_mask ((__v16si) __A,
+             (__v16si) _mm512_setzero_si512 (),
+             (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_lzcnt_epi64 (__m512i __A)
+{
+  return (__m512i) __builtin_ia32_vplzcntq_512_mask ((__v8di) __A,
+             (__v8di) _mm512_setzero_si512 (),
+             (__mmask8) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_lzcnt_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vplzcntq_512_mask ((__v8di) __A,
+                 (__v8di) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_lzcnt_epi64 (__mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vplzcntq_512_mask ((__v8di) __A,
+             (__v8di) _mm512_setzero_si512 (),
+             (__mmask8) __U);
+}
+#undef __DEFAULT_FN_ATTRS
+
+#endif
diff --git a/24.0.3/clang-include/avx512dqintrin.h b/24.0.3/clang-include/avx512dqintrin.h
new file mode 100644
index 0000000..afee490
--- /dev/null
+++ b/24.0.3/clang-include/avx512dqintrin.h
@@ -0,0 +1,778 @@
+/*===---- avx512dqintrin.h - AVX512DQ intrinsics ---------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512dqintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512DQINTRIN_H
+#define __AVX512DQINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512dq")))
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mullo_epi64 (__m512i __A, __m512i __B) {
+  return (__m512i) ((__v8di) __A * (__v8di) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_mullo_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) {
+  return (__m512i) __builtin_ia32_pmullq512_mask ((__v8di) __A,
+              (__v8di) __B,
+              (__v8di) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_mullo_epi64 (__mmask8 __U, __m512i __A, __m512i __B) {
+  return (__m512i) __builtin_ia32_pmullq512_mask ((__v8di) __A,
+              (__v8di) __B,
+              (__v8di)
+              _mm512_setzero_si512 (),
+              (__mmask8) __U);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_xor_pd (__m512d __A, __m512d __B) {
+  return (__m512d) ((__v8di) __A ^ (__v8di) __B);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_xor_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d) __builtin_ia32_xorpd512_mask ((__v8df) __A,
+             (__v8df) __B,
+             (__v8df) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_xor_pd (__mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d) __builtin_ia32_xorpd512_mask ((__v8df) __A,
+             (__v8df) __B,
+             (__v8df)
+             _mm512_setzero_pd (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_xor_ps (__m512 __A, __m512 __B) {
+  return (__m512) ((__v16si) __A ^ (__v16si) __B);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_xor_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512) __builtin_ia32_xorps512_mask ((__v16sf) __A,
+            (__v16sf) __B,
+            (__v16sf) __W,
+            (__mmask16) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_xor_ps (__mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512) __builtin_ia32_xorps512_mask ((__v16sf) __A,
+            (__v16sf) __B,
+            (__v16sf)
+            _mm512_setzero_ps (),
+            (__mmask16) __U);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_or_pd (__m512d __A, __m512d __B) {
+  return (__m512d) ((__v8di) __A | (__v8di) __B);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_or_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d) __builtin_ia32_orpd512_mask ((__v8df) __A,
+            (__v8df) __B,
+            (__v8df) __W,
+            (__mmask8) __U);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_or_pd (__mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d) __builtin_ia32_orpd512_mask ((__v8df) __A,
+            (__v8df) __B,
+            (__v8df)
+            _mm512_setzero_pd (),
+            (__mmask8) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_or_ps (__m512 __A, __m512 __B) {
+  return (__m512) ((__v16si) __A | (__v16si) __B);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_or_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512) __builtin_ia32_orps512_mask ((__v16sf) __A,
+                 (__v16sf) __B,
+                 (__v16sf) __W,
+                 (__mmask16) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_or_ps (__mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512) __builtin_ia32_orps512_mask ((__v16sf) __A,
+                 (__v16sf) __B,
+                 (__v16sf)
+                 _mm512_setzero_ps (),
+                 (__mmask16) __U);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_and_pd (__m512d __A, __m512d __B) {
+  return (__m512d) ((__v8di) __A & (__v8di) __B);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_and_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d) __builtin_ia32_andpd512_mask ((__v8df) __A,
+             (__v8df) __B,
+             (__v8df) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_and_pd (__mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d) __builtin_ia32_andpd512_mask ((__v8df) __A,
+             (__v8df) __B,
+             (__v8df)
+             _mm512_setzero_pd (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_and_ps (__m512 __A, __m512 __B) {
+  return (__m512) ((__v16si) __A & (__v16si) __B);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_and_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512) __builtin_ia32_andps512_mask ((__v16sf) __A,
+            (__v16sf) __B,
+            (__v16sf) __W,
+            (__mmask16) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_and_ps (__mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512) __builtin_ia32_andps512_mask ((__v16sf) __A,
+            (__v16sf) __B,
+            (__v16sf)
+            _mm512_setzero_ps (),
+            (__mmask16) __U);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_andnot_pd (__m512d __A, __m512d __B) {
+  return (__m512d) __builtin_ia32_andnpd512_mask ((__v8df) __A,
+              (__v8df) __B,
+              (__v8df)
+              _mm512_setzero_pd (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_andnot_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d) __builtin_ia32_andnpd512_mask ((__v8df) __A,
+              (__v8df) __B,
+              (__v8df) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_andnot_pd (__mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d) __builtin_ia32_andnpd512_mask ((__v8df) __A,
+              (__v8df) __B,
+              (__v8df)
+              _mm512_setzero_pd (),
+              (__mmask8) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_andnot_ps (__m512 __A, __m512 __B) {
+  return (__m512) __builtin_ia32_andnps512_mask ((__v16sf) __A,
+             (__v16sf) __B,
+             (__v16sf)
+             _mm512_setzero_ps (),
+             (__mmask16) -1);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_andnot_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512) __builtin_ia32_andnps512_mask ((__v16sf) __A,
+             (__v16sf) __B,
+             (__v16sf) __W,
+             (__mmask16) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_andnot_ps (__mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512) __builtin_ia32_andnps512_mask ((__v16sf) __A,
+             (__v16sf) __B,
+             (__v16sf)
+             _mm512_setzero_ps (),
+             (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvtpd_epi64 (__m512d __A) {
+  return (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A,
+                (__v8di) _mm512_setzero_si512(),
+                (__mmask8) -1,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtpd_epi64 (__m512i __W, __mmask8 __U, __m512d __A) {
+  return (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A,
+                (__v8di) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtpd_epi64 (__mmask8 __U, __m512d __A) {
+  return (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A,
+                (__v8di) _mm512_setzero_si512(),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundpd_epi64(__A, __R) __extension__ ({              \
+  (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A,               \
+                (__v8di) _mm512_setzero_si512(), (__mmask8) -1, __R);})
+
+#define _mm512_mask_cvt_roundpd_epi64(__W, __U, __A, __R) __extension__ ({ \
+  (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A,                 \
+                (__v8di) __W, (__mmask8) __U, __R);})
+
+#define _mm512_maskz_cvt_roundpd_epi64(__U, __A, __R) __extension__ ({   \
+  (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A,        \
+                (__v8di) _mm512_setzero_si512(), (__mmask8) __U, __R); })
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvtpd_epu64 (__m512d __A) {
+  return (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A,
+                 (__v8di) _mm512_setzero_si512(),
+                 (__mmask8) -1,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtpd_epu64 (__m512i __W, __mmask8 __U, __m512d __A) {
+  return (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A,
+                 (__v8di) __W,
+                 (__mmask8) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtpd_epu64 (__mmask8 __U, __m512d __A) {
+  return (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A,
+                 (__v8di) _mm512_setzero_si512(),
+                 (__mmask8) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundpd_epu64(__A, __R) __extension__ ({               \
+  (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A,               \
+                 (__v8di) _mm512_setzero_si512(), (__mmask8) -1, __R);})
+
+#define _mm512_mask_cvt_roundpd_epu64(__W, __U, __A, __R) __extension__ ({ \
+  (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A,                \
+                 (__v8di) __W, (__mmask8) __U, __R);})
+
+#define _mm512_maskz_cvt_roundpd_epu64(__U, __A, __R) __extension__ ({     \
+  (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A,                \
+                 (__v8di) _mm512_setzero_si512(), (__mmask8) __U, __R);})
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvtps_epi64 (__m256 __A) {
+  return (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A,
+                (__v8di) _mm512_setzero_si512(),
+                (__mmask8) -1,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtps_epi64 (__m512i __W, __mmask8 __U, __m256 __A) {
+  return (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A,
+                (__v8di) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtps_epi64 (__mmask8 __U, __m256 __A) {
+  return (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A,
+                (__v8di) _mm512_setzero_si512(),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundps_epi64(__A, __R) __extension__ ({             \
+  (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A,              \
+                (__v8di) _mm512_setzero_si512(), (__mmask8) -1, __R);})
+
+#define _mm512_mask_cvt_roundps_epi64(__W, __U, __A, __R) __extension__ ({ \
+  (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A,                 \
+                (__v8di) __W, (__mmask8) __U, __R);})
+
+#define _mm512_maskz_cvt_roundps_epi64(__U, __A, __R) __extension__ ({   \
+  (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A,               \
+                (__v8di) _mm512_setzero_si512(), (__mmask8) __U, __R);})
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvtps_epu64 (__m256 __A) {
+  return (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A,
+                 (__v8di) _mm512_setzero_si512(),
+                 (__mmask8) -1,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtps_epu64 (__m512i __W, __mmask8 __U, __m256 __A) {
+  return (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A,
+                 (__v8di) __W,
+                 (__mmask8) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtps_epu64 (__mmask8 __U, __m256 __A) {
+  return (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A,
+                 (__v8di) _mm512_setzero_si512(),
+                 (__mmask8) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundps_epu64(__A, __R) __extension__ ({              \
+  (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A,              \
+                 (__v8di) _mm512_setzero_si512(), (__mmask8) -1, __R);})
+
+#define _mm512_mask_cvt_roundps_epu64(__W, __U, __A, __R) __extension__ ({ \
+  (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A,                \
+                 (__v8di) __W, (__mmask8) __U, __R);})
+
+#define _mm512_maskz_cvt_roundps_epu64(__U, __A, __R) __extension__ ({   \
+  (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A,              \
+                 (__v8di) _mm512_setzero_si512(), (__mmask8) __U, __R);})
+
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_cvtepi64_pd (__m512i __A) {
+  return (__m512d) __builtin_ia32_cvtqq2pd512_mask ((__v8di) __A,
+                (__v8df) _mm512_setzero_pd(),
+                (__mmask8) -1,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi64_pd (__m512d __W, __mmask8 __U, __m512i __A) {
+  return (__m512d) __builtin_ia32_cvtqq2pd512_mask ((__v8di) __A,
+                (__v8df) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepi64_pd (__mmask8 __U, __m512i __A) {
+  return (__m512d) __builtin_ia32_cvtqq2pd512_mask ((__v8di) __A,
+                (__v8df) _mm512_setzero_pd(),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundepi64_pd(__A, __R) __extension__ ({          \
+  (__m512d) __builtin_ia32_cvtqq2pd512_mask ((__v8di) __A,           \
+                (__v8df) _mm512_setzero_pd(), (__mmask8) -1, __R);})
+
+#define _mm512_mask_cvt_roundepi64_pd(__W, __U, __A, __R) __extension__ ({ \
+  (__m512d) __builtin_ia32_cvtqq2pd512_mask ((__v8di) __A,                 \
+                (__v8df) __W, (__mmask8) __U, __R);})
+
+#define _mm512_maskz_cvt_roundepi64_pd(__U, __A, __R) __extension__ ({ \
+  (__m512d) __builtin_ia32_cvtqq2pd512_mask ((__v8di) __A,             \
+                (__v8df) _mm512_setzero_pd(), (__mmask8) __U, __R);})
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm512_cvtepi64_ps (__m512i __A) {
+  return (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A,
+               (__v8sf) _mm256_setzero_ps(),
+               (__mmask8) -1,
+               _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi64_ps (__m256 __W, __mmask8 __U, __m512i __A) {
+  return (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A,
+               (__v8sf) __W,
+               (__mmask8) __U,
+               _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepi64_ps (__mmask8 __U, __m512i __A) {
+  return (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A,
+               (__v8sf) _mm256_setzero_ps(),
+               (__mmask8) __U,
+               _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundepi64_ps(__A, __R) __extension__ ({        \
+  (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A,          \
+               (__v8sf) _mm256_setzero_ps(), (__mmask8) -1, __R);})
+
+#define _mm512_mask_cvt_roundepi64_ps(__W, __U, __A, __R) __extension__ ({ \
+  (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A,                  \
+               (__v8sf) __W, (__mmask8) __U, __R);})
+
+#define _mm512_maskz_cvt_roundepi64_ps(__U, __A, __R) __extension__ ({ \
+  (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A,              \
+               (__v8sf) _mm256_setzero_ps(), (__mmask8) __U, __R);})
+
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvttpd_epi64 (__m512d __A) {
+  return (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A,
+                 (__v8di) _mm512_setzero_si512(),
+                 (__mmask8) -1,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvttpd_epi64 (__m512i __W, __mmask8 __U, __m512d __A) {
+  return (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A,
+                 (__v8di) __W,
+                 (__mmask8) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvttpd_epi64 (__mmask8 __U, __m512d __A) {
+  return (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A,
+                 (__v8di) _mm512_setzero_si512(),
+                 (__mmask8) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvtt_roundpd_epi64(__A, __R) __extension__ ({             \
+  (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A,              \
+                 (__v8di) _mm512_setzero_si512(), (__mmask8) -1, __R);})
+
+#define _mm512_mask_cvtt_roundpd_epi64(__W, __U, __A, __R) __extension__ ({ \
+  (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A,                 \
+                 (__v8di) __W, (__mmask8) __U, __R);})
+
+#define _mm512_maskz_cvtt_roundpd_epi64(__U, __A, __R) __extension__ ({ \
+  (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A,             \
+                 (__v8di) _mm512_setzero_si512(), (__mmask8) __U, __R);})
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvttpd_epu64 (__m512d __A) {
+  return (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A,
+                  (__v8di) _mm512_setzero_si512(),
+                  (__mmask8) -1,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvttpd_epu64 (__m512i __W, __mmask8 __U, __m512d __A) {
+  return (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A,
+                  (__v8di) __W,
+                  (__mmask8) __U,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvttpd_epu64 (__mmask8 __U, __m512d __A) {
+  return (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A,
+                  (__v8di) _mm512_setzero_si512(),
+                  (__mmask8) __U,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvtt_roundpd_epu64(__A, __R) __extension__ ({              \
+  (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A,              \
+                  (__v8di) _mm512_setzero_si512(), (__mmask8) -1, __R);})
+
+#define _mm512_mask_cvtt_roundpd_epu64(__W, __U, __A, __R) __extension__ ({ \
+  (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A,                \
+                  (__v8di) __W, (__mmask8) __U, __R);})
+
+#define _mm512_maskz_cvtt_roundpd_epu64(__U, __A, __R) __extension__ ({   \
+  (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A,              \
+                  (__v8di) _mm512_setzero_si512(), (__mmask8) __U, __R);})
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvttps_epi64 (__m256 __A) {
+  return (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A,
+                 (__v8di) _mm512_setzero_si512(),
+                 (__mmask8) -1,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvttps_epi64 (__m512i __W, __mmask8 __U, __m256 __A) {
+  return (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A,
+                 (__v8di) __W,
+                 (__mmask8) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvttps_epi64 (__mmask8 __U, __m256 __A) {
+  return (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A,
+                 (__v8di) _mm512_setzero_si512(),
+                 (__mmask8) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvtt_roundps_epi64(__A, __R) __extension__ ({            \
+  (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A,             \
+                 (__v8di) _mm512_setzero_si512(), (__mmask8) -1, __R);})
+
+#define _mm512_mask_cvtt_roundps_epi64(__W, __U, __A, __R) __extension__ ({ \
+  (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A,                 \
+                 (__v8di) __W, (__mmask8) __U, __R);})
+
+#define _mm512_maskz_cvtt_roundps_epi64(__U, __A, __R) __extension__ ({  \
+  (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A,              \
+                 (__v8di) _mm512_setzero_si512(), (__mmask8) __U, __R);})
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvttps_epu64 (__m256 __A) {
+  return (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A,
+                  (__v8di) _mm512_setzero_si512(),
+                  (__mmask8) -1,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvttps_epu64 (__m512i __W, __mmask8 __U, __m256 __A) {
+  return (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A,
+                  (__v8di) __W,
+                  (__mmask8) __U,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvttps_epu64 (__mmask8 __U, __m256 __A) {
+  return (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A,
+                  (__v8di) _mm512_setzero_si512(),
+                  (__mmask8) __U,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvtt_roundps_epu64(__A, __R) __extension__ ({            \
+  (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A,            \
+                  (__v8di) _mm512_setzero_si512(),(__mmask8) -1, __R);})
+
+#define _mm512_mask_cvtt_roundps_epu64(__W, __U, __A, __R) __extension__ ({ \
+  (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A,                \
+                  (__v8di) __W, (__mmask8) __U, __R);})
+
+#define _mm512_maskz_cvtt_roundps_epu64(__U, __A, __R) __extension__ ({  \
+  (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A,             \
+                  (__v8di) _mm512_setzero_si512(), (__mmask8) __U, __R);})
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_cvtepu64_pd (__m512i __A) {
+  return (__m512d) __builtin_ia32_cvtuqq2pd512_mask ((__v8di) __A,
+                 (__v8df) _mm512_setzero_pd(),
+                 (__mmask8) -1,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepu64_pd (__m512d __W, __mmask8 __U, __m512i __A) {
+  return (__m512d) __builtin_ia32_cvtuqq2pd512_mask ((__v8di) __A,
+                 (__v8df) __W,
+                 (__mmask8) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepu64_pd (__mmask8 __U, __m512i __A) {
+  return (__m512d) __builtin_ia32_cvtuqq2pd512_mask ((__v8di) __A,
+                 (__v8df) _mm512_setzero_pd(),
+                 (__mmask8) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundepu64_pd(__A, __R) __extension__ ({          \
+  (__m512d) __builtin_ia32_cvtuqq2pd512_mask ((__v8di) __A,          \
+                 (__v8df) _mm512_setzero_pd(), (__mmask8) -1, __R);})
+
+#define _mm512_mask_cvt_roundepu64_pd(__W, __U, __A, __R) __extension__ ({ \
+  (__m512d) __builtin_ia32_cvtuqq2pd512_mask ((__v8di) __A,                \
+                 (__v8df) __W, (__mmask8) __U, __R);})
+
+
+#define _mm512_maskz_cvt_roundepu64_pd(__U, __A, __R) __extension__ ({ \
+  (__m512d) __builtin_ia32_cvtuqq2pd512_mask ((__v8di) __A,            \
+                 (__v8df) _mm512_setzero_pd(), (__mmask8) __U, __R);})
+
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm512_cvtepu64_ps (__m512i __A) {
+  return (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A,
+                (__v8sf) _mm256_setzero_ps(),
+                (__mmask8) -1,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepu64_ps (__m256 __W, __mmask8 __U, __m512i __A) {
+  return (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A,
+                (__v8sf) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepu64_ps (__mmask8 __U, __m512i __A) {
+  return (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A,
+                (__v8sf) _mm256_setzero_ps(),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundepu64_ps(__A, __R) __extension__ ({         \
+  (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A,          \
+                (__v8sf) _mm256_setzero_ps(), (__mmask8) -1, __R);})
+
+#define _mm512_mask_cvt_roundepu64_ps(__W, __U, __A, __R) __extension__ ({ \
+  (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A,                 \
+                (__v8sf) __W, (__mmask8) __U, __R);})
+
+#define _mm512_maskz_cvt_roundepu64_ps(__U, __A, __R) __extension__ ({ \
+  (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A,             \
+                (__v8sf) _mm256_setzero_ps(), (__mmask8) __U, __R);})
+
+#define _mm512_range_pd(__A, __B, __C) __extension__ ({                     \
+  (__m512d) __builtin_ia32_rangepd512_mask ((__v8df) __A, (__v8df) __B, __C,\
+               (__v8df) _mm512_setzero_pd(), (__mmask8) -1,                 \
+               _MM_FROUND_CUR_DIRECTION);})
+
+#define _mm512_mask_range_pd(__W, __U, __A, __B, __C) __extension__ ({      \
+  (__m512d) __builtin_ia32_rangepd512_mask ((__v8df) __A, (__v8df) __B, __C,\
+               (__v8df) __W, (__mmask8) __U, _MM_FROUND_CUR_DIRECTION);})
+
+#define _mm512_maskz_range_pd(__U, __A, __B, __C) __extension__ ({           \
+  (__m512d) __builtin_ia32_rangepd512_mask ((__v8df) __A, (__v8df) __B, __C, \
+               (__v8df) _mm512_setzero_pd(), (__mmask8) __U,                 \
+               _MM_FROUND_CUR_DIRECTION);})
+
+#define _mm512_range_round_pd(__A, __B, __C, __R) __extension__ ({           \
+  (__m512d) __builtin_ia32_rangepd512_mask ((__v8df) __A, (__v8df) __B, __C, \
+               (__v8df) _mm512_setzero_pd(), (__mmask8) -1, __R);})
+
+#define _mm512_mask_range_round_pd(__W, __U, __A, __B, __C, __R) __extension__ ({ \
+  (__m512d) __builtin_ia32_rangepd512_mask ((__v8df) __A, (__v8df) __B, __C,      \
+               (__v8df) __W, (__mmask8) __U, __R);})
+
+#define _mm512_maskz_range_round_pd(__U, __A, __B, __C, __R) __extension__ ({ \
+  (__m512d) __builtin_ia32_rangepd512_mask ((__v8df) __A, (__v8df) __B, __C,   \
+               (__v8df) _mm512_setzero_pd(), (__mmask8) __U, __R);})
+
+#define _mm512_range_ps(__A, __B, __C) __extension__ ({                       \
+  (__m512) __builtin_ia32_rangeps512_mask ((__v16sf) __A, (__v16sf) __B, __C, \
+               (__v16sf) _mm512_setzero_ps(), (__mmask16) -1,                 \
+               _MM_FROUND_CUR_DIRECTION);})
+
+#define _mm512_mask_range_ps(__W, __U, __A, __B, __C) __extension__ ({         \
+  (__m512) __builtin_ia32_rangeps512_mask ((__v16sf) __A, (__v16sf) __B,       \
+               __C, (__v16sf) __W, (__mmask16) __U, _MM_FROUND_CUR_DIRECTION);})
+
+#define _mm512_maskz_range_ps(__U, __A, __B, __C) __extension__ ({      \
+  (__m512) __builtin_ia32_rangeps512_mask ((__v16sf) __A,(__v16sf) __B, \
+              __C, (__v16sf) _mm512_setzero_ps(), (__mmask16) __U,      \
+              _MM_FROUND_CUR_DIRECTION);})
+
+#define _mm512_range_round_ps(__A, __B, __C, __R) __extension__ ({         \
+  (__m512) __builtin_ia32_rangeps512_mask ((__v16sf) __A, (__v16sf) __B,   \
+                __C, (__v16sf) _mm512_setzero_ps(), (__mmask16) -1, __R);})
+
+#define _mm512_mask_range_round_ps(__W, __U, __A, __B, __C, __R) __extension__ ({ \
+  (__m512) __builtin_ia32_rangeps512_mask ((__v16sf) __A, (__v16sf) __B,          \
+                __C, (__v16sf) __W, (__mmask16) __U, __R);})
+
+#define _mm512_maskz_range_round_ps(__U, __A, __B, __C, __R) __extension__ ({ \
+  (__m512) __builtin_ia32_rangeps512_mask ((__v16sf) __A, (__v16sf) __B,      \
+                __C, (__v16sf) _mm512_setzero_ps(), (__mmask16) __U, __R);})
+
+#define _mm512_reduce_pd(__A, __B) __extension__ ({             \
+  (__m512d) __builtin_ia32_reducepd512_mask ((__v8df) __A, __B, \
+                (__v8df) _mm512_setzero_pd(), (__mmask8) -1, _MM_FROUND_CUR_DIRECTION);})
+
+#define _mm512_mask_reduce_pd(__W, __U, __A, __B) __extension__ ({ \
+  (__m512d) __builtin_ia32_reducepd512_mask ((__v8df) __A, __B,    \
+                (__v8df) __W,(__mmask8) __U, _MM_FROUND_CUR_DIRECTION);})
+
+#define _mm512_maskz_reduce_pd(__U, __A, __B) __extension__ ({  \
+  (__m512d) __builtin_ia32_reducepd512_mask ((__v8df) __A, __B, \
+                (__v8df) _mm512_setzero_pd(), (__mmask8) __U, _MM_FROUND_CUR_DIRECTION);})
+
+#define _mm512_reduce_ps(__A, __B) __extension__ ({              \
+  (__m512) __builtin_ia32_reduceps512_mask ((__v16sf) __A, __B,  \
+               (__v16sf) _mm512_setzero_ps(), (__mmask16) -1, _MM_FROUND_CUR_DIRECTION);})
+
+#define _mm512_mask_reduce_ps(__W, __U, __A, __B) __extension__ ({   \
+  (__m512) __builtin_ia32_reduceps512_mask ((__v16sf) __A, __B,      \
+               (__v16sf) __W, (__mmask16) __U, _MM_FROUND_CUR_DIRECTION);})
+
+#define _mm512_maskz_reduce_ps(__U, __A, __B) __extension__ ({       \
+  (__m512) __builtin_ia32_reduceps512_mask ((__v16sf) __A, __B,      \
+               (__v16sf) _mm512_setzero_ps(), (__mmask16) __U, _MM_FROUND_CUR_DIRECTION);})
+
+#define _mm512_reduce_round_pd(__A, __B, __R) __extension__ ({\
+  (__m512d) __builtin_ia32_reducepd512_mask ((__v8df) __A, __B, \
+                (__v8df) _mm512_setzero_pd(), (__mmask8) -1, __R);})
+
+#define _mm512_mask_reduce_round_pd(__W, __U, __A, __B, __R) __extension__ ({\
+  (__m512d) __builtin_ia32_reducepd512_mask ((__v8df) __A, __B,    \
+                (__v8df) __W,(__mmask8) __U, __R);})
+
+#define _mm512_maskz_reduce_round_pd(__U, __A, __B, __R) __extension__ ({\
+  (__m512d) __builtin_ia32_reducepd512_mask ((__v8df) __A, __B, \
+                (__v8df) _mm512_setzero_pd(), (__mmask8) __U, __R);})
+
+#define _mm512_reduce_round_ps(__A, __B, __R) __extension__ ({\
+  (__m512) __builtin_ia32_reduceps512_mask ((__v16sf) __A, __B,  \
+               (__v16sf) _mm512_setzero_ps(), (__mmask16) -1, __R);})
+
+#define _mm512_mask_reduce_round_ps(__W, __U, __A, __B, __R) __extension__ ({\
+  (__m512) __builtin_ia32_reduceps512_mask ((__v16sf) __A, __B,      \
+               (__v16sf) __W, (__mmask16) __U, __R);})
+
+#define _mm512_maskz_reduce_round_ps(__U, __A, __B, __R) __extension__ ({\
+  (__m512) __builtin_ia32_reduceps512_mask ((__v16sf) __A, __B,      \
+               (__v16sf) _mm512_setzero_ps(), (__mmask16) __U, __R);})
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
diff --git a/24.0.3/clang-include/avx512erintrin.h b/24.0.3/clang-include/avx512erintrin.h
new file mode 100644
index 0000000..40a9121
--- /dev/null
+++ b/24.0.3/clang-include/avx512erintrin.h
@@ -0,0 +1,285 @@
+/*===---- avx512erintrin.h - AVX512ER intrinsics ---------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512erintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512ERINTRIN_H
+#define __AVX512ERINTRIN_H
+
+// exp2a23
+#define _mm512_exp2a23_round_pd(A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
+                                      (__v8df)_mm512_setzero_pd(), \
+                                      (__mmask8)-1, (R)); })
+
+#define _mm512_mask_exp2a23_round_pd(S, M, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
+                                      (__v8df)(__m512d)(S), \
+                                      (__mmask8)(M), (R)); })
+
+#define _mm512_maskz_exp2a23_round_pd(M, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
+                                      (__v8df)_mm512_setzero_pd(), \
+                                      (__mmask8)(M), (R)); })
+
+#define _mm512_exp2a23_pd(A) \
+   _mm512_exp2a23_round_pd((A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_exp2a23_pd(S, M, A) \
+   _mm512_mask_exp2a23_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_exp2a23_pd(M, A) \
+   _mm512_maskz_exp2a23_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_exp2a23_round_ps(A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
+                                     (__v16sf)_mm512_setzero_ps(), \
+                                     (__mmask16)-1, (R)); })
+
+#define _mm512_mask_exp2a23_round_ps(S, M, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
+                                     (__v16sf)(__m512)(S), \
+                                     (__mmask16)(M), (R)); })
+
+#define _mm512_maskz_exp2a23_round_ps(M, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
+                                     (__v16sf)_mm512_setzero_ps(), \
+                                     (__mmask16)(M), (R)); })
+
+#define _mm512_exp2a23_ps(A) \
+   _mm512_exp2a23_round_ps((A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_exp2a23_ps(S, M, A) \
+   _mm512_mask_exp2a23_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_exp2a23_ps(M, A) \
+   _mm512_maskz_exp2a23_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
+
+// rsqrt28
+#define _mm512_rsqrt28_round_pd(A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
+                                         (__v8df)_mm512_setzero_pd(), \
+                                         (__mmask8)-1, (R)); })
+
+#define _mm512_mask_rsqrt28_round_pd(S, M, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
+                                         (__v8df)(__m512d)(S), \
+                                         (__mmask8)(M), (R)); })
+
+#define _mm512_maskz_rsqrt28_round_pd(M, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
+                                         (__v8df)_mm512_setzero_pd(), \
+                                         (__mmask8)(M), (R)); })
+
+#define _mm512_rsqrt28_pd(A) \
+  _mm512_rsqrt28_round_pd((A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_rsqrt28_pd(S, M, A) \
+  _mm512_mask_rsqrt28_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_rsqrt28_pd(M, A) \
+  _mm512_maskz_rsqrt28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_rsqrt28_round_ps(A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
+                                        (__v16sf)_mm512_setzero_ps(), \
+                                        (__mmask16)-1, (R)); })
+
+#define _mm512_mask_rsqrt28_round_ps(S, M, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
+                                        (__v16sf)(__m512)(S), \
+                                        (__mmask16)(M), (R)); })
+
+#define _mm512_maskz_rsqrt28_round_ps(M, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
+                                        (__v16sf)_mm512_setzero_ps(), \
+                                        (__mmask16)(M), (R)); })
+
+#define _mm512_rsqrt28_ps(A) \
+  _mm512_rsqrt28_round_ps((A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_rsqrt28_ps(S, M, A) \
+  _mm512_mask_rsqrt28_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_rsqrt28_ps(M, A) \
+  _mm512_maskz_rsqrt28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_rsqrt28_round_ss(A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_rsqrt28ss_round((__v4sf)(__m128)(A), \
+                                        (__v4sf)(__m128)(B), \
+                                        (__v4sf)_mm_setzero_ps(), \
+                                        (__mmask8)-1, (R)); })
+
+#define _mm_mask_rsqrt28_round_ss(S, M, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_rsqrt28ss_round((__v4sf)(__m128)(A), \
+                                        (__v4sf)(__m128)(B), \
+                                        (__v4sf)(__m128)(S), \
+                                        (__mmask8)(M), (R)); })
+
+#define _mm_maskz_rsqrt28_round_ss(M, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_rsqrt28ss_round((__v4sf)(__m128)(A), \
+                                        (__v4sf)(__m128)(B), \
+                                        (__v4sf)_mm_setzero_ps(), \
+                                        (__mmask8)(M), (R)); })
+
+#define _mm_rsqrt28_ss(A, B) \
+  _mm_rsqrt28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_mask_rsqrt28_ss(S, M, A, B) \
+  _mm_mask_rsqrt28_round_ss((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_rsqrt28_ss(M, A, B) \
+  _mm_maskz_rsqrt28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_rsqrt28_round_sd(A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_rsqrt28sd_round((__v2df)(__m128d)(A), \
+                                         (__v2df)(__m128d)(B), \
+                                         (__v2df)_mm_setzero_pd(), \
+                                         (__mmask8)-1, (R)); })
+
+#define _mm_mask_rsqrt28_round_sd(S, M, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_rsqrt28sd_round((__v2df)(__m128d)(A), \
+                                         (__v2df)(__m128d)(B), \
+                                         (__v2df)(__m128d)(S), \
+                                         (__mmask8)(M), (R)); })
+
+#define _mm_maskz_rsqrt28_round_sd(M, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_rsqrt28sd_round((__v2df)(__m128d)(A), \
+                                         (__v2df)(__m128d)(B), \
+                                         (__v2df)_mm_setzero_pd(), \
+                                         (__mmask8)(M), (R)); })
+
+#define _mm_rsqrt28_sd(A, B) \
+  _mm_rsqrt28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_mask_rsqrt28_sd(S, M, A, B) \
+  _mm_mask_rsqrt28_round_sd((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_rsqrt28_sd(M, A, B) \
+  _mm_maskz_rsqrt28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+// rcp28
+#define _mm512_rcp28_round_pd(A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
+                                       (__v8df)_mm512_setzero_pd(), \
+                                       (__mmask8)-1, (R)); })
+
+#define _mm512_mask_rcp28_round_pd(S, M, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
+                                       (__v8df)(__m512d)(S), \
+                                       (__mmask8)(M), (R)); })
+
+#define _mm512_maskz_rcp28_round_pd(M, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
+                                       (__v8df)_mm512_setzero_pd(), \
+                                       (__mmask8)(M), (R)); })
+
+#define _mm512_rcp28_pd(A) \
+  _mm512_rcp28_round_pd((A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_rcp28_pd(S, M, A) \
+  _mm512_mask_rcp28_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_rcp28_pd(M, A) \
+  _mm512_maskz_rcp28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_rcp28_round_ps(A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
+                                      (__v16sf)_mm512_setzero_ps(), \
+                                      (__mmask16)-1, (R)); })
+
+#define _mm512_mask_rcp28_round_ps(S, M, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
+                                      (__v16sf)(__m512)(S), \
+                                      (__mmask16)(M), (R)); })
+
+#define _mm512_maskz_rcp28_round_ps(M, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
+                                      (__v16sf)_mm512_setzero_ps(), \
+                                      (__mmask16)(M), (R)); })
+
+#define _mm512_rcp28_ps(A) \
+  _mm512_rcp28_round_ps((A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_rcp28_ps(S, M, A) \
+  _mm512_mask_rcp28_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_rcp28_ps(M, A) \
+  _mm512_maskz_rcp28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_rcp28_round_ss(A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_rcp28ss_round((__v4sf)(__m128)(A), \
+                                      (__v4sf)(__m128)(B), \
+                                      (__v4sf)_mm_setzero_ps(), \
+                                      (__mmask8)-1, (R)); })
+
+#define _mm_mask_rcp28_round_ss(S, M, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_rcp28ss_round((__v4sf)(__m128)(A), \
+                                      (__v4sf)(__m128)(B), \
+                                      (__v4sf)(__m128)(S), \
+                                      (__mmask8)(M), (R)); })
+
+#define _mm_maskz_rcp28_round_ss(M, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_rcp28ss_round((__v4sf)(__m128)(A), \
+                                      (__v4sf)(__m128)(B), \
+                                      (__v4sf)_mm_setzero_ps(), \
+                                      (__mmask8)(M), (R)); })
+
+#define _mm_rcp28_ss(A, B) \
+  _mm_rcp28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_mask_rcp28_ss(S, M, A, B) \
+  _mm_mask_rcp28_round_ss((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_rcp28_ss(M, A, B) \
+  _mm_maskz_rcp28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_rcp28_round_sd(A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_rcp28sd_round((__v2df)(__m128d)(A), \
+                                       (__v2df)(__m128d)(B), \
+                                       (__v2df)_mm_setzero_pd(), \
+                                       (__mmask8)-1, (R)); })
+
+#define _mm_mask_rcp28_round_sd(S, M, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_rcp28sd_round((__v2df)(__m128d)(A), \
+                                       (__v2df)(__m128d)(B), \
+                                       (__v2df)(__m128d)(S), \
+                                       (__mmask8)(M), (R)); })
+
+#define _mm_maskz_rcp28_round_sd(M, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_rcp28sd_round((__v2df)(__m128d)(A), \
+                                       (__v2df)(__m128d)(B), \
+                                       (__v2df)_mm_setzero_pd(), \
+                                       (__mmask8)(M), (R)); })
+
+#define _mm_rcp28_sd(A, B) \
+  _mm_rcp28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_mask_rcp28_sd(S, M, A, B) \
+  _mm_mask_rcp28_round_sd((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_rcp28_sd(M, A, B) \
+  _mm_maskz_rcp28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#endif // __AVX512ERINTRIN_H
diff --git a/24.0.3/clang-include/avx512fintrin.h b/24.0.3/clang-include/avx512fintrin.h
new file mode 100644
index 0000000..8dcdc71
--- /dev/null
+++ b/24.0.3/clang-include/avx512fintrin.h
@@ -0,0 +1,3074 @@
+/*===---- avx512fintrin.h - AVX512F intrinsics -----------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512fintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512FINTRIN_H
+#define __AVX512FINTRIN_H
+
+typedef double __v8df __attribute__((__vector_size__(64)));
+typedef float __v16sf __attribute__((__vector_size__(64)));
+typedef long long __v8di __attribute__((__vector_size__(64)));
+typedef int __v16si __attribute__((__vector_size__(64)));
+
+typedef float __m512 __attribute__((__vector_size__(64)));
+typedef double __m512d __attribute__((__vector_size__(64)));
+typedef long long __m512i __attribute__((__vector_size__(64)));
+
+typedef unsigned char __mmask8;
+typedef unsigned short __mmask16;
+
+/* Rounding mode macros.  */
+#define _MM_FROUND_TO_NEAREST_INT   0x00
+#define _MM_FROUND_TO_NEG_INF       0x01
+#define _MM_FROUND_TO_POS_INF       0x02
+#define _MM_FROUND_TO_ZERO          0x03
+#define _MM_FROUND_CUR_DIRECTION    0x04
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512f")))
+
+/* Create vectors with repeated elements */
+
+static  __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_setzero_si512(void)
+{
+  return (__m512i)(__v8di){ 0, 0, 0, 0, 0, 0, 0, 0 };
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_undefined_pd()
+{
+  return (__m512d)__builtin_ia32_undef512();
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_undefined()
+{
+  return (__m512)__builtin_ia32_undef512();
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_undefined_ps()
+{
+  return (__m512)__builtin_ia32_undef512();
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_undefined_epi32()
+{
+  return (__m512i)__builtin_ia32_undef512();
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_set1_epi32(__mmask16 __M, int __A)
+{
+  return (__m512i) __builtin_ia32_pbroadcastd512_gpr_mask (__A,
+                 (__v16si)
+                 _mm512_setzero_si512 (),
+                 __M);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_set1_epi64(__mmask8 __M, long long __A)
+{
+#ifdef __x86_64__
+  return (__m512i) __builtin_ia32_pbroadcastq512_gpr_mask (__A,
+                 (__v8di)
+                 _mm512_setzero_si512 (),
+                 __M);
+#else
+  return (__m512i) __builtin_ia32_pbroadcastq512_mem_mask (__A,
+                 (__v8di)
+                 _mm512_setzero_si512 (),
+                 __M);
+#endif
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_setzero_ps(void)
+{
+  return (__m512){ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+                   0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
+}
+static  __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_setzero_pd(void)
+{
+  return (__m512d){ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_set1_ps(float __w)
+{
+  return (__m512){ __w, __w, __w, __w, __w, __w, __w, __w,
+                   __w, __w, __w, __w, __w, __w, __w, __w  };
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_set1_pd(double __w)
+{
+  return (__m512d){ __w, __w, __w, __w, __w, __w, __w, __w };
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_set1_epi32(int __s)
+{
+  return (__m512i)(__v16si){ __s, __s, __s, __s, __s, __s, __s, __s,
+                             __s, __s, __s, __s, __s, __s, __s, __s };
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_set1_epi64(long long __d)
+{
+  return (__m512i)(__v8di){ __d, __d, __d, __d, __d, __d, __d, __d };
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_broadcastss_ps(__m128 __X)
+{
+  float __f = __X[0];
+  return (__v16sf){ __f, __f, __f, __f,
+                    __f, __f, __f, __f,
+                    __f, __f, __f, __f,
+                    __f, __f, __f, __f };
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_broadcastsd_pd(__m128d __X)
+{
+  double __d = __X[0];
+  return (__v8df){ __d, __d, __d, __d,
+                   __d, __d, __d, __d };
+}
+
+/* Cast between vector types */
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_castpd256_pd512(__m256d __a)
+{
+  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, -1, -1, -1, -1);
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_castps256_ps512(__m256 __a)
+{
+  return __builtin_shufflevector(__a, __a, 0,  1,  2,  3,  4,  5,  6,  7,
+                                          -1, -1, -1, -1, -1, -1, -1, -1);
+}
+
+static __inline __m128d __DEFAULT_FN_ATTRS
+_mm512_castpd512_pd128(__m512d __a)
+{
+  return __builtin_shufflevector(__a, __a, 0, 1);
+}
+
+static __inline __m128 __DEFAULT_FN_ATTRS
+_mm512_castps512_ps128(__m512 __a)
+{
+  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3);
+}
+
+/* Bitwise operators */
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_and_epi32(__m512i __a, __m512i __b)
+{
+  return __a & __b;
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_and_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b)
+{
+  return (__m512i) __builtin_ia32_pandd512_mask((__v16si) __a,
+              (__v16si) __b,
+              (__v16si) __src,
+              (__mmask16) __k);
+}
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_and_epi32(__mmask16 __k, __m512i __a, __m512i __b)
+{
+  return (__m512i) __builtin_ia32_pandd512_mask((__v16si) __a,
+              (__v16si) __b,
+              (__v16si)
+              _mm512_setzero_si512 (),
+              (__mmask16) __k);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_and_epi64(__m512i __a, __m512i __b)
+{
+  return __a & __b;
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_and_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b)
+{
+  return (__m512i) __builtin_ia32_pandq512_mask ((__v8di) __a,
+              (__v8di) __b,
+              (__v8di) __src,
+              (__mmask8) __k);
+}
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_and_epi64(__mmask8 __k, __m512i __a, __m512i __b)
+{
+  return (__m512i) __builtin_ia32_pandq512_mask ((__v8di) __a,
+              (__v8di) __b,
+              (__v8di)
+              _mm512_setzero_si512 (),
+              (__mmask8) __k);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_andnot_epi32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pandnd512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v16si)
+              _mm512_setzero_si512 (),
+              (__mmask16) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_andnot_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pandnd512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v16si) __W,
+              (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_andnot_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pandnd512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v16si)
+              _mm512_setzero_si512 (),
+              (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_andnot_epi64 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pandnq512_mask ((__v8di) __A,
+              (__v8di) __B,
+              (__v8di)
+              _mm512_setzero_si512 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_andnot_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pandnq512_mask ((__v8di) __A,
+              (__v8di) __B,
+              (__v8di) __W, __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_andnot_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pandnq512_mask ((__v8di) __A,
+              (__v8di) __B,
+              (__v8di)
+              _mm512_setzero_si512 (),
+              (__mmask8) __U);
+}
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_or_epi32(__m512i __a, __m512i __b)
+{
+  return __a | __b;
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_or_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b)
+{
+  return (__m512i) __builtin_ia32_pord512_mask((__v16si) __a,
+              (__v16si) __b,
+              (__v16si) __src,
+              (__mmask16) __k);
+}
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_or_epi32(__mmask16 __k, __m512i __a, __m512i __b)
+{
+  return (__m512i) __builtin_ia32_pord512_mask((__v16si) __a,
+              (__v16si) __b,
+              (__v16si)
+              _mm512_setzero_si512 (),
+              (__mmask16) __k);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_or_epi64(__m512i __a, __m512i __b)
+{
+  return __a | __b;
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_or_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b)
+{
+  return (__m512i) __builtin_ia32_porq512_mask ((__v8di) __a,
+              (__v8di) __b,
+              (__v8di) __src,
+              (__mmask8) __k);
+}
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_or_epi64(__mmask8 __k, __m512i __a, __m512i __b)
+{
+  return (__m512i) __builtin_ia32_porq512_mask ((__v8di) __a,
+              (__v8di) __b,
+              (__v8di)
+              _mm512_setzero_si512 (),
+              (__mmask8) __k);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_xor_epi32(__m512i __a, __m512i __b)
+{
+  return __a ^ __b;
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_xor_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b)
+{
+  return (__m512i) __builtin_ia32_pxord512_mask((__v16si) __a,
+              (__v16si) __b,
+              (__v16si) __src,
+              (__mmask16) __k);
+}
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_xor_epi32(__mmask16 __k, __m512i __a, __m512i __b)
+{
+  return (__m512i) __builtin_ia32_pxord512_mask((__v16si) __a,
+              (__v16si) __b,
+              (__v16si)
+              _mm512_setzero_si512 (),
+              (__mmask16) __k);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_xor_epi64(__m512i __a, __m512i __b)
+{
+  return __a ^ __b;
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_xor_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b)
+{
+  return (__m512i) __builtin_ia32_pxorq512_mask ((__v8di) __a,
+              (__v8di) __b,
+              (__v8di) __src,
+              (__mmask8) __k);
+}
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_xor_epi64(__mmask8 __k, __m512i __a, __m512i __b)
+{
+  return (__m512i) __builtin_ia32_pxorq512_mask ((__v8di) __a,
+              (__v8di) __b,
+              (__v8di)
+              _mm512_setzero_si512 (),
+              (__mmask8) __k);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_and_si512(__m512i __a, __m512i __b)
+{
+  return __a & __b;
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_or_si512(__m512i __a, __m512i __b)
+{
+  return __a | __b;
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_xor_si512(__m512i __a, __m512i __b)
+{
+  return __a ^ __b;
+}
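+
+/*
+ * Editor's note (illustrative sketch; not part of the original header): the
+ * _mask_ forms above merge from __src wherever a mask bit is 0, while the
+ * _maskz_ forms zero those lanes instead.  Assuming AVX-512F and the
+ * _mm512_set1_epi32 helper defined earlier in this header:
+ *
+ *   __m512i a = _mm512_set1_epi32(0xF0);
+ *   __m512i b = _mm512_set1_epi32(0x0F);
+ *   __mmask16 k = 0x00FF;                          // low 8 lanes only
+ *   __m512i z = _mm512_maskz_and_epi32(k, a, b);   // AND result is 0; high lanes zeroed
+ *   __m512i m = _mm512_mask_or_epi32(a, k, a, b);  // low lanes 0xFF, high lanes keep a
+ */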
+/* Arithmetic */
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_add_pd(__m512d __a, __m512d __b)
+{
+  return __a + __b;
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_add_ps(__m512 __a, __m512 __b)
+{
+  return __a + __b;
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_mul_pd(__m512d __a, __m512d __b)
+{
+  return __a * __b;
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_mul_ps(__m512 __a, __m512 __b)
+{
+  return __a * __b;
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_sub_pd(__m512d __a, __m512d __b)
+{
+  return __a - __b;
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_sub_ps(__m512 __a, __m512 __b)
+{
+  return __a - __b;
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_add_epi64 (__m512i __A, __m512i __B)
+{
+  return (__m512i) ((__v8di) __A + (__v8di) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_add_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddq512_mask ((__v8di) __A,
+             (__v8di) __B,
+             (__v8di) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_add_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddq512_mask ((__v8di) __A,
+             (__v8di) __B,
+             (__v8di)
+             _mm512_setzero_si512 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_sub_epi64 (__m512i __A, __m512i __B)
+{
+  return (__m512i) ((__v8di) __A - (__v8di) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_sub_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubq512_mask ((__v8di) __A,
+             (__v8di) __B,
+             (__v8di) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_sub_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubq512_mask ((__v8di) __A,
+             (__v8di) __B,
+             (__v8di)
+             _mm512_setzero_si512 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_add_epi32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) ((__v16si) __A + (__v16si) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_add_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddd512_mask ((__v16si) __A,
+             (__v16si) __B,
+             (__v16si) __W,
+             (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_add_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddd512_mask ((__v16si) __A,
+             (__v16si) __B,
+             (__v16si)
+             _mm512_setzero_si512 (),
+             (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_sub_epi32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) ((__v16si) __A - (__v16si) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_sub_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubd512_mask ((__v16si) __A,
+             (__v16si) __B,
+             (__v16si) __W,
+             (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_sub_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubd512_mask ((__v16si) __A,
+             (__v16si) __B,
+             (__v16si)
+             _mm512_setzero_si512 (),
+             (__mmask16) __U);
+}
+
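+/*
+ * Editor's note (illustrative sketch; not part of the original header): the
+ * masked integer add/sub intrinsics follow the same merge/zero convention; a
+ * __mmask16 covers the 16 32-bit lanes and a __mmask8 the 8 64-bit lanes.
+ *
+ *   __m512i x = _mm512_set1_epi32(10);
+ *   __m512i y = _mm512_set1_epi32(3);
+ *   __m512i r = _mm512_mask_add_epi32(x, 0x5555, x, y);  // 13 in lanes 0,2,...
+ *                                                        // 10 in the others
+ */
+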
+static  __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_max_pd(__m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A,
+             (__v8df) __B,
+             (__v8df)
+             _mm512_setzero_pd (),
+             (__mmask8) -1,
+             _MM_FROUND_CUR_DIRECTION);
+}
+
+static  __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_max_ps(__m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A,
+            (__v16sf) __B,
+            (__v16sf)
+            _mm512_setzero_ps (),
+            (__mmask16) -1,
+            _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_max_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_maxss_round ((__v4sf) __A,
+                (__v4sf) __B,
+                (__v4sf) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_max_ss(__mmask8 __U,__m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_maxss_round ((__v4sf) __A,
+                (__v4sf) __B,
+                (__v4sf)  _mm_setzero_ps (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_max_round_ss(__A, __B, __R) __extension__ ({ \
+  (__m128) __builtin_ia32_maxss_round ((__v4sf) __A, (__v4sf) __B, \
+                (__v4sf) _mm_setzero_ps(), (__mmask8) -1, __R); })
+
+#define _mm_mask_max_round_ss(__W, __U, __A, __B, __R) __extension__ ({ \
+  (__m128) __builtin_ia32_maxss_round ((__v4sf) __A, (__v4sf) __B, \
+                (__v4sf)  __W, (__mmask8) __U,__R); })
+
+#define _mm_maskz_max_round_ss(__U, __A, __B, __R) __extension__ ({ \
+  (__m128) __builtin_ia32_maxss_round ((__v4sf) __A, (__v4sf) __B, \
+                (__v4sf)  _mm_setzero_ps(), (__mmask8) __U,__R); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_max_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_maxsd_round ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_max_sd(__mmask8 __U,__m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_maxsd_round ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df)  _mm_setzero_pd (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_max_round_sd(__A, __B, __R) __extension__ ({ \
+  (__m128d) __builtin_ia32_maxsd_round ((__v2df) __A, (__v2df) __B, \
+                (__v2df) _mm_setzero_pd(), (__mmask8) -1, __R); })
+
+#define _mm_mask_max_round_sd(__W, __U, __A, __B, __R) __extension__ ({ \
+  (__m128d) __builtin_ia32_maxsd_round ((__v2df) __A, (__v2df) __B, \
+                (__v2df)  __W, (__mmask8) __U,__R); })
+
+#define _mm_maskz_max_round_sd(__U, __A, __B, __R) __extension__ ({ \
+  (__m128d) __builtin_ia32_maxsd_round ((__v2df) __A, (__v2df) __B, \
+                (__v2df)  _mm_setzero_pd(), (__mmask8) __U,__R); })
+
+static __inline __m512i
+__DEFAULT_FN_ATTRS
+_mm512_max_epi32(__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsd512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v16si)
+              _mm512_setzero_si512 (),
+              (__mmask16) -1);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_max_epu32(__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxud512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v16si)
+              _mm512_setzero_si512 (),
+              (__mmask16) -1);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_max_epi64(__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsq512_mask ((__v8di) __A,
+              (__v8di) __B,
+              (__v8di)
+              _mm512_setzero_si512 (),
+              (__mmask8) -1);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_max_epu64(__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxuq512_mask ((__v8di) __A,
+              (__v8di) __B,
+              (__v8di)
+              _mm512_setzero_si512 (),
+              (__mmask8) -1);
+}
+
+static  __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_min_pd(__m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A,
+             (__v8df) __B,
+             (__v8df)
+             _mm512_setzero_pd (),
+             (__mmask8) -1,
+             _MM_FROUND_CUR_DIRECTION);
+}
+
+static  __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_min_ps(__m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A,
+            (__v16sf) __B,
+            (__v16sf)
+            _mm512_setzero_ps (),
+            (__mmask16) -1,
+            _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_min_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_minss_round ((__v4sf) __A,
+                (__v4sf) __B,
+                (__v4sf) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_min_ss(__mmask8 __U,__m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_minss_round ((__v4sf) __A,
+                (__v4sf) __B,
+                (__v4sf)  _mm_setzero_ps (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_min_round_ss(__A, __B, __R) __extension__ ({ \
+  (__m128) __builtin_ia32_minss_round ((__v4sf) __A, (__v4sf) __B, \
+                (__v4sf) _mm_setzero_ps(), (__mmask8) -1, __R); })
+
+#define _mm_mask_min_round_ss(__W, __U, __A, __B, __R) __extension__ ({ \
+  (__m128) __builtin_ia32_minss_round ((__v4sf) __A, (__v4sf) __B, \
+                (__v4sf)  __W, (__mmask8) __U,__R); })
+
+#define _mm_maskz_min_round_ss(__U, __A, __B, __R) __extension__ ({ \
+  (__m128) __builtin_ia32_minss_round ((__v4sf) __A, (__v4sf) __B, \
+                (__v4sf)  _mm_setzero_ps(), (__mmask8) __U,__R); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_min_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_minsd_round ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_min_sd(__mmask8 __U,__m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_minsd_round ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df)  _mm_setzero_pd (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_min_round_sd(__A, __B, __R) __extension__ ({ \
+  (__m128d) __builtin_ia32_minsd_round ((__v2df) __A, (__v2df) __B, \
+                (__v2df) _mm_setzero_pd(), (__mmask8) -1, __R); })
+
+#define _mm_mask_min_round_sd(__W, __U, __A, __B, __R) __extension__ ({ \
+  (__m128d) __builtin_ia32_minsd_round ((__v2df) __A, (__v2df) __B, \
+                (__v2df)  __W, (__mmask8) __U,__R); })
+
+#define _mm_maskz_min_round_sd(__U, __A, __B, __R) __extension__ ({ \
+  (__m128d) __builtin_ia32_minsd_round ((__v2df) __A, (__v2df) __B, \
+                (__v2df)  _mm_setzero_pd(), (__mmask8) __U,__R); })
+
+static __inline __m512i
+__DEFAULT_FN_ATTRS
+_mm512_min_epi32(__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsd512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v16si)
+              _mm512_setzero_si512 (),
+              (__mmask16) -1);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_min_epu32(__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminud512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v16si)
+              _mm512_setzero_si512 (),
+              (__mmask16) -1);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_min_epi64(__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsq512_mask ((__v8di) __A,
+              (__v8di) __B,
+              (__v8di)
+              _mm512_setzero_si512 (),
+              (__mmask8) -1);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_min_epu64(__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminuq512_mask ((__v8di) __A,
+              (__v8di) __B,
+              (__v8di)
+              _mm512_setzero_si512 (),
+              (__mmask8) -1);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_mul_epi32(__m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_pmuldq512_mask ((__v16si) __X,
+              (__v16si) __Y,
+              (__v8di)
+              _mm512_setzero_si512 (),
+              (__mmask8) -1);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_mul_epi32 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_pmuldq512_mask ((__v16si) __X,
+              (__v16si) __Y,
+              (__v8di) __W, __M);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_mul_epi32 (__mmask8 __M, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_pmuldq512_mask ((__v16si) __X,
+              (__v16si) __Y,
+              (__v8di)
+              _mm512_setzero_si512 (),
+              __M);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_mul_epu32(__m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_pmuludq512_mask ((__v16si) __X,
+               (__v16si) __Y,
+               (__v8di)
+               _mm512_setzero_si512 (),
+               (__mmask8) -1);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_mul_epu32 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_pmuludq512_mask ((__v16si) __X,
+               (__v16si) __Y,
+               (__v8di) __W, __M);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_mul_epu32 (__mmask8 __M, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_pmuludq512_mask ((__v16si) __X,
+               (__v16si) __Y,
+               (__v8di)
+               _mm512_setzero_si512 (),
+               __M);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_mullo_epi32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) ((__v16si) __A * (__v16si) __B);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_mullo_epi32 (__mmask16 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmulld512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v16si)
+              _mm512_setzero_si512 (),
+              __M);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_mullo_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmulld512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v16si) __W, __M);
+}
+
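+/*
+ * Editor's note (illustrative sketch; not part of the original header):
+ * mul_epi32/mul_epu32 multiply the low 32 bits of each 64-bit lane and return
+ * eight full 64-bit products, whereas mullo_epi32 multiplies all sixteen
+ * 32-bit lanes and keeps only the low 32 bits of each product.
+ *
+ *   __m512i wide = _mm512_mul_epu32(x, y);    // 8 x 64-bit products
+ *   __m512i low  = _mm512_mullo_epi32(x, y);  // 16 x truncated products
+ */
+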
+static  __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_sqrt_pd(__m512d __a)
+{
+  return (__m512d)__builtin_ia32_sqrtpd512_mask((__v8df)__a,
+                                                (__v8df) _mm512_setzero_pd (),
+                                                (__mmask8) -1,
+                                                _MM_FROUND_CUR_DIRECTION);
+}
+
+static  __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_sqrt_ps(__m512 __a)
+{
+  return (__m512)__builtin_ia32_sqrtps512_mask((__v16sf)__a,
+                                               (__v16sf) _mm512_setzero_ps (),
+                                               (__mmask16) -1,
+                                               _MM_FROUND_CUR_DIRECTION);
+}
+
+static  __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_rsqrt14_pd(__m512d __A)
+{
+  return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A,
+                 (__v8df)
+                 _mm512_setzero_pd (),
+                 (__mmask8) -1);
+}
+
+static  __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_rsqrt14_ps(__m512 __A)
+{
+  return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A,
+                (__v16sf)
+                _mm512_setzero_ps (),
+                (__mmask16) -1);
+}
+
+static  __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_rsqrt14_ss(__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_rsqrt14ss ((__v4sf) __A,
+             (__v4sf) __B,
+             (__v4sf)
+             _mm_setzero_ps (),
+             (__mmask8) -1);
+}
+
+static  __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_rsqrt14_sd(__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_rsqrt14sd ((__v2df) __A,
+              (__v2df) __B,
+              (__v2df)
+              _mm_setzero_pd (),
+              (__mmask8) -1);
+}
+
+static  __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_rcp14_pd(__m512d __A)
+{
+  return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A,
+               (__v8df)
+               _mm512_setzero_pd (),
+               (__mmask8) -1);
+}
+
+static  __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_rcp14_ps(__m512 __A)
+{
+  return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A,
+              (__v16sf)
+              _mm512_setzero_ps (),
+              (__mmask16) -1);
+}
+static  __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_rcp14_ss(__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_rcp14ss ((__v4sf) __A,
+                 (__v4sf) __B,
+                 (__v4sf)
+                 _mm_setzero_ps (),
+                 (__mmask8) -1);
+}
+
+static  __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_rcp14_sd(__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_rcp14sd ((__v2df) __A,
+            (__v2df) __B,
+            (__v2df)
+            _mm_setzero_pd (),
+            (__mmask8) -1);
+}
+
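+/*
+ * Editor's note (illustrative sketch; not part of the original header):
+ * rsqrt14/rcp14 return fast approximations of 1/sqrt(x) and 1/x with a
+ * relative error of at most 2^-14; use _mm512_sqrt_pd/_ps or a division when
+ * full precision is required.
+ *
+ *   __m512 approx = _mm512_rcp14_ps(x);  // ~1/x, about 14 bits of precision
+ */
+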
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_floor_ps(__m512 __A)
+{
+  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
+                                                  _MM_FROUND_FLOOR,
+                                                  (__v16sf) __A, -1,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_floor_pd(__m512d __A)
+{
+  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
+                                                   _MM_FROUND_FLOOR,
+                                                   (__v8df) __A, -1,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_ceil_ps(__m512 __A)
+{
+  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
+                                                  _MM_FROUND_CEIL,
+                                                  (__v16sf) __A, -1,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_ceil_pd(__m512d __A)
+{
+  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
+                                                   _MM_FROUND_CEIL,
+                                                   (__v8df) __A, -1,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
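+/*
+ * Editor's note (illustrative sketch; not part of the original header):
+ * floor/ceil round every element to an integral value via the rndscale
+ * builtin with the matching _MM_FROUND_* immediate.
+ *
+ *   __m512d v = _mm512_set1_pd(2.5);
+ *   __m512d f = _mm512_floor_pd(v);  // 2.0 in every lane
+ *   __m512d c = _mm512_ceil_pd(v);   // 3.0 in every lane
+ */
+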
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_abs_epi64(__m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsq512_mask ((__v8di) __A,
+             (__v8di)
+             _mm512_setzero_si512 (),
+             (__mmask8) -1);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_abs_epi32(__m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsd512_mask ((__v16si) __A,
+             (__v16si)
+             _mm512_setzero_si512 (),
+             (__mmask16) -1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_add_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_addss_round ((__v4sf) __A,
+                (__v4sf) __B,
+                (__v4sf) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_add_ss(__mmask8 __U,__m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_addss_round ((__v4sf) __A,
+                (__v4sf) __B,
+                (__v4sf)  _mm_setzero_ps (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_add_round_ss(__A, __B, __R) __extension__ ({ \
+  (__m128) __builtin_ia32_addss_round ((__v4sf) __A, (__v4sf) __B, \
+                (__v4sf) _mm_setzero_ps(), (__mmask8) -1, __R); })
+
+#define _mm_mask_add_round_ss(__W, __U, __A, __B, __R) __extension__ ({ \
+  (__m128) __builtin_ia32_addss_round ((__v4sf) __A, (__v4sf) __B, \
+                (__v4sf)  __W, (__mmask8) __U,__R); })
+
+#define _mm_maskz_add_round_ss(__U, __A, __B, __R) __extension__ ({ \
+  (__m128) __builtin_ia32_addss_round ((__v4sf) __A, (__v4sf) __B, \
+                (__v4sf)  _mm_setzero_ps(), (__mmask8) __U,__R); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_add_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_addsd_round ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_add_sd(__mmask8 __U,__m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_addsd_round ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df)  _mm_setzero_pd (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+#define _mm_add_round_sd(__A, __B, __R) __extension__ ({ \
+  (__m128d) __builtin_ia32_addsd_round ((__v2df) __A, (__v2df) __B, \
+                (__v2df) _mm_setzero_pd(), (__mmask8) -1, __R); })
+
+#define _mm_mask_add_round_sd(__W, __U, __A, __B, __R) __extension__ ({ \
+  (__m128d) __builtin_ia32_addsd_round ((__v2df) __A, (__v2df) __B, \
+                (__v2df)  __W, (__mmask8) __U,__R); })
+
+#define _mm_maskz_add_round_sd(__U, __A, __B, __R) __extension__ ({ \
+  (__m128d) __builtin_ia32_addsd_round ((__v2df) __A, (__v2df) __B, \
+                (__v2df)  _mm_setzero_pd(), (__mmask8) __U,__R); })
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_add_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A,
+             (__v8df) __B,
+             (__v8df) __W,
+             (__mmask8) __U,
+             _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_add_pd(__mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A,
+             (__v8df) __B,
+             (__v8df) _mm512_setzero_pd (),
+             (__mmask8) __U,
+             _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_add_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A,
+            (__v16sf) __B,
+            (__v16sf) __W,
+            (__mmask16) __U,
+            _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_add_ps(__mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A,
+            (__v16sf) __B,
+            (__v16sf) _mm512_setzero_ps (),
+            (__mmask16) __U,
+            _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_add_round_pd(__A, __B, __R) __extension__ ({ \
+  (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A, (__v8df) __B, \
+               (__v8df) _mm512_setzero_pd(), (__mmask8) -1, __R); })
+
+#define _mm512_mask_add_round_pd(__W, __U, __A, __B, __R) __extension__ ({ \
+  (__m512d) __builtin_ia32_addpd512_mask((__v8df) __A, (__v8df) __B, \
+                (__v8df) __W, (__mmask8) __U, __R); })
+
+#define _mm512_maskz_add_round_pd(__U, __A, __B, __R) __extension__ ({ \
+  (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A, (__v8df) __B, \
+                (__v8df) _mm512_setzero_pd(), (__mmask8) __U, __R); })
+
+#define _mm512_add_round_ps(__A, __B, __R) __extension__ ({ \
+  (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A, (__v16sf) __B, \
+                (__v16sf) _mm512_setzero_ps(), (__mmask16) -1, __R); })
+
+#define _mm512_mask_add_round_ps(__W, __U, __A, __B, __R) __extension__ ({ \
+  (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A, (__v16sf) __B, \
+                (__v16sf) __W, (__mmask16)__U, __R); })
+
+#define _mm512_maskz_add_round_ps(__U, __A, __B, __R) __extension__ ({ \
+  (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A, (__v16sf) __B, \
+                (__v16sf) _mm512_setzero_ps(), (__mmask16)__U, __R); })
+
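+/*
+ * Editor's note (illustrative sketch; not part of the original header): the
+ * *_round_* macros take an explicit rounding/exception-control operand in
+ * place of the implicit _MM_FROUND_CUR_DIRECTION used by the plain forms.
+ * Assuming the _MM_FROUND_* constants provided via <immintrin.h>:
+ *
+ *   __m512d s = _mm512_add_round_pd(a, b,
+ *                   _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);  // truncate,
+ *                                                             // suppress exceptions
+ */
+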
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_sub_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_subss_round ((__v4sf) __A,
+                (__v4sf) __B,
+                (__v4sf) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_sub_ss(__mmask8 __U,__m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_subss_round ((__v4sf) __A,
+                (__v4sf) __B,
+                (__v4sf)  _mm_setzero_ps (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+#define _mm_sub_round_ss(__A, __B, __R) __extension__ ({ \
+  (__m128) __builtin_ia32_subss_round ((__v4sf) __A, (__v4sf) __B, \
+                (__v4sf) _mm_setzero_ps(), (__mmask8) -1, __R); })
+
+#define _mm_mask_sub_round_ss(__W, __U, __A, __B, __R) __extension__ ({ \
+  (__m128) __builtin_ia32_subss_round ((__v4sf) __A, (__v4sf) __B, \
+                (__v4sf)  __W, (__mmask8) __U,__R); })
+
+#define _mm_maskz_sub_round_ss(__U, __A, __B, __R) __extension__ ({ \
+  (__m128) __builtin_ia32_subss_round ((__v4sf) __A, (__v4sf) __B, \
+                (__v4sf)  _mm_setzero_ps(), (__mmask8) __U,__R); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_sub_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_subsd_round ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_sub_sd(__mmask8 __U,__m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_subsd_round ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df)  _mm_setzero_pd (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_sub_round_sd(__A, __B, __R) __extension__ ({ \
+  (__m128d) __builtin_ia32_subsd_round ((__v2df) __A, (__v2df) __B, \
+                (__v2df) _mm_setzero_pd(), (__mmask8) -1, __R); })
+
+#define _mm_mask_sub_round_sd(__W, __U, __A, __B, __R) __extension__ ({ \
+  (__m128d) __builtin_ia32_subsd_round ((__v2df) __A, (__v2df) __B, \
+                (__v2df)  __W, (__mmask8) __U,__R); })
+
+#define _mm_maskz_sub_round_sd(__U, __A, __B, __R) __extension__ ({ \
+  (__m128d) __builtin_ia32_subsd_round ((__v2df) __A, (__v2df) __B, \
+                (__v2df)  _mm_setzero_pd(), (__mmask8) __U,__R); })
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_sub_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A,
+             (__v8df) __B,
+             (__v8df) __W,
+             (__mmask8) __U,
+             _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_sub_pd(__mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A,
+             (__v8df) __B,
+             (__v8df)
+             _mm512_setzero_pd (),
+             (__mmask8) __U,
+             _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_sub_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A,
+            (__v16sf) __B,
+            (__v16sf) __W,
+            (__mmask16) __U,
+            _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_sub_ps(__mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A,
+            (__v16sf) __B,
+            (__v16sf)
+            _mm512_setzero_ps (),
+            (__mmask16) __U,
+            _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_sub_round_pd(__A, __B, __R) __extension__ ({ \
+  (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A, (__v8df) __B,\
+             (__v8df) _mm512_setzero_pd(), (__mmask8) -1, __R); })
+
+#define _mm512_mask_sub_round_pd(__W, __U, __A, __B, __R) __extension__ ({ \
+  (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A, (__v8df) __B, \
+             (__v8df) __W, (__mmask8) __U, __R); })
+
+#define _mm512_maskz_sub_round_pd(__U, __A, __B, __R) __extension__ ({ \
+   (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A, (__v8df) __B, \
+             (__v8df) _mm512_setzero_pd(), (__mmask8) __U, __R);})
+
+#define _mm512_sub_round_ps(__A, __B, __R) __extension__ ({ \
+  (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A, (__v16sf) __B, \
+            (__v16sf) _mm512_setzero_ps (), (__mmask16) -1, __R);})
+
+#define _mm512_mask_sub_round_ps(__W, __U, __A, __B, __R)  __extension__ ({ \
+  (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A, (__v16sf) __B, \
+            (__v16sf) __W, (__mmask16) __U, __R); })
+
+#define _mm512_maskz_sub_round_ps(__U, __A, __B, __R)  __extension__ ({ \
+  (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A, (__v16sf) __B, \
+            (__v16sf) _mm512_setzero_ps (), (__mmask16) __U, __R); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_mul_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_mulss_round ((__v4sf) __A,
+                (__v4sf) __B,
+                (__v4sf) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_mul_ss(__mmask8 __U,__m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_mulss_round ((__v4sf) __A,
+                (__v4sf) __B,
+                (__v4sf)  _mm_setzero_ps (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+#define _mm_mul_round_ss(__A, __B, __R) __extension__ ({ \
+  (__m128) __builtin_ia32_mulss_round ((__v4sf) __A, (__v4sf) __B, \
+                (__v4sf) _mm_setzero_ps(), (__mmask8) -1, __R); })
+
+#define _mm_mask_mul_round_ss(__W, __U, __A, __B, __R) __extension__ ({ \
+  (__m128) __builtin_ia32_mulss_round ((__v4sf) __A, (__v4sf) __B, \
+                (__v4sf)  __W, (__mmask8) __U,__R); })
+
+#define _mm_maskz_mul_round_ss(__U, __A, __B, __R) __extension__ ({ \
+  (__m128) __builtin_ia32_mulss_round ((__v4sf) __A, (__v4sf) __B, \
+                (__v4sf)  _mm_setzero_ps(), (__mmask8) __U,__R); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_mul_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_mulsd_round ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_mul_sd(__mmask8 __U,__m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_mulsd_round ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df)  _mm_setzero_pd (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mul_round_sd(__A, __B, __R) __extension__ ({ \
+  (__m128d) __builtin_ia32_mulsd_round ((__v2df) __A, (__v2df) __B, \
+                (__v2df) _mm_setzero_pd(), (__mmask8) -1, __R); })
+
+#define _mm_mask_mul_round_sd(__W, __U, __A, __B, __R) __extension__ ({ \
+  (__m128d) __builtin_ia32_mulsd_round ((__v2df) __A, (__v2df) __B, \
+                (__v2df)  __W, (__mmask8) __U,__R); })
+
+#define _mm_maskz_mul_round_sd(__U, __A, __B, __R) __extension__ ({ \
+  (__m128d) __builtin_ia32_mulsd_round ((__v2df) __A, (__v2df) __B, \
+                (__v2df)  _mm_setzero_pd(), (__mmask8) __U,__R); })
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_mul_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A,
+             (__v8df) __B,
+             (__v8df) __W,
+             (__mmask8) __U,
+             _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_mul_pd(__mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A,
+             (__v8df) __B,
+             (__v8df)
+             _mm512_setzero_pd (),
+             (__mmask8) __U,
+             _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_mul_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A,
+            (__v16sf) __B,
+            (__v16sf) __W,
+            (__mmask16) __U,
+            _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_mul_ps(__mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A,
+            (__v16sf) __B,
+            (__v16sf)
+            _mm512_setzero_ps (),
+            (__mmask16) __U,
+            _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mul_round_pd(__A, __B, __R) __extension__ ({ \
+  (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A, (__v8df) __B,\
+             (__v8df) _mm512_setzero_pd(), (__mmask8) -1, __R); })
+
+#define _mm512_mask_mul_round_pd(__W, __U, __A, __B, __R) __extension__ ({ \
+  (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A, (__v8df) __B, \
+             (__v8df) __W, (__mmask8) __U, __R); })
+
+#define _mm512_maskz_mul_round_pd(__U, __A, __B, __R) __extension__ ({ \
+   (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A, (__v8df) __B, \
+             (__v8df) _mm512_setzero_pd(), (__mmask8) __U, __R);})
+
+#define _mm512_mul_round_ps(__A, __B, __R) __extension__ ({ \
+  (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A, (__v16sf) __B, \
+            (__v16sf) _mm512_setzero_ps (), (__mmask16) -1, __R);})
+
+#define _mm512_mask_mul_round_ps(__W, __U, __A, __B, __R)  __extension__ ({ \
+  (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A, (__v16sf) __B, \
+            (__v16sf) __W, (__mmask16) __U, __R); })
+
+#define _mm512_maskz_mul_round_ps(__U, __A, __B, __R)  __extension__ ({ \
+  (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A, (__v16sf) __B, \
+            (__v16sf) _mm512_setzero_ps (), (__mmask16) __U, __R); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_div_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_divss_round ((__v4sf) __A,
+                (__v4sf) __B,
+                (__v4sf) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_div_ss(__mmask8 __U,__m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_divss_round ((__v4sf) __A,
+                (__v4sf) __B,
+                (__v4sf)  _mm_setzero_ps (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_div_round_ss(__A, __B, __R) __extension__ ({ \
+  (__m128) __builtin_ia32_divss_round ((__v4sf) __A, (__v4sf) __B, \
+                (__v4sf) _mm_setzero_ps(), (__mmask8) -1, __R); })
+
+#define _mm_mask_div_round_ss(__W, __U, __A, __B, __R) __extension__ ({ \
+  (__m128) __builtin_ia32_divss_round ((__v4sf) __A, (__v4sf) __B, \
+                (__v4sf)  __W, (__mmask8) __U,__R); })
+
+#define _mm_maskz_div_round_ss(__U, __A, __B, __R) __extension__ ({ \
+  (__m128) __builtin_ia32_divss_round ((__v4sf) __A, (__v4sf) __B, \
+                (__v4sf)  _mm_setzero_ps(), (__mmask8) __U,__R); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_div_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_divsd_round ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_div_sd(__mmask8 __U,__m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_divsd_round ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df)  _mm_setzero_pd (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_div_round_sd(__A, __B, __R) __extension__ ({ \
+  (__m128d) __builtin_ia32_divsd_round ((__v2df) __A, (__v2df) __B, \
+                (__v2df) _mm_setzero_pd(), (__mmask8) -1, __R); })
+
+#define _mm_mask_div_round_sd(__W, __U, __A, __B, __R) __extension__ ({ \
+  (__m128d) __builtin_ia32_divsd_round ((__v2df) __A, (__v2df) __B, \
+                (__v2df)  __W, (__mmask8) __U,__R); })
+
+#define _mm_maskz_div_round_sd(__U, __A, __B, __R) __extension__ ({ \
+  (__m128d) __builtin_ia32_divsd_round ((__v2df) __A, (__v2df) __B, \
+                (__v2df)  _mm_setzero_pd(), (__mmask8) __U,__R); })
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_div_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __A,
+             (__v8df) __B,
+             (__v8df) __W,
+             (__mmask8) __U,
+             _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_div_pd(__mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __A,
+             (__v8df) __B,
+             (__v8df)
+             _mm512_setzero_pd (),
+             (__mmask8) __U,
+             _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_div_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A,
+            (__v16sf) __B,
+            (__v16sf) __W,
+            (__mmask16) __U,
+            _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_div_ps(__mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A,
+            (__v16sf) __B,
+            (__v16sf)
+            _mm512_setzero_ps (),
+            (__mmask16) __U,
+            _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_div_round_pd(__A, __B, __R) __extension__ ({ \
+  (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __A, (__v8df) __B,\
+             (__v8df) _mm512_setzero_pd(), (__mmask8) -1, __R); })
+
+#define _mm512_mask_div_round_pd(__W, __U, __A, __B, __R) __extension__ ({ \
+  (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __A, (__v8df) __B, \
+             (__v8df) __W, (__mmask8) __U, __R); })
+
+#define _mm512_maskz_div_round_pd(__U, __A, __B, __R) __extension__ ({ \
+   (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __A, (__v8df) __B, \
+             (__v8df) _mm512_setzero_pd(), (__mmask8) __U, __R);})
+
+#define _mm512_div_round_ps(__A, __B, __R) __extension__ ({ \
+  (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A, (__v16sf) __B, \
+            (__v16sf) _mm512_setzero_ps (), (__mmask16) -1, __R);})
+
+#define _mm512_mask_div_round_ps(__W, __U, __A, __B, __R)  __extension__ ({ \
+  (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A, (__v16sf) __B, \
+            (__v16sf) __W, (__mmask16) __U, __R); })
+
+#define _mm512_maskz_div_round_ps(__U, __A, __B, __R)  __extension__ ({ \
+  (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A, (__v16sf) __B, \
+            (__v16sf) _mm512_setzero_ps (), (__mmask16) __U, __R); })
+
+#define _mm512_roundscale_ps(A, B) __extension__ ({ \
+  (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(A), (B), (__v16sf)(A), \
+                                         -1, _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_roundscale_pd(A, B) __extension__ ({ \
+  (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(A), (B), (__v8df)(A), \
+                                          -1, _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_fmadd_round_pd(A, B, C, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) (A), \
+                                             (__v8df) (B), (__v8df) (C), \
+                                             (__mmask8) -1, (R)); })
+
+
+#define _mm512_mask_fmadd_round_pd(A, U, B, C, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) (A), \
+                                             (__v8df) (B), (__v8df) (C), \
+                                             (__mmask8) (U), (R)); })
+
+
+#define _mm512_mask3_fmadd_round_pd(A, B, C, U, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfmaddpd512_mask3 ((__v8df) (A), \
+                                              (__v8df) (B), (__v8df) (C), \
+                                              (__mmask8) (U), (R)); })
+
+
+#define _mm512_maskz_fmadd_round_pd(U, A, B, C, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) (A), \
+                                              (__v8df) (B), (__v8df) (C), \
+                                              (__mmask8) (U), (R)); })
+
+
+#define _mm512_fmsub_round_pd(A, B, C, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) (A), \
+                                             (__v8df) (B), -(__v8df) (C), \
+                                             (__mmask8) -1, (R)); })
+
+
+#define _mm512_mask_fmsub_round_pd(A, U, B, C, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) (A), \
+                                             (__v8df) (B), -(__v8df) (C), \
+                                             (__mmask8) (U), (R)); })
+
+
+#define _mm512_maskz_fmsub_round_pd(U, A, B, C, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) (A), \
+                                              (__v8df) (B), -(__v8df) (C), \
+                                              (__mmask8) (U), (R)); })
+
+
+#define _mm512_fnmadd_round_pd(A, B, C, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfmaddpd512_mask (-(__v8df) (A), \
+                                             (__v8df) (B), (__v8df) (C), \
+                                             (__mmask8) -1, (R)); })
+
+
+#define _mm512_mask3_fnmadd_round_pd(A, B, C, U, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfmaddpd512_mask3 (-(__v8df) (A), \
+                                              (__v8df) (B), (__v8df) (C), \
+                                              (__mmask8) (U), (R)); })
+
+
+#define _mm512_maskz_fnmadd_round_pd(U, A, B, C, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) (A), \
+                                              (__v8df) (B), (__v8df) (C), \
+                                              (__mmask8) (U), (R)); })
+
+
+#define _mm512_fnmsub_round_pd(A, B, C, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfmaddpd512_mask (-(__v8df) (A), \
+                                             (__v8df) (B), -(__v8df) (C), \
+                                             (__mmask8) -1, (R)); })
+
+
+#define _mm512_maskz_fnmsub_round_pd(U, A, B, C, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) (A), \
+                                              (__v8df) (B), -(__v8df) (C), \
+                                              (__mmask8) (U), (R)); })
+
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_fmadd_pd(__m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
+                                                    (__v8df) __B,
+                                                    (__v8df) __C,
+                                                    (__mmask8) -1,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_fmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
+                                                    (__v8df) __B,
+                                                    (__v8df) __C,
+                                                    (__mmask8) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask3_fmadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask3 ((__v8df) __A,
+                                                     (__v8df) __B,
+                                                     (__v8df) __C,
+                                                     (__mmask8) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_fmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A,
+                                                     (__v8df) __B,
+                                                     (__v8df) __C,
+                                                     (__mmask8) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_fmsub_pd(__m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
+                                                    (__v8df) __B,
+                                                    -(__v8df) __C,
+                                                    (__mmask8) -1,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_fmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
+                                                    (__v8df) __B,
+                                                    -(__v8df) __C,
+                                                    (__mmask8) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_fmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A,
+                                                     (__v8df) __B,
+                                                     -(__v8df) __C,
+                                                     (__mmask8) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask (-(__v8df) __A,
+                                                    (__v8df) __B,
+                                                    (__v8df) __C,
+                                                    (__mmask8) -1,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask3_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask3 (-(__v8df) __A,
+                                                     (__v8df) __B,
+                                                     (__v8df) __C,
+                                                     (__mmask8) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_fnmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A,
+                                                     (__v8df) __B,
+                                                     (__v8df) __C,
+                                                     (__mmask8) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask (-(__v8df) __A,
+                                                    (__v8df) __B,
+                                                    -(__v8df) __C,
+                                                    (__mmask8) -1,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_fnmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A,
+                                                     (__v8df) __B,
+                                                     -(__v8df) __C,
+                                                     (__mmask8) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
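+/*
+ * Editor's note (illustrative sketch; not part of the original header): the
+ * fused multiply-add family computes A*B+C per element with a single rounding.
+ * fmsub negates C, fnmadd negates the product, and fnmsub negates both, which
+ * is what the sign flips on the builtin arguments above spell out.
+ *
+ *   __m512d r = _mm512_fmadd_pd(a, b, c);   // r[i] =  a[i]*b[i] + c[i]
+ *   __m512d t = _mm512_fnmsub_pd(a, b, c);  // t[i] = -a[i]*b[i] - c[i]
+ */
+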
+#define _mm512_fmadd_round_ps(A, B, C, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) (A), \
+                                            (__v16sf) (B), (__v16sf) (C), \
+                                            (__mmask16) -1, (R)); })
+
+
+#define _mm512_mask_fmadd_round_ps(A, U, B, C, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) (A), \
+                                            (__v16sf) (B), (__v16sf) (C), \
+                                            (__mmask16) (U), (R)); })
+
+
+#define _mm512_mask3_fmadd_round_ps(A, B, C, U, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfmaddps512_mask3 ((__v16sf) (A), \
+                                             (__v16sf) (B), (__v16sf) (C), \
+                                             (__mmask16) (U), (R)); })
+
+
+#define _mm512_maskz_fmadd_round_ps(U, A, B, C, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) (A), \
+                                             (__v16sf) (B), (__v16sf) (C), \
+                                             (__mmask16) (U), (R)); })
+
+
+#define _mm512_fmsub_round_ps(A, B, C, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) (A), \
+                                            (__v16sf) (B), -(__v16sf) (C), \
+                                            (__mmask16) -1, (R)); })
+
+
+#define _mm512_mask_fmsub_round_ps(A, U, B, C, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) (A), \
+                                            (__v16sf) (B), -(__v16sf) (C), \
+                                            (__mmask16) (U), (R)); })
+
+
+#define _mm512_maskz_fmsub_round_ps(U, A, B, C, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) (A), \
+                                             (__v16sf) (B), -(__v16sf) (C), \
+                                             (__mmask16) (U), (R)); })
+
+
+#define _mm512_fnmadd_round_ps(A, B, C, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfmaddps512_mask (-(__v16sf) (A), \
+                                            (__v16sf) (B), (__v16sf) (C), \
+                                            (__mmask16) -1, (R)); })
+
+
+#define _mm512_mask3_fnmadd_round_ps(A, B, C, U, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfmaddps512_mask3 (-(__v16sf) (A), \
+                                             (__v16sf) (B), (__v16sf) (C), \
+                                             (__mmask16) (U), (R)); })
+
+
+#define _mm512_maskz_fnmadd_round_ps(U, A, B, C, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) (A), \
+                                             (__v16sf) (B), (__v16sf) (C), \
+                                             (__mmask16) (U), (R)); })
+
+
+#define _mm512_fnmsub_round_ps(A, B, C, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfmaddps512_mask (-(__v16sf) (A), \
+                                            (__v16sf) (B), -(__v16sf) (C), \
+                                            (__mmask16) -1, (R)); })
+
+
+#define _mm512_maskz_fnmsub_round_ps(U, A, B, C, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) (A), \
+                                             (__v16sf) (B), -(__v16sf) (C), \
+                                             (__mmask16) (U), (R)); })
+
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_fmadd_ps(__m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
+                                                   (__v16sf) __B,
+                                                   (__v16sf) __C,
+                                                   (__mmask16) -1,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_fmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
+                                                   (__v16sf) __B,
+                                                   (__v16sf) __C,
+                                                   (__mmask16) __U,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask3_fmadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_mask3 ((__v16sf) __A,
+                                                    (__v16sf) __B,
+                                                    (__v16sf) __C,
+                                                    (__mmask16) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_fmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A,
+                                                    (__v16sf) __B,
+                                                    (__v16sf) __C,
+                                                    (__mmask16) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_fmsub_ps(__m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
+                                                   (__v16sf) __B,
+                                                   -(__v16sf) __C,
+                                                   (__mmask16) -1,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_fmsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
+                                                   (__v16sf) __B,
+                                                   -(__v16sf) __C,
+                                                   (__mmask16) __U,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_fmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A,
+                                                    (__v16sf) __B,
+                                                    -(__v16sf) __C,
+                                                    (__mmask16) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_mask (-(__v16sf) __A,
+                                                   (__v16sf) __B,
+                                                   (__v16sf) __C,
+                                                   (__mmask16) -1,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask3_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_mask3 (-(__v16sf) __A,
+                                                    (__v16sf) __B,
+                                                    (__v16sf) __C,
+                                                    (__mmask16) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_fnmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A,
+                                                    (__v16sf) __B,
+                                                    (__v16sf) __C,
+                                                    (__mmask16) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_mask (-(__v16sf) __A,
+                                                   (__v16sf) __B,
+                                                   -(__v16sf) __C,
+                                                   (__mmask16) -1,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_fnmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A,
+                                                    (__v16sf) __B,
+                                                    -(__v16sf) __C,
+                                                    (__mmask16) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_fmaddsub_round_pd(A, B, C, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) (A), \
+                                                (__v8df) (B), (__v8df) (C), \
+                                                (__mmask8) -1, (R)); })
+
+
+#define _mm512_mask_fmaddsub_round_pd(A, U, B, C, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) (A), \
+                                                (__v8df) (B), (__v8df) (C), \
+                                                (__mmask8) (U), (R)); })
+
+
+#define _mm512_mask3_fmaddsub_round_pd(A, B, C, U, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfmaddsubpd512_mask3 ((__v8df) (A), \
+                                                 (__v8df) (B), (__v8df) (C), \
+                                                 (__mmask8) (U), (R)); })
+
+
+#define _mm512_maskz_fmaddsub_round_pd(U, A, B, C, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) (A), \
+                                                 (__v8df) (B), (__v8df) (C), \
+                                                 (__mmask8) (U), (R)); })
+
+
+#define _mm512_fmsubadd_round_pd(A, B, C, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) (A), \
+                                                (__v8df) (B), -(__v8df) (C), \
+                                                (__mmask8) -1, (R)); })
+
+
+#define _mm512_mask_fmsubadd_round_pd(A, U, B, C, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) (A), \
+                                                (__v8df) (B), -(__v8df) (C), \
+                                                (__mmask8) (U), (R)); })
+
+
+#define _mm512_maskz_fmsubadd_round_pd(U, A, B, C, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) (A), \
+                                                 (__v8df) (B), -(__v8df) (C), \
+                                                 (__mmask8) (U), (R)); })
+
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
+                                                       (__v8df) __B,
+                                                       (__v8df) __C,
+                                                       (__mmask8) -1,
+                                                       _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_fmaddsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
+                                                       (__v8df) __B,
+                                                       (__v8df) __C,
+                                                       (__mmask8) __U,
+                                                       _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask3_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
+{
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask3 ((__v8df) __A,
+                                                        (__v8df) __B,
+                                                        (__v8df) __C,
+                                                        (__mmask8) __U,
+                                                        _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_fmaddsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A,
+                                                        (__v8df) __B,
+                                                        (__v8df) __C,
+                                                        (__mmask8) __U,
+                                                        _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
+                                                       (__v8df) __B,
+                                                       -(__v8df) __C,
+                                                       (__mmask8) -1,
+                                                       _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_fmsubadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
+                                                       (__v8df) __B,
+                                                       -(__v8df) __C,
+                                                       (__mmask8) __U,
+                                                       _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_fmsubadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A,
+                                                        (__v8df) __B,
+                                                        -(__v8df) __C,
+                                                        (__mmask8) __U,
+                                                        _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_fmaddsub_round_ps(A, B, C, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) (A), \
+                                               (__v16sf) (B), (__v16sf) (C), \
+                                               (__mmask16) -1, (R)); })
+
+
+#define _mm512_mask_fmaddsub_round_ps(A, U, B, C, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) (A), \
+                                               (__v16sf) (B), (__v16sf) (C), \
+                                               (__mmask16) (U), (R)); })
+
+
+#define _mm512_mask3_fmaddsub_round_ps(A, B, C, U, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfmaddsubps512_mask3 ((__v16sf) (A), \
+                                                (__v16sf) (B), (__v16sf) (C), \
+                                                (__mmask16) (U), (R)); })
+
+
+#define _mm512_maskz_fmaddsub_round_ps(U, A, B, C, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) (A), \
+                                                (__v16sf) (B), (__v16sf) (C), \
+                                                (__mmask16) (U), (R)); })
+
+
+#define _mm512_fmsubadd_round_ps(A, B, C, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) (A), \
+                                               (__v16sf) (B), -(__v16sf) (C), \
+                                               (__mmask16) -1, (R)); })
+
+
+#define _mm512_mask_fmsubadd_round_ps(A, U, B, C, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) (A), \
+                                               (__v16sf) (B), -(__v16sf) (C), \
+                                               (__mmask16) (U), (R)); })
+
+
+#define _mm512_maskz_fmsubadd_round_ps(U, A, B, C, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) (A), \
+                                                (__v16sf) (B), -(__v16sf) (C), \
+                                                (__mmask16) (U), (R)); })
+
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
+                                                      (__v16sf) __B,
+                                                      (__v16sf) __C,
+                                                      (__mmask16) -1,
+                                                      _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_fmaddsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
+                                                      (__v16sf) __B,
+                                                      (__v16sf) __C,
+                                                      (__mmask16) __U,
+                                                      _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask3_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
+{
+  return (__m512) __builtin_ia32_vfmaddsubps512_mask3 ((__v16sf) __A,
+                                                       (__v16sf) __B,
+                                                       (__v16sf) __C,
+                                                       (__mmask16) __U,
+                                                       _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_fmaddsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A,
+                                                       (__v16sf) __B,
+                                                       (__v16sf) __C,
+                                                       (__mmask16) __U,
+                                                       _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
+                                                      (__v16sf) __B,
+                                                      -(__v16sf) __C,
+                                                      (__mmask16) -1,
+                                                      _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_fmsubadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
+                                                      (__v16sf) __B,
+                                                      -(__v16sf) __C,
+                                                      (__mmask16) __U,
+                                                      _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_fmsubadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A,
+                                                       (__v16sf) __B,
+                                                       -(__v16sf) __C,
+                                                       (__mmask16) __U,
+                                                       _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask3_fmsub_round_pd(A, B, C, U, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfmsubpd512_mask3 ((__v8df) (A), \
+                                              (__v8df) (B), (__v8df) (C), \
+                                              (__mmask8) (U), (R)); })
+
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask3_fmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
+{
+  return (__m512d) __builtin_ia32_vfmsubpd512_mask3 ((__v8df) __A,
+                                                     (__v8df) __B,
+                                                     (__v8df) __C,
+                                                     (__mmask8) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask3_fmsub_round_ps(A, B, C, U, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfmsubps512_mask3 ((__v16sf) (A), \
+                                             (__v16sf) (B), (__v16sf) (C), \
+                                             (__mmask16) (U), (R)); })
+
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask3_fmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
+{
+  return (__m512) __builtin_ia32_vfmsubps512_mask3 ((__v16sf) __A,
+                                                    (__v16sf) __B,
+                                                    (__v16sf) __C,
+                                                    (__mmask16) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask3_fmsubadd_round_pd(A, B, C, U, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfmsubaddpd512_mask3 ((__v8df) (A), \
+                                                 (__v8df) (B), (__v8df) (C), \
+                                                 (__mmask8) (U), (R)); })
+
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask3_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
+{
+  return (__m512d) __builtin_ia32_vfmsubaddpd512_mask3 ((__v8df) __A,
+                                                        (__v8df) __B,
+                                                        (__v8df) __C,
+                                                        (__mmask8) __U,
+                                                        _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask3_fmsubadd_round_ps(A, B, C, U, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfmsubaddps512_mask3 ((__v16sf) (A), \
+                                                (__v16sf) (B), (__v16sf) (C), \
+                                                (__mmask16) (U), (R)); })
+
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask3_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
+{
+  return (__m512) __builtin_ia32_vfmsubaddps512_mask3 ((__v16sf) __A,
+                                                       (__v16sf) __B,
+                                                       (__v16sf) __C,
+                                                       (__mmask16) __U,
+                                                       _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask_fnmadd_round_pd(A, U, B, C, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfnmaddpd512_mask ((__v8df) (A), \
+                                              (__v8df) (B), (__v8df) (C), \
+                                              (__mmask8) (U), (R)); })
+
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_fnmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfnmaddpd512_mask ((__v8df) __A,
+                                                     (__v8df) __B,
+                                                     (__v8df) __C,
+                                                     (__mmask8) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask_fnmadd_round_ps(A, U, B, C, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfnmaddps512_mask ((__v16sf) (A), \
+                                             (__v16sf) (B), (__v16sf) (C), \
+                                             (__mmask16) (U), (R)); })
+
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_fnmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfnmaddps512_mask ((__v16sf) __A,
+                                                    (__v16sf) __B,
+                                                    (__v16sf) __C,
+                                                    (__mmask16) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask_fnmsub_round_pd(A, U, B, C, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfnmsubpd512_mask ((__v8df) (A), \
+                                              (__v8df) (B), (__v8df) (C), \
+                                              (__mmask8) (U), (R)); })
+
+
+#define _mm512_mask3_fnmsub_round_pd(A, B, C, U, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfnmsubpd512_mask3 ((__v8df) (A), \
+                                               (__v8df) (B), (__v8df) (C), \
+                                               (__mmask8) (U), (R)); })
+
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_fnmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfnmsubpd512_mask ((__v8df) __A,
+                                                     (__v8df) __B,
+                                                     (__v8df) __C,
+                                                     (__mmask8) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask3_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
+{
+  return (__m512d) __builtin_ia32_vfnmsubpd512_mask3 ((__v8df) __A,
+                                                      (__v8df) __B,
+                                                      (__v8df) __C,
+                                                      (__mmask8) __U,
+                                                      _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask_fnmsub_round_ps(A, U, B, C, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfnmsubps512_mask ((__v16sf) (A), \
+                                             (__v16sf) (B), (__v16sf) (C), \
+                                             (__mmask16) (U), (R)); })
+
+
+#define _mm512_mask3_fnmsub_round_ps(A, B, C, U, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfnmsubps512_mask3 ((__v16sf) (A), \
+                                              (__v16sf) (B), (__v16sf) (C), \
+                                              (__mmask16) (U), (R)); })
+
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_fnmsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfnmsubps512_mask ((__v16sf) __A,
+                                                    (__v16sf) __B,
+                                                    (__v16sf) __C,
+                                                    (__mmask16) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask3_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
+{
+  return (__m512) __builtin_ia32_vfnmsubps512_mask3 ((__v16sf) __A,
+                                                     (__v16sf) __B,
+                                                     (__v16sf) __C,
+                                                     (__mmask16) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+
+
+/* Vector permutations */
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_permutex2var_epi32(__m512i __A, __m512i __I, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermt2vard512_mask ((__v16si) __I
+                                                       /* idx */ ,
+                                                       (__v16si) __A,
+                                                       (__v16si) __B,
+                                                       (__mmask16) -1);
+}
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_permutex2var_epi64(__m512i __A, __m512i __I, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermt2varq512_mask ((__v8di) __I
+                                                       /* idx */ ,
+                                                       (__v8di) __A,
+                                                       (__v8di) __B,
+                                                       (__mmask8) -1);
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_permutex2var_pd(__m512d __A, __m512i __I, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_vpermt2varpd512_mask ((__v8di) __I
+                                                        /* idx */ ,
+                                                        (__v8df) __A,
+                                                        (__v8df) __B,
+                                                        (__mmask8) -1);
+}
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_permutex2var_ps(__m512 __A, __m512i __I, __m512 __B)
+{
+  return (__m512) __builtin_ia32_vpermt2varps512_mask ((__v16si) __I
+                                                       /* idx */ ,
+                                                       (__v16sf) __A,
+                                                       (__v16sf) __B,
+                                                       (__mmask16) -1);
+}
+
+#define _mm512_alignr_epi64(A, B, I) __extension__ ({ \
+  (__m512i)__builtin_ia32_alignq512_mask((__v8di)(__m512i)(A), \
+                                         (__v8di)(__m512i)(B), \
+                                         (I), (__v8di)_mm512_setzero_si512(), \
+                                         (__mmask8)-1); })
+
+#define _mm512_alignr_epi32(A, B, I) __extension__ ({ \
+  (__m512i)__builtin_ia32_alignd512_mask((__v16si)(__m512i)(A), \
+                                         (__v16si)(__m512i)(B), \
+                                         (I), (__v16si)_mm512_setzero_si512(), \
+                                         (__mmask16)-1); })
+
+/* Vector Extract */
+
+#define _mm512_extractf64x4_pd(A, I) __extension__ ({                    \
+      (__m256d)                                                          \
+        __builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A),           \
+                                         (I),                            \
+                                         (__v4df)_mm256_setzero_si256(), \
+                                         (__mmask8) -1); })
+
+#define _mm512_extractf32x4_ps(A, I) __extension__ ({                    \
+      (__m128)                                                           \
+        __builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A),           \
+                                         (I),                            \
+                                         (__v4sf)_mm_setzero_ps(),       \
+                                         (__mmask8) -1); })
+
+/* Vector Blend */
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_blend_pd(__mmask8 __U, __m512d __A, __m512d __W)
+{
+  return (__m512d) __builtin_ia32_blendmpd_512_mask ((__v8df) __A,
+                 (__v8df) __W,
+                 (__mmask8) __U);
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_blend_ps(__mmask16 __U, __m512 __A, __m512 __W)
+{
+  return (__m512) __builtin_ia32_blendmps_512_mask ((__v16sf) __A,
+                (__v16sf) __W,
+                (__mmask16) __U);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_blend_epi64(__mmask8 __U, __m512i __A, __m512i __W)
+{
+  return (__m512i) __builtin_ia32_blendmq_512_mask ((__v8di) __A,
+                (__v8di) __W,
+                (__mmask8) __U);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_blend_epi32(__mmask16 __U, __m512i __A, __m512i __W)
+{
+  return (__m512i) __builtin_ia32_blendmd_512_mask ((__v16si) __A,
+                (__v16si) __W,
+                (__mmask16) __U);
+}
+
+/* Compare */
+
+#define _mm512_cmp_round_ps_mask(A, B, P, R) __extension__ ({ \
+  (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)(__m512)(A), \
+                                          (__v16sf)(__m512)(B), \
+                                          (P), (__mmask16)-1, (R)); })
+
+#define _mm512_mask_cmp_round_ps_mask(U, A, B, P, R) __extension__ ({ \
+  (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)(__m512)(A), \
+                                          (__v16sf)(__m512)(B), \
+                                          (P), (__mmask16)(U), (R)); })
+
+#define _mm512_cmp_ps_mask(A, B, P) \
+  _mm512_cmp_round_ps_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_cmp_ps_mask(U, A, B, P) \
+  _mm512_mask_cmp_round_ps_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_cmp_round_pd_mask(A, B, P, R) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)(__m512d)(A), \
+                                         (__v8df)(__m512d)(B), \
+                                         (P), (__mmask8)-1, (R)); })
+
+#define _mm512_mask_cmp_round_pd_mask(U, A, B, P, R) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)(__m512d)(A), \
+                                         (__v8df)(__m512d)(B), \
+                                         (P), (__mmask8)(U), (R)); })
+
+#define _mm512_cmp_pd_mask(A, B, P) \
+  _mm512_cmp_round_pd_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_cmp_pd_mask(U, A, B, P) \
+  _mm512_mask_cmp_round_pd_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)
+
+/* Conversion */
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_cvttps_epu32(__m512 __A)
+{
+  return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
+                  (__v16si)
+                  _mm512_setzero_si512 (),
+                  (__mmask16) -1,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundepi32_ps(A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(A), \
+                                          (__v16sf)_mm512_setzero_ps(), \
+                                          (__mmask16)-1, (R)); })
+
+#define _mm512_cvt_roundepu32_ps(A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(A), \
+                                           (__v16sf)_mm512_setzero_ps(), \
+                                           (__mmask16)-1, (R)); })
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_cvtepi32_pd(__m256i __A)
+{
+  return (__m512d) __builtin_ia32_cvtdq2pd512_mask ((__v8si) __A,
+                (__v8df)
+                _mm512_setzero_pd (),
+                (__mmask8) -1);
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_cvtepu32_pd(__m256i __A)
+{
+  return (__m512d) __builtin_ia32_cvtudq2pd512_mask ((__v8si) __A,
+                (__v8df)
+                _mm512_setzero_pd (),
+                (__mmask8) -1);
+}
+
+#define _mm512_cvt_roundpd_ps(A, R) __extension__ ({ \
+  (__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(A), \
+                                          (__v8sf)_mm256_setzero_ps(), \
+                                          (__mmask8)-1, (R)); })
+
+#define _mm512_cvtps_ph(A, I) __extension__ ({ \
+  (__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(A), (I), \
+                                            (__v16hi)_mm256_setzero_si256(), \
+                                            -1); })
+
+static  __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_cvtph_ps(__m256i __A)
+{
+  return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
+                (__v16sf)
+                _mm512_setzero_ps (),
+                (__mmask16) -1,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_cvttps_epi32(__m512 __a)
+{
+  return (__m512i)
+    __builtin_ia32_cvttps2dq512_mask((__v16sf) __a,
+                                     (__v16si) _mm512_setzero_si512 (),
+                                     (__mmask16) -1, _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm512_cvttpd_epi32(__m512d __a)
+{
+  return (__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df) __a,
+                                                   (__v8si)_mm256_setzero_si256(),
+                                                   (__mmask8) -1,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvtt_roundpd_epi32(A, R) __extension__ ({ \
+  (__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(A), \
+                                            (__v8si)_mm256_setzero_si256(), \
+                                            (__mmask8)-1, (R)); })
+
+#define _mm512_cvtt_roundps_epi32(A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(A), \
+                                            (__v16si)_mm512_setzero_si512(), \
+                                            (__mmask16)-1, (R)); })
+
+#define _mm512_cvt_roundps_epi32(A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(A), \
+                                           (__v16si)_mm512_setzero_si512(), \
+                                           (__mmask16)-1, (R)); })
+
+#define _mm512_cvt_roundpd_epi32(A, R) __extension__ ({ \
+  (__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(A), \
+                                           (__v8si)_mm256_setzero_si256(), \
+                                           (__mmask8)-1, (R)); })
+
+#define _mm512_cvt_roundps_epu32(A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(A), \
+                                            (__v16si)_mm512_setzero_si512(), \
+                                            (__mmask16)-1, (R)); })
+
+#define _mm512_cvt_roundpd_epu32(A, R) __extension__ ({ \
+  (__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(A), \
+                                            (__v8si)_mm256_setzero_si256(), \
+                                            (__mmask8) -1, (R)); })
+
+/* Unpack and Interleave */
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_unpackhi_pd(__m512d __a, __m512d __b)
+{
+  return __builtin_shufflevector(__a, __b, 1, 9, 1+2, 9+2, 1+4, 9+4, 1+6, 9+6);
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_unpacklo_pd(__m512d __a, __m512d __b)
+{
+  return __builtin_shufflevector(__a, __b, 0, 8, 0+2, 8+2, 0+4, 8+4, 0+6, 8+6);
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_unpackhi_ps(__m512 __a, __m512 __b)
+{
+  return __builtin_shufflevector(__a, __b,
+                                 2,    18,    3,    19,
+                                 2+4,  18+4,  3+4,  19+4,
+                                 2+8,  18+8,  3+8,  19+8,
+                                 2+12, 18+12, 3+12, 19+12);
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_unpacklo_ps(__m512 __a, __m512 __b)
+{
+  return __builtin_shufflevector(__a, __b,
+                                 0,    16,    1,    17,
+                                 0+4,  16+4,  1+4,  17+4,
+                                 0+8,  16+8,  1+8,  17+8,
+                                 0+12, 16+12, 1+12, 17+12);
+}
+
+/* Bit Test */
+
+static __inline __mmask16 __DEFAULT_FN_ATTRS
+_mm512_test_epi32_mask(__m512i __A, __m512i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestmd512 ((__v16si) __A,
+            (__v16si) __B,
+            (__mmask16) -1);
+}
+
+static __inline __mmask8 __DEFAULT_FN_ATTRS
+_mm512_test_epi64_mask(__m512i __A, __m512i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestmq512 ((__v8di) __A,
+                 (__v8di) __B,
+                 (__mmask8) -1);
+}
+
+/* SIMD load ops */
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_loadu_epi32(__mmask16 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_loaddqusi512_mask ((const __v16si *)__P,
+                                                     (__v16si)
+                                                     _mm512_setzero_si512 (),
+                                                     (__mmask16) __U);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_loadu_epi64(__mmask8 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_loaddqudi512_mask ((const __v8di *)__P,
+                                                     (__v8di)
+                                                     _mm512_setzero_si512 (),
+                                                     (__mmask8) __U);
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_loadu_ps(__mmask16 __U, void const *__P)
+{
+  return (__m512) __builtin_ia32_loadups512_mask ((const __v16sf *)__P,
+                                                  (__v16sf)
+                                                  _mm512_setzero_ps (),
+                                                  (__mmask16) __U);
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_loadu_pd(__mmask8 __U, void const *__P)
+{
+  return (__m512d) __builtin_ia32_loadupd512_mask ((const __v8df *)__P,
+                                                   (__v8df)
+                                                   _mm512_setzero_pd (),
+                                                   (__mmask8) __U);
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_load_ps(__mmask16 __U, void const *__P)
+{
+  return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *)__P,
+                                                  (__v16sf)
+                                                  _mm512_setzero_ps (),
+                                                  (__mmask16) __U);
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_load_pd(__mmask8 __U, void const *__P)
+{
+  return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *)__P,
+                                                   (__v8df)
+                                                   _mm512_setzero_pd (),
+                                                   (__mmask8) __U);
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_loadu_pd(double const *__p)
+{
+  struct __loadu_pd {
+    __m512d __v;
+  } __attribute__((__packed__, __may_alias__));
+  return ((struct __loadu_pd*)__p)->__v;
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_loadu_ps(float const *__p)
+{
+  struct __loadu_ps {
+    __m512 __v;
+  } __attribute__((__packed__, __may_alias__));
+  return ((struct __loadu_ps*)__p)->__v;
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_load_ps(float const *__p)
+{
+  return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *)__p,
+                                                  (__v16sf)
+                                                  _mm512_setzero_ps (),
+                                                  (__mmask16) -1);
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_load_pd(double const *__p)
+{
+  return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *)__p,
+                                                   (__v8df)
+                                                   _mm512_setzero_pd (),
+                                                   (__mmask8) -1);
+}
+
+/* SIMD store ops */
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm512_mask_storeu_epi64(void *__P, __mmask8 __U, __m512i __A)
+{
+  __builtin_ia32_storedqudi512_mask ((__v8di *)__P, (__v8di) __A,
+                                     (__mmask8) __U);
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm512_mask_storeu_epi32(void *__P, __mmask16 __U, __m512i __A)
+{
+  __builtin_ia32_storedqusi512_mask ((__v16si *)__P, (__v16si) __A,
+                                     (__mmask16) __U);
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm512_mask_storeu_pd(void *__P, __mmask8 __U, __m512d __A)
+{
+  __builtin_ia32_storeupd512_mask ((__v8df *)__P, (__v8df) __A, (__mmask8) __U);
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm512_storeu_pd(void *__P, __m512d __A)
+{
+  __builtin_ia32_storeupd512_mask((__v8df *)__P, (__v8df)__A, (__mmask8)-1);
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm512_mask_storeu_ps(void *__P, __mmask16 __U, __m512 __A)
+{
+  __builtin_ia32_storeups512_mask ((__v16sf *)__P, (__v16sf) __A,
+                                   (__mmask16) __U);
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm512_storeu_ps(void *__P, __m512 __A)
+{
+  __builtin_ia32_storeups512_mask((__v16sf *)__P, (__v16sf)__A, (__mmask16)-1);
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm512_mask_store_pd(void *__P, __mmask8 __U, __m512d __A)
+{
+  __builtin_ia32_storeapd512_mask ((__v8df *)__P, (__v8df) __A, (__mmask8) __U);
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm512_store_pd(void *__P, __m512d __A)
+{
+  *(__m512d*)__P = __A;
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm512_mask_store_ps(void *__P, __mmask16 __U, __m512 __A)
+{
+  __builtin_ia32_storeaps512_mask ((__v16sf *)__P, (__v16sf) __A,
+                                   (__mmask16) __U);
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm512_store_ps(void *__P, __m512 __A)
+{
+  *(__m512*)__P = __A;
+}
+
+/* Mask ops */
+
+static __inline __mmask16 __DEFAULT_FN_ATTRS
+_mm512_knot(__mmask16 __M)
+{
+  return __builtin_ia32_knothi(__M);
+}
+
+/* Integer compare */
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_cmpeq_epi32_mask(__m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_pcmpeqd512_mask((__v16si)__a, (__v16si)__b,
+                                                   (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpeq_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_pcmpeqd512_mask((__v16si)__a, (__v16si)__b,
+                                                   __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_cmpeq_epu32_mask(__m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 0,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpeq_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 0,
+                                                 __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpeq_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_pcmpeqq512_mask((__v8di)__a, (__v8di)__b,
+                                                  __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_cmpeq_epi64_mask(__m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_pcmpeqq512_mask((__v8di)__a, (__v8di)__b,
+                                                  (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_cmpeq_epu64_mask(__m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 0,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpeq_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 0,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_cmpge_epi32_mask(__m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 5,
+                                                (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpge_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 5,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_cmpge_epu32_mask(__m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 5,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpge_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 5,
+                                                 __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_cmpge_epi64_mask(__m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 5,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpge_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 5,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_cmpge_epu64_mask(__m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 5,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpge_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 5,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_cmpgt_epi32_mask(__m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_pcmpgtd512_mask((__v16si)__a, (__v16si)__b,
+                                                   (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpgt_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_pcmpgtd512_mask((__v16si)__a, (__v16si)__b,
+                                                   __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_cmpgt_epu32_mask(__m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 6,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpgt_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 6,
+                                                 __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpgt_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_pcmpgtq512_mask((__v8di)__a, (__v8di)__b,
+                                                  __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_cmpgt_epi64_mask(__m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_pcmpgtq512_mask((__v8di)__a, (__v8di)__b,
+                                                  (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_cmpgt_epu64_mask(__m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 6,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpgt_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 6,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_cmple_epi32_mask(__m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 2,
+                                                (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_mask_cmple_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 2,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_cmple_epu32_mask(__m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 2,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_mask_cmple_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 2,
+                                                 __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_cmple_epi64_mask(__m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 2,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_mask_cmple_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 2,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_cmple_epu64_mask(__m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 2,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_mask_cmple_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 2,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_cmplt_epi32_mask(__m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 1,
+                                                (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_mask_cmplt_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 1,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_cmplt_epu32_mask(__m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 1,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_mask_cmplt_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 1,
+                                                 __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_cmplt_epi64_mask(__m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 1,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_mask_cmplt_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 1,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_cmplt_epu64_mask(__m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 1,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_mask_cmplt_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 1,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_cmpneq_epi32_mask(__m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 4,
+                                                (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpneq_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 4,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_cmpneq_epu32_mask(__m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 4,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpneq_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 4,
+                                                 __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_cmpneq_epi64_mask(__m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 4,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpneq_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 4,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_cmpneq_epu64_mask(__m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 4,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpneq_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 4,
+                                                __u);
+}
+
+#define _mm512_cmp_epi32_mask(a, b, p) __extension__ ({ \
+  (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)(__m512i)(a), \
+                                         (__v16si)(__m512i)(b), (p), \
+                                         (__mmask16)-1); })
+
+#define _mm512_cmp_epu32_mask(a, b, p) __extension__ ({ \
+  (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)(__m512i)(a), \
+                                          (__v16si)(__m512i)(b), (p), \
+                                          (__mmask16)-1); })
+
+#define _mm512_cmp_epi64_mask(a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)(__m512i)(a), \
+                                        (__v8di)(__m512i)(b), (p), \
+                                        (__mmask8)-1); })
+
+#define _mm512_cmp_epu64_mask(a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)(__m512i)(a), \
+                                         (__v8di)(__m512i)(b), (p), \
+                                         (__mmask8)-1); })
+
+#define _mm512_mask_cmp_epi32_mask(m, a, b, p) __extension__ ({ \
+  (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)(__m512i)(a), \
+                                         (__v16si)(__m512i)(b), (p), \
+                                         (__mmask16)(m)); })
+
+#define _mm512_mask_cmp_epu32_mask(m, a, b, p) __extension__ ({ \
+  (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)(__m512i)(a), \
+                                          (__v16si)(__m512i)(b), (p), \
+                                          (__mmask16)(m)); })
+
+#define _mm512_mask_cmp_epi64_mask(m, a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)(__m512i)(a), \
+                                        (__v8di)(__m512i)(b), (p), \
+                                        (__mmask8)(m)); })
+
+#define _mm512_mask_cmp_epu64_mask(m, a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)(__m512i)(a), \
+                                         (__v8di)(__m512i)(b), (p), \
+                                         (__mmask8)(m)); })
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif // __AVX512FINTRIN_H
diff --git a/24.0.3/clang-include/avx512vlbwintrin.h b/24.0.3/clang-include/avx512vlbwintrin.h
new file mode 100644
index 0000000..b4542d6
--- /dev/null
+++ b/24.0.3/clang-include/avx512vlbwintrin.h
@@ -0,0 +1,2336 @@
+/*===---- avx512vlbwintrin.h - AVX512VL and AVX512BW intrinsics ------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512vlbwintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512VLBWINTRIN_H
+#define __AVX512VLBWINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512bw")))
+
+/* Integer compare */
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_cmpeq_epi8_mask(__m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_pcmpeqb128_mask((__v16qi)__a, (__v16qi)__b,
+                                                   (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_mask_cmpeq_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_pcmpeqb128_mask((__v16qi)__a, (__v16qi)__b,
+                                                   __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_cmpeq_epu8_mask(__m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 0,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_mask_cmpeq_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 0,
+                                                 __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_cmpeq_epi8_mask(__m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_pcmpeqb256_mask((__v32qi)__a, (__v32qi)__b,
+                                                   (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpeq_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_pcmpeqb256_mask((__v32qi)__a, (__v32qi)__b,
+                                                   __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_cmpeq_epu8_mask(__m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 0,
+                                                 (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpeq_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 0,
+                                                 __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpeq_epi16_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_pcmpeqw128_mask((__v8hi)__a, (__v8hi)__b,
+                                                  (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpeq_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_pcmpeqw128_mask((__v8hi)__a, (__v8hi)__b,
+                                                  __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpeq_epu16_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 0,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpeq_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 0,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_cmpeq_epi16_mask(__m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_pcmpeqw256_mask((__v16hi)__a, (__v16hi)__b,
+                                                   (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpeq_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_pcmpeqw256_mask((__v16hi)__a, (__v16hi)__b,
+                                                   __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_cmpeq_epu16_mask(__m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 0,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpeq_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 0,
+                                                 __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_cmpge_epi8_mask(__m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 5,
+                                                (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_mask_cmpge_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 5,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_cmpge_epu8_mask(__m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 5,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_mask_cmpge_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 5,
+                                                 __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_cmpge_epi8_mask(__m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 5,
+                                                (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpge_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 5,
+                                                __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_cmpge_epu8_mask(__m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 5,
+                                                 (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpge_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 5,
+                                                 __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpge_epi16_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 5,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpge_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 5,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpge_epu16_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 5,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpge_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 5,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_cmpge_epi16_mask(__m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 5,
+                                                (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpge_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 5,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_cmpge_epu16_mask(__m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 5,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpge_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 5,
+                                                 __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_cmpgt_epi8_mask(__m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_pcmpgtb128_mask((__v16qi)__a, (__v16qi)__b,
+                                                   (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_mask_cmpgt_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_pcmpgtb128_mask((__v16qi)__a, (__v16qi)__b,
+                                                   __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_cmpgt_epu8_mask(__m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 6,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_mask_cmpgt_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 6,
+                                                 __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_cmpgt_epi8_mask(__m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_pcmpgtb256_mask((__v32qi)__a, (__v32qi)__b,
+                                                   (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpgt_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_pcmpgtb256_mask((__v32qi)__a, (__v32qi)__b,
+                                                   __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_cmpgt_epu8_mask(__m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 6,
+                                                 (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpgt_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 6,
+                                                 __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpgt_epi16_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_pcmpgtw128_mask((__v8hi)__a, (__v8hi)__b,
+                                                  (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpgt_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_pcmpgtw128_mask((__v8hi)__a, (__v8hi)__b,
+                                                  __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpgt_epu16_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 6,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpgt_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 6,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_cmpgt_epi16_mask(__m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_pcmpgtw256_mask((__v16hi)__a, (__v16hi)__b,
+                                                   (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpgt_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_pcmpgtw256_mask((__v16hi)__a, (__v16hi)__b,
+                                                   __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_cmpgt_epu16_mask(__m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 6,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpgt_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 6,
+                                                 __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_cmple_epi8_mask(__m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 2,
+                                                (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_mask_cmple_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 2,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_cmple_epu8_mask(__m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 2,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_mask_cmple_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 2,
+                                                 __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_cmple_epi8_mask(__m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 2,
+                                                (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_mask_cmple_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 2,
+                                                __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_cmple_epu8_mask(__m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 2,
+                                                 (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_mask_cmple_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 2,
+                                                 __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmple_epi16_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 2,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmple_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 2,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmple_epu16_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 2,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmple_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 2,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_cmple_epi16_mask(__m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 2,
+                                                (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_mask_cmple_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 2,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_cmple_epu16_mask(__m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 2,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_mask_cmple_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 2,
+                                                 __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_cmplt_epi8_mask(__m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 1,
+                                                (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_mask_cmplt_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 1,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_cmplt_epu8_mask(__m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 1,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_mask_cmplt_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 1,
+                                                 __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_cmplt_epi8_mask(__m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 1,
+                                                (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_mask_cmplt_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 1,
+                                                __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_cmplt_epu8_mask(__m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 1,
+                                                 (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_mask_cmplt_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 1,
+                                                 __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmplt_epi16_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 1,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmplt_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 1,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmplt_epu16_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 1,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmplt_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 1,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_cmplt_epi16_mask(__m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 1,
+                                                (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_mask_cmplt_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 1,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_cmplt_epu16_mask(__m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 1,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_mask_cmplt_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 1,
+                                                 __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_cmpneq_epi8_mask(__m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 4,
+                                                (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_mask_cmpneq_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 4,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_cmpneq_epu8_mask(__m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 4,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_mask_cmpneq_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 4,
+                                                 __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_cmpneq_epi8_mask(__m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 4,
+                                                (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpneq_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 4,
+                                                __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_cmpneq_epu8_mask(__m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 4,
+                                                 (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpneq_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 4,
+                                                 __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpneq_epi16_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 4,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpneq_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 4,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpneq_epu16_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 4,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpneq_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 4,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_cmpneq_epi16_mask(__m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 4,
+                                                (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpneq_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 4,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_cmpneq_epu16_mask(__m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 4,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpneq_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 4,
+                                                 __u);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_add_epi8 (__m256i __W, __mmask32 __U, __m256i __A, __m256i __B){
+  return (__m256i) __builtin_ia32_paddb256_mask ((__v32qi) __A,
+             (__v32qi) __B,
+             (__v32qi) __W,
+             (__mmask32) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_add_epi8 (__mmask32 __U, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_paddb256_mask ((__v32qi) __A,
+             (__v32qi) __B,
+             (__v32qi)
+             _mm256_setzero_si256 (),
+             (__mmask32) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_add_epi16 (__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_paddw256_mask ((__v16hi) __A,
+             (__v16hi) __B,
+             (__v16hi) __W,
+             (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_add_epi16 (__mmask16 __U, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_paddw256_mask ((__v16hi) __A,
+             (__v16hi) __B,
+             (__v16hi)
+             _mm256_setzero_si256 (),
+             (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_sub_epi8 (__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_psubb256_mask ((__v32qi) __A,
+             (__v32qi) __B,
+             (__v32qi) __W,
+             (__mmask32) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_sub_epi8 (__mmask32 __U, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_psubb256_mask ((__v32qi) __A,
+             (__v32qi) __B,
+             (__v32qi)
+             _mm256_setzero_si256 (),
+             (__mmask32) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_sub_epi16 (__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_psubw256_mask ((__v16hi) __A,
+             (__v16hi) __B,
+             (__v16hi) __W,
+             (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_sub_epi16 (__mmask16 __U, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_psubw256_mask ((__v16hi) __A,
+             (__v16hi) __B,
+             (__v16hi)
+             _mm256_setzero_si256 (),
+             (__mmask16) __U);
+}
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_add_epi8 (__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_paddb128_mask ((__v16qi) __A,
+             (__v16qi) __B,
+             (__v16qi) __W,
+             (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_add_epi8 (__mmask16 __U, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_paddb128_mask ((__v16qi) __A,
+             (__v16qi) __B,
+             (__v16qi)
+             _mm_setzero_si128 (),
+             (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_add_epi16 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_paddw128_mask ((__v8hi) __A,
+             (__v8hi) __B,
+             (__v8hi) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_add_epi16 (__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_paddw128_mask ((__v8hi) __A,
+             (__v8hi) __B,
+             (__v8hi)
+             _mm_setzero_si128 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_sub_epi8 (__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_psubb128_mask ((__v16qi) __A,
+             (__v16qi) __B,
+             (__v16qi) __W,
+             (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_sub_epi8 (__mmask16 __U, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_psubb128_mask ((__v16qi) __A,
+             (__v16qi) __B,
+             (__v16qi)
+             _mm_setzero_si128 (),
+             (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_sub_epi16 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_psubw128_mask ((__v8hi) __A,
+             (__v8hi) __B,
+             (__v8hi) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_sub_epi16 (__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_psubw128_mask ((__v8hi) __A,
+             (__v8hi) __B,
+             (__v8hi)
+             _mm_setzero_si128 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_mullo_epi16 (__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pmullw256_mask ((__v16hi) __A,
+              (__v16hi) __B,
+              (__v16hi) __W,
+              (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_mullo_epi16 (__mmask16 __U, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pmullw256_mask ((__v16hi) __A,
+              (__v16hi) __B,
+              (__v16hi)
+              _mm256_setzero_si256 (),
+              (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_mullo_epi16 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pmullw128_mask ((__v8hi) __A,
+              (__v8hi) __B,
+              (__v8hi) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_mullo_epi16 (__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pmullw128_mask ((__v8hi) __A,
+              (__v8hi) __B,
+              (__v8hi)
+              _mm_setzero_si128 (),
+              (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_blend_epi8 (__mmask16 __U, __m128i __A, __m128i __W)
+{
+  return (__m128i) __builtin_ia32_blendmb_128_mask ((__v16qi) __A,
+               (__v16qi) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_blend_epi8 (__mmask32 __U, __m256i __A, __m256i __W)
+{
+  return (__m256i) __builtin_ia32_blendmb_256_mask ((__v32qi) __A,
+               (__v32qi) __W,
+               (__mmask32) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_blend_epi16 (__mmask8 __U, __m128i __A, __m128i __W)
+{
+  return (__m128i) __builtin_ia32_blendmw_128_mask ((__v8hi) __A,
+               (__v8hi) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_blend_epi16 (__mmask16 __U, __m256i __A, __m256i __W)
+{
+  return (__m256i) __builtin_ia32_blendmw_256_mask ((__v16hi) __A,
+               (__v16hi) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_abs_epi8 (__m128i __W, __mmask16 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pabsb128_mask ((__v16qi) __A,
+               (__v16qi) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_abs_epi8 (__mmask16 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pabsb128_mask ((__v16qi) __A,
+               (__v16qi) _mm_setzero_si128 (),
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_abs_epi8 (__m256i __W, __mmask32 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_pabsb256_mask ((__v32qi) __A,
+               (__v32qi) __W,
+               (__mmask32) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_abs_epi8 (__mmask32 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_pabsb256_mask ((__v32qi) __A,
+               (__v32qi) _mm256_setzero_si256 (),
+               (__mmask32) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_abs_epi16 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pabsw128_mask ((__v8hi) __A,
+               (__v8hi) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_abs_epi16 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pabsw128_mask ((__v8hi) __A,
+               (__v8hi) _mm_setzero_si128 (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_abs_epi16 (__m256i __W, __mmask16 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_pabsw256_mask ((__v16hi) __A,
+               (__v16hi) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_abs_epi16 (__mmask16 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_pabsw256_mask ((__v16hi) __A,
+               (__v16hi) _mm256_setzero_si256 (),
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_packs_epi32 (__mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_packssdw128_mask ((__v4si) __A,
+               (__v4si) __B,
+               (__v8hi) _mm_setzero_si128 (), __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_packs_epi32 (__m128i __W, __mmask16 __M, __m128i __A,
+          __m128i __B)
+{
+  return (__m128i) __builtin_ia32_packssdw128_mask ((__v4si) __A,
+               (__v4si) __B,
+               (__v8hi) __W, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_packs_epi32 (__mmask16 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_packssdw256_mask ((__v8si) __A,
+               (__v8si) __B,
+               (__v16hi) _mm256_setzero_si256 (),
+               __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_packs_epi32 (__m256i __W, __mmask16 __M, __m256i __A,
+       __m256i __B)
+{
+  return (__m256i) __builtin_ia32_packssdw256_mask ((__v8si) __A,
+               (__v8si) __B,
+               (__v16hi) __W, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_packs_epi16 (__mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_packsswb128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v16qi) _mm_setzero_si128 (),
+               __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_packs_epi16 (__m128i __W, __mmask16 __M, __m128i __A,
+          __m128i __B)
+{
+  return (__m128i) __builtin_ia32_packsswb128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v16qi) __W,
+               __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_packs_epi16 (__mmask32 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_packsswb256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v32qi) _mm256_setzero_si256 (),
+               __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_packs_epi16 (__m256i __W, __mmask32 __M, __m256i __A,
+       __m256i __B)
+{
+  return (__m256i) __builtin_ia32_packsswb256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v32qi) __W,
+               __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_packus_epi32 (__mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_packusdw128_mask ((__v4si) __A,
+               (__v4si) __B,
+               (__v8hi) _mm_setzero_si128 (),
+               __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_packus_epi32 (__m128i __W, __mmask16 __M, __m128i __A,
+           __m128i __B)
+{
+  return (__m128i) __builtin_ia32_packusdw128_mask ((__v4si) __A,
+               (__v4si) __B,
+               (__v8hi) __W, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_packus_epi32 (__mmask16 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_packusdw256_mask ((__v8si) __A,
+               (__v8si) __B,
+               (__v16hi) _mm256_setzero_si256 (),
+               __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_packus_epi32 (__m256i __W, __mmask16 __M, __m256i __A,
+        __m256i __B)
+{
+  return (__m256i) __builtin_ia32_packusdw256_mask ((__v8si) __A,
+               (__v8si) __B,
+               (__v16hi) __W,
+               __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_packus_epi16 (__mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_packuswb128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v16qi) _mm_setzero_si128 (),
+               __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_packus_epi16 (__m128i __W, __mmask16 __M, __m128i __A,
+           __m128i __B)
+{
+  return (__m128i) __builtin_ia32_packuswb128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v16qi) __W,
+               __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_packus_epi16 (__mmask32 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_packuswb256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v32qi) _mm256_setzero_si256 (),
+               __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_packus_epi16 (__m256i __W, __mmask32 __M, __m256i __A,
+        __m256i __B)
+{
+  return (__m256i) __builtin_ia32_packuswb256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v32qi) __W,
+               __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_adds_epi8 (__m128i __W, __mmask16 __U, __m128i __A,
+        __m128i __B)
+{
+  return (__m128i) __builtin_ia32_paddsb128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_adds_epi8 (__mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_paddsb128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) _mm_setzero_si128 (),
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_adds_epi8 (__m256i __W, __mmask32 __U, __m256i __A,
+           __m256i __B)
+{
+  return (__m256i) __builtin_ia32_paddsb256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) __W,
+               (__mmask32) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_adds_epi8 (__mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_paddsb256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) _mm256_setzero_si256 (),
+               (__mmask32) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_adds_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
+         __m128i __B)
+{
+  return (__m128i) __builtin_ia32_paddsw128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_adds_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_paddsw128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) _mm_setzero_si128 (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_adds_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
+      __m256i __B)
+{
+  return (__m256i) __builtin_ia32_paddsw256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_adds_epi16 (__mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_paddsw256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) _mm256_setzero_si256 (),
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_adds_epu8 (__m128i __W, __mmask16 __U, __m128i __A,
+        __m128i __B)
+{
+  return (__m128i) __builtin_ia32_paddusb128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_adds_epu8 (__mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_paddusb128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) _mm_setzero_si128 (),
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_adds_epu8 (__m256i __W, __mmask32 __U, __m256i __A,
+           __m256i __B)
+{
+  return (__m256i) __builtin_ia32_paddusb256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) __W,
+               (__mmask32) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_adds_epu8 (__mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_paddusb256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) _mm256_setzero_si256 (),
+               (__mmask32) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_adds_epu16 (__m128i __W, __mmask8 __U, __m128i __A,
+         __m128i __B)
+{
+  return (__m128i) __builtin_ia32_paddusw128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_adds_epu16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_paddusw128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) _mm_setzero_si128 (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_adds_epu16 (__m256i __W, __mmask16 __U, __m256i __A,
+      __m256i __B)
+{
+  return (__m256i) __builtin_ia32_paddusw256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_adds_epu16 (__mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_paddusw256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) _mm256_setzero_si256 (),
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_avg_epu8 (__m128i __W, __mmask16 __U, __m128i __A,
+       __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pavgb128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_avg_epu8 (__mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pavgb128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) _mm_setzero_si128 (),
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_avg_epu8 (__m256i __W, __mmask32 __U, __m256i __A,
+          __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pavgb256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) __W,
+               (__mmask32) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_avg_epu8 (__mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pavgb256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) _mm256_setzero_si256 (),
+               (__mmask32) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_avg_epu16 (__m128i __W, __mmask8 __U, __m128i __A,
+        __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pavgw128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_avg_epu16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pavgw128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) _mm_setzero_si128 (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_avg_epu16 (__m256i __W, __mmask16 __U, __m256i __A,
+           __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pavgw256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_avg_epu16 (__mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pavgw256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) _mm256_setzero_si256 (),
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_max_epi8 (__mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmaxsb128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) _mm_setzero_si128 (),
+               (__mmask16) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_max_epi8 (__m128i __W, __mmask16 __M, __m128i __A,
+       __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmaxsb128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) __W,
+               (__mmask16) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_max_epi8 (__mmask32 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmaxsb256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) _mm256_setzero_si256 (),
+               (__mmask32) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_max_epi8 (__m256i __W, __mmask32 __M, __m256i __A,
+          __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmaxsb256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) __W,
+               (__mmask32) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_max_epi16 (__mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmaxsw128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) _mm_setzero_si128 (),
+               (__mmask8) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_max_epi16 (__m128i __W, __mmask8 __M, __m128i __A,
+        __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmaxsw128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) __W,
+               (__mmask8) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_max_epi16 (__mmask16 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmaxsw256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) _mm256_setzero_si256 (),
+               (__mmask16) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_max_epi16 (__m256i __W, __mmask16 __M, __m256i __A,
+           __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmaxsw256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) __W,
+               (__mmask16) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_max_epu8 (__mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmaxub128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) _mm_setzero_si128 (),
+               (__mmask16) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_max_epu8 (__m128i __W, __mmask16 __M, __m128i __A,
+       __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmaxub128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) __W,
+               (__mmask16) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_max_epu8 (__mmask32 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmaxub256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) _mm256_setzero_si256 (),
+               (__mmask32) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_max_epu8 (__m256i __W, __mmask32 __M, __m256i __A,
+          __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmaxub256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) __W,
+               (__mmask32) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_max_epu16 (__mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmaxuw128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) _mm_setzero_si128 (),
+               (__mmask8) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_max_epu16 (__m128i __W, __mmask8 __M, __m128i __A,
+        __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmaxuw128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) __W,
+               (__mmask8) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_max_epu16 (__mmask16 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmaxuw256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) _mm256_setzero_si256 (),
+               (__mmask16) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_max_epu16 (__m256i __W, __mmask16 __M, __m256i __A,
+           __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmaxuw256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) __W,
+               (__mmask16) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_min_epi8 (__mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pminsb128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) _mm_setzero_si128 (),
+               (__mmask16) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_min_epi8 (__m128i __W, __mmask16 __M, __m128i __A,
+       __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pminsb128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) __W,
+               (__mmask16) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_min_epi8 (__mmask32 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pminsb256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) _mm256_setzero_si256 (),
+               (__mmask32) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_min_epi8 (__m256i __W, __mmask32 __M, __m256i __A,
+          __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pminsb256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) __W,
+               (__mmask32) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_min_epi16 (__mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pminsw128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) _mm_setzero_si128 (),
+               (__mmask8) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_min_epi16 (__m128i __W, __mmask8 __M, __m128i __A,
+        __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pminsw128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) __W,
+               (__mmask8) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_min_epi16 (__mmask16 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pminsw256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) _mm256_setzero_si256 (),
+               (__mmask16) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_min_epi16 (__m256i __W, __mmask16 __M, __m256i __A,
+           __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pminsw256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) __W,
+               (__mmask16) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_min_epu8 (__mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pminub128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) _mm_setzero_si128 (),
+               (__mmask16) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_min_epu8 (__m128i __W, __mmask16 __M, __m128i __A,
+       __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pminub128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) __W,
+               (__mmask16) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_min_epu8 (__mmask32 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pminub256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) _mm256_setzero_si256 (),
+               (__mmask32) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_min_epu8 (__m256i __W, __mmask32 __M, __m256i __A,
+          __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pminub256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) __W,
+               (__mmask32) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_min_epu16 (__mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pminuw128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) _mm_setzero_si128 (),
+               (__mmask8) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_min_epu16 (__m128i __W, __mmask8 __M, __m128i __A,
+        __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pminuw128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) __W,
+               (__mmask8) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_min_epu16 (__mmask16 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pminuw256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) _mm256_setzero_si256 (),
+               (__mmask16) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_min_epu16 (__m256i __W, __mmask16 __M, __m256i __A,
+           __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pminuw256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) __W,
+               (__mmask16) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_shuffle_epi8 (__m128i __W, __mmask16 __U, __m128i __A,
+           __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pshufb128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_shuffle_epi8 (__mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pshufb128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) _mm_setzero_si128 (),
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_shuffle_epi8 (__m256i __W, __mmask32 __U, __m256i __A,
+        __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pshufb256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) __W,
+               (__mmask32) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_shuffle_epi8 (__mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pshufb256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) _mm256_setzero_si256 (),
+               (__mmask32) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_subs_epi8 (__m128i __W, __mmask16 __U, __m128i __A,
+        __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psubsb128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_subs_epi8 (__mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psubsb128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) _mm_setzero_si128 (),
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_subs_epi8 (__m256i __W, __mmask32 __U, __m256i __A,
+           __m256i __B)
+{
+  return (__m256i) __builtin_ia32_psubsb256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) __W,
+               (__mmask32) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_subs_epi8 (__mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_psubsb256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) _mm256_setzero_si256 (),
+               (__mmask32) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_subs_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
+         __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psubsw128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_subs_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psubsw128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) _mm_setzero_si128 (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_subs_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
+      __m256i __B)
+{
+  return (__m256i) __builtin_ia32_psubsw256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_subs_epi16 (__mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_psubsw256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) _mm256_setzero_si256 (),
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_subs_epu8 (__m128i __W, __mmask16 __U, __m128i __A,
+        __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psubusb128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_subs_epu8 (__mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psubusb128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) _mm_setzero_si128 (),
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_subs_epu8 (__m256i __W, __mmask32 __U, __m256i __A,
+           __m256i __B)
+{
+  return (__m256i) __builtin_ia32_psubusb256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) __W,
+               (__mmask32) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_subs_epu8 (__mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_psubusb256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) _mm256_setzero_si256 (),
+               (__mmask32) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_subs_epu16 (__m128i __W, __mmask8 __U, __m128i __A,
+         __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psubusw128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_subs_epu16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psubusw128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) _mm_setzero_si128 (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_subs_epu16 (__m256i __W, __mmask16 __U, __m256i __A,
+      __m256i __B)
+{
+  return (__m256i) __builtin_ia32_psubusw256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_subs_epu16 (__mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_psubusw256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) _mm256_setzero_si256 (),
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask2_permutex2var_epi16 (__m128i __A, __m128i __I, __mmask8 __U,
+            __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpermi2varhi128_mask ((__v8hi) __A,
+               (__v8hi) __I /* idx */ ,
+               (__v8hi) __B,
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask2_permutex2var_epi16 (__m256i __A, __m256i __I,
+         __mmask16 __U, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpermi2varhi256_mask ((__v16hi) __A,
+               (__v16hi) __I /* idx */ ,
+               (__v16hi) __B,
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_permutex2var_epi16 (__m128i __A, __m128i __I, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpermt2varhi128_mask ((__v8hi) __I/* idx */,
+               (__v8hi) __A,
+               (__v8hi) __B,
+               (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_permutex2var_epi16 (__m128i __A, __mmask8 __U, __m128i __I,
+           __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpermt2varhi128_mask ((__v8hi) __I/* idx */,
+               (__v8hi) __A,
+               (__v8hi) __B,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_permutex2var_epi16 (__mmask8 __U, __m128i __A, __m128i __I,
+            __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpermt2varhi128_maskz ((__v8hi) __I/* idx */,
+               (__v8hi) __A,
+               (__v8hi) __B,
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_permutex2var_epi16 (__m256i __A, __m256i __I, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpermt2varhi256_mask ((__v16hi) __I/* idx */,
+               (__v16hi) __A,
+               (__v16hi) __B,
+               (__mmask16) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_permutex2var_epi16 (__m256i __A, __mmask16 __U,
+        __m256i __I, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpermt2varhi256_mask ((__v16hi) __I/* idx */,
+               (__v16hi) __A,
+               (__v16hi) __B,
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_permutex2var_epi16 (__mmask16 __U, __m256i __A,
+         __m256i __I, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpermt2varhi256_maskz ((__v16hi) __I/* idx */,
+               (__v16hi) __A,
+               (__v16hi) __B,
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_maddubs_epi16 (__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) {
+  return (__m128i) __builtin_ia32_pmaddubsw128_mask ((__v16qi) __X,
+               (__v16qi) __Y,
+               (__v8hi) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_maddubs_epi16 (__mmask8 __U, __m128i __X, __m128i __Y) {
+  return (__m128i) __builtin_ia32_pmaddubsw128_mask ((__v16qi) __X,
+               (__v16qi) __Y,
+              (__v8hi) _mm_setzero_si128(),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_maddubs_epi16 (__m256i __W, __mmask16 __U, __m256i __X,
+         __m256i __Y) {
+  return (__m256i) __builtin_ia32_pmaddubsw256_mask ((__v32qi) __X,
+               (__v32qi) __Y,
+               (__v16hi) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_maddubs_epi16 (__mmask16 __U, __m256i __X, __m256i __Y) {
+  return (__m256i) __builtin_ia32_pmaddubsw256_mask ((__v32qi) __X,
+               (__v32qi) __Y,
+               (__v16hi) _mm256_setzero_si256(),
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_madd_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
+         __m128i __B) {
+  return (__m128i) __builtin_ia32_pmaddwd128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v4si) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_madd_epi16 (__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pmaddwd128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v4si) _mm_setzero_si128(),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_madd_epi16 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pmaddwd256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v8si) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_madd_epi16 (__mmask8 __U, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pmaddwd256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v8si) _mm256_setzero_si256(),
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtsepi16_epi8 (__m128i __A) {
+  return (__m128i) __builtin_ia32_pmovswb128_mask ((__v8hi) __A,
+               (__v16qi) _mm_setzero_si128(),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtsepi16_epi8 (__m128i __O, __mmask8 __M, __m128i __A) {
+  return (__m128i) __builtin_ia32_pmovswb128_mask ((__v8hi) __A,
+               (__v16qi) __O,
+                __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtsepi16_epi8 (__mmask8 __M, __m128i __A) {
+  return (__m128i) __builtin_ia32_pmovswb128_mask ((__v8hi) __A,
+               (__v16qi) _mm_setzero_si128(),
+               __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtsepi16_epi8 (__m256i __A) {
+  return (__m128i) __builtin_ia32_pmovswb256_mask ((__v16hi) __A,
+               (__v16qi) _mm_setzero_si128(),
+               (__mmask16) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtsepi16_epi8 (__m128i __O, __mmask16 __M, __m256i __A) {
+  return (__m128i) __builtin_ia32_pmovswb256_mask ((__v16hi) __A,
+               (__v16qi) __O,
+                __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtsepi16_epi8 (__mmask16 __M, __m256i __A) {
+  return (__m128i) __builtin_ia32_pmovswb256_mask ((__v16hi) __A,
+               (__v16qi) _mm_setzero_si128(),
+               __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtusepi16_epi8 (__m128i __A) {
+  return (__m128i) __builtin_ia32_pmovuswb128_mask ((__v8hi) __A,
+                (__v16qi) _mm_setzero_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtusepi16_epi8 (__m128i __O, __mmask8 __M, __m128i __A) {
+  return (__m128i) __builtin_ia32_pmovuswb128_mask ((__v8hi) __A,
+                (__v16qi) __O,
+                __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtusepi16_epi8 (__mmask8 __M, __m128i __A) {
+  return (__m128i) __builtin_ia32_pmovuswb128_mask ((__v8hi) __A,
+                (__v16qi) _mm_setzero_si128(),
+                __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtusepi16_epi8 (__m256i __A) {
+  return (__m128i) __builtin_ia32_pmovuswb256_mask ((__v16hi) __A,
+                (__v16qi) _mm_setzero_si128(),
+                (__mmask16) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtusepi16_epi8 (__m128i __O, __mmask16 __M, __m256i __A) {
+  return (__m128i) __builtin_ia32_pmovuswb256_mask ((__v16hi) __A,
+                (__v16qi) __O,
+                __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtusepi16_epi8 (__mmask16 __M, __m256i __A) {
+  return (__m128i) __builtin_ia32_pmovuswb256_mask ((__v16hi) __A,
+                (__v16qi) _mm_setzero_si128(),
+                __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepi16_epi8 (__m128i __A) {
+
+  return (__m128i) __builtin_ia32_pmovwb128_mask ((__v8hi) __A,
+               (__v16qi) _mm_setzero_si128(),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi16_epi8 (__m128i __O, __mmask8 __M, __m128i __A) {
+  return (__m128i) __builtin_ia32_pmovwb128_mask ((__v8hi) __A,
+               (__v16qi) __O,
+               __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepi16_epi8 (__mmask8 __M, __m128i __A) {
+  return (__m128i) __builtin_ia32_pmovwb128_mask ((__v8hi) __A,
+               (__v16qi) _mm_setzero_si128(),
+               __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtepi16_epi8 (__m256i __A) {
+  return (__m128i) __builtin_ia32_pmovwb256_mask ((__v16hi) __A,
+               (__v16qi) _mm_setzero_si128(),
+               (__mmask16) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi16_epi8 (__m128i __O, __mmask16 __M, __m256i __A) {
+  return (__m128i) __builtin_ia32_pmovwb256_mask ((__v16hi) __A,
+               (__v16qi) __O,
+               __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepi16_epi8 (__mmask16 __M, __m256i __A) {
+  return (__m128i) __builtin_ia32_pmovwb256_mask ((__v16hi) __A,
+               (__v16qi) _mm_setzero_si128(),
+               __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_mulhrs_epi16 (__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) {
+  return (__m128i) __builtin_ia32_pmulhrsw128_mask ((__v8hi) __X,
+               (__v8hi) __Y,
+               (__v8hi) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_mulhrs_epi16 (__mmask8 __U, __m128i __X, __m128i __Y) {
+  return (__m128i) __builtin_ia32_pmulhrsw128_mask ((__v8hi) __X,
+               (__v8hi) __Y,
+              (__v8hi) _mm_setzero_si128(),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_mulhrs_epi16 (__m256i __W, __mmask16 __U, __m256i __X, __m256i __Y) {
+  return (__m256i) __builtin_ia32_pmulhrsw256_mask ((__v16hi) __X,
+               (__v16hi) __Y,
+               (__v16hi) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_mulhrs_epi16 (__mmask16 __U, __m256i __X, __m256i __Y) {
+  return (__m256i) __builtin_ia32_pmulhrsw256_mask ((__v16hi) __X,
+               (__v16hi) __Y,
+               (__v16hi) _mm256_setzero_si256(),
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_mulhi_epu16 (__m128i __W, __mmask8 __U, __m128i __A,
+          __m128i __B) {
+  return (__m128i) __builtin_ia32_pmulhuw128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_mulhi_epu16 (__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pmulhuw128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+              (__v8hi) _mm_setzero_si128(),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_mulhi_epu16 (__m256i __W, __mmask16 __U, __m256i __A,
+       __m256i __B) {
+  return (__m256i) __builtin_ia32_pmulhuw256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_mulhi_epu16 (__mmask16 __U, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pmulhuw256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) _mm256_setzero_si256(),
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_mulhi_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
+          __m128i __B) {
+  return (__m128i) __builtin_ia32_pmulhw128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_mulhi_epi16 (__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pmulhw128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+              (__v8hi) _mm_setzero_si128(),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_mulhi_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
+       __m256i __B) {
+  return (__m256i) __builtin_ia32_pmulhw256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_mulhi_epi16 (__mmask16 __U, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pmulhw256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) _mm256_setzero_si256(),
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_unpackhi_epi8 (__m128i __W, __mmask16 __U, __m128i __A,
+      __m128i __B) {
+  return (__m128i) __builtin_ia32_punpckhbw128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_unpackhi_epi8 (__mmask16 __U, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_punpckhbw128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) _mm_setzero_si128(),
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_unpackhi_epi8 (__m256i __W, __mmask32 __U, __m256i __A,
+         __m256i __B) {
+  return (__m256i) __builtin_ia32_punpckhbw256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) __W,
+               (__mmask32) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_unpackhi_epi8 (__mmask32 __U, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_punpckhbw256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) _mm256_setzero_si256(),
+               (__mmask32) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_unpackhi_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
+       __m128i __B) {
+  return (__m128i) __builtin_ia32_punpckhwd128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_unpackhi_epi16 (__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_punpckhwd128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) _mm_setzero_si128(),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_unpackhi_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
+          __m256i __B) {
+  return (__m256i) __builtin_ia32_punpckhwd256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_unpackhi_epi16 (__mmask16 __U, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_punpckhwd256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) _mm256_setzero_si256(),
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_unpacklo_epi8 (__m128i __W, __mmask16 __U, __m128i __A,
+      __m128i __B) {
+  return (__m128i) __builtin_ia32_punpcklbw128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_unpacklo_epi8 (__mmask16 __U, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_punpcklbw128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) _mm_setzero_si128(),
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_unpacklo_epi8 (__m256i __W, __mmask32 __U, __m256i __A,
+         __m256i __B) {
+  return (__m256i) __builtin_ia32_punpcklbw256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) __W,
+               (__mmask32) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_unpacklo_epi8 (__mmask32 __U, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_punpcklbw256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) _mm256_setzero_si256(),
+               (__mmask32) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_unpacklo_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
+       __m128i __B) {
+  return (__m128i) __builtin_ia32_punpcklwd128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_unpacklo_epi16 (__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_punpcklwd128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) _mm_setzero_si128(),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_unpacklo_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
+          __m256i __B) {
+  return (__m256i) __builtin_ia32_punpcklwd256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_unpacklo_epi16 (__mmask16 __U, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_punpcklwd256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) _mm256_setzero_si256(),
+               (__mmask16) __U);
+}
+
+#define _mm_cmp_epi8_mask(a, b, p) __extension__ ({ \
+  (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)(__m128i)(a), \
+                                         (__v16qi)(__m128i)(b), \
+                                         (p), (__mmask16)-1); })
+
+#define _mm_mask_cmp_epi8_mask(m, a, b, p) __extension__ ({ \
+  (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)(__m128i)(a), \
+                                         (__v16qi)(__m128i)(b), \
+                                         (p), (__mmask16)(m)); })
+
+#define _mm_cmp_epu8_mask(a, b, p) __extension__ ({ \
+  (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)(__m128i)(a), \
+                                          (__v16qi)(__m128i)(b), \
+                                          (p), (__mmask16)-1); })
+
+#define _mm_mask_cmp_epu8_mask(m, a, b, p) __extension__ ({ \
+  (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)(__m128i)(a), \
+                                          (__v16qi)(__m128i)(b), \
+                                          (p), (__mmask16)(m)); })
+
+#define _mm256_cmp_epi8_mask(a, b, p) __extension__ ({ \
+  (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)(__m256i)(a), \
+                                         (__v32qi)(__m256i)(b), \
+                                         (p), (__mmask32)-1); })
+
+#define _mm256_mask_cmp_epi8_mask(m, a, b, p) __extension__ ({ \
+  (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)(__m256i)(a), \
+                                         (__v32qi)(__m256i)(b), \
+                                         (p), (__mmask32)(m)); })
+
+#define _mm256_cmp_epu8_mask(a, b, p) __extension__ ({ \
+  (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)(__m256i)(a), \
+                                          (__v32qi)(__m256i)(b), \
+                                          (p), (__mmask32)-1); })
+
+#define _mm256_mask_cmp_epu8_mask(m, a, b, p) __extension__ ({ \
+  (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)(__m256i)(a), \
+                                          (__v32qi)(__m256i)(b), \
+                                          (p), (__mmask32)(m)); })
+
+#define _mm_cmp_epi16_mask(a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)(__m128i)(a), \
+                                        (__v8hi)(__m128i)(b), \
+                                        (p), (__mmask8)-1); })
+
+#define _mm_mask_cmp_epi16_mask(m, a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)(__m128i)(a), \
+                                        (__v8hi)(__m128i)(b), \
+                                        (p), (__mmask8)(m)); })
+
+#define _mm_cmp_epu16_mask(a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)(__m128i)(a), \
+                                         (__v8hi)(__m128i)(b), \
+                                         (p), (__mmask8)-1); })
+
+#define _mm_mask_cmp_epu16_mask(m, a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)(__m128i)(a), \
+                                         (__v8hi)(__m128i)(b), \
+                                         (p), (__mmask8)(m)); })
+
+#define _mm256_cmp_epi16_mask(a, b, p) __extension__ ({ \
+  (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)(__m256i)(a), \
+                                         (__v16hi)(__m256i)(b), \
+                                         (p), (__mmask16)-1); })
+
+#define _mm256_mask_cmp_epi16_mask(m, a, b, p) __extension__ ({ \
+  (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)(__m256i)(a), \
+                                         (__v16hi)(__m256i)(b), \
+                                         (p), (__mmask16)(m)); })
+
+#define _mm256_cmp_epu16_mask(a, b, p) __extension__ ({ \
+  (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)(__m256i)(a), \
+                                          (__v16hi)(__m256i)(b), \
+                                          (p), (__mmask16)-1); })
+
+#define _mm256_mask_cmp_epu16_mask(m, a, b, p) __extension__ ({ \
+  (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)(__m256i)(a), \
+                                          (__v16hi)(__m256i)(b), \
+                                          (p), (__mmask16)(m)); })
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __AVX512VLBWINTRIN_H */
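
The masked intrinsics added in the header above follow the usual AVX-512VL/BW pattern: the _mask_ variants merge results into a pass-through operand under a write mask, the _maskz_ variants zero the unselected lanes, and the _mm*_cmp_*_mask macros produce such masks from per-lane comparisons. The sketch below is illustrative only and is not part of this patch; it assumes a host toolchain and CPU with AVX512VL and AVX512BW support (compiled with e.g. -mavx512vl -mavx512bw) and only uses intrinsics declared in the hunk above or in the standard immintrin.h entry point.

/* Illustrative sketch only -- not part of the prebuilt headers above.
 * Assumes AVX512VL + AVX512BW support in both compiler and CPU.        */
#include <immintrin.h>   /* never include avx512vlbwintrin.h directly */
#include <stdio.h>

int main(void) {
  __m128i a = _mm_set1_epi16(10);
  __m128i b = _mm_set1_epi16(3);

  /* Build a write mask from a per-lane compare; predicate 4 is "not equal"
   * (_MM_CMPINT_NE), so every lane sets its mask bit in this example.    */
  __mmask8 m = _mm_cmp_epi16_mask(a, b, 4);

  /* Zero-masked saturating subtract: lanes with a clear mask bit become 0,
   * the remaining lanes hold the saturated a - b, i.e. 7 here.           */
  __m128i r = _mm_maskz_subs_epu16(m, a, b);

  short out[8];
  _mm_storeu_si128((__m128i *)out, r);
  printf("lane 0 = %d\n", out[0]);   /* prints 7 */
  return 0;
}
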
diff --git a/24.0.3/clang-include/avx512vldqintrin.h b/24.0.3/clang-include/avx512vldqintrin.h
new file mode 100644
index 0000000..dfd858e
--- /dev/null
+++ b/24.0.3/clang-include/avx512vldqintrin.h
@@ -0,0 +1,953 @@
+/*===---- avx512vldqintrin.h - AVX512VL and AVX512DQ intrinsics ------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512vldqintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512VLDQINTRIN_H
+#define __AVX512VLDQINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512dq")))
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mullo_epi64 (__m256i __A, __m256i __B) {
+  return (__m256i) ((__v4di) __A * (__v4di) __B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_mullo_epi64 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pmullq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_mullo_epi64 (__mmask8 __U, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pmullq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di)
+              _mm256_setzero_si256 (),
+              (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mullo_epi64 (__m128i __A, __m128i __B) {
+  return (__m128i) ((__v2di) __A * (__v2di) __B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_mullo_epi64 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pmullq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_mullo_epi64 (__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pmullq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di)
+              _mm_setzero_si128 (),
+              (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_andnot_pd (__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d) __builtin_ia32_andnpd256_mask ((__v4df) __A,
+              (__v4df) __B,
+              (__v4df) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_andnot_pd (__mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d) __builtin_ia32_andnpd256_mask ((__v4df) __A,
+              (__v4df) __B,
+              (__v4df)
+              _mm256_setzero_pd (),
+              (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_andnot_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_andnpd128_mask ((__v2df) __A,
+              (__v2df) __B,
+              (__v2df) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_andnot_pd (__mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_andnpd128_mask ((__v2df) __A,
+              (__v2df) __B,
+              (__v2df)
+              _mm_setzero_pd (),
+              (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_andnot_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256) __builtin_ia32_andnps256_mask ((__v8sf) __A,
+             (__v8sf) __B,
+             (__v8sf) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_andnot_ps (__mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256) __builtin_ia32_andnps256_mask ((__v8sf) __A,
+             (__v8sf) __B,
+             (__v8sf)
+             _mm256_setzero_ps (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_andnot_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_andnps128_mask ((__v4sf) __A,
+             (__v4sf) __B,
+             (__v4sf) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_andnot_ps (__mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_andnps128_mask ((__v4sf) __A,
+             (__v4sf) __B,
+             (__v4sf)
+             _mm_setzero_ps (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_and_pd (__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d) __builtin_ia32_andpd256_mask ((__v4df) __A,
+             (__v4df) __B,
+             (__v4df) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_and_pd (__mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d) __builtin_ia32_andpd256_mask ((__v4df) __A,
+             (__v4df) __B,
+             (__v4df)
+             _mm256_setzero_pd (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_and_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_andpd128_mask ((__v2df) __A,
+             (__v2df) __B,
+             (__v2df) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_and_pd (__mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_andpd128_mask ((__v2df) __A,
+             (__v2df) __B,
+             (__v2df)
+             _mm_setzero_pd (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_and_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256) __builtin_ia32_andps256_mask ((__v8sf) __A,
+            (__v8sf) __B,
+            (__v8sf) __W,
+            (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_and_ps (__mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256) __builtin_ia32_andps256_mask ((__v8sf) __A,
+            (__v8sf) __B,
+            (__v8sf)
+            _mm256_setzero_ps (),
+            (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_and_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_andps128_mask ((__v4sf) __A,
+            (__v4sf) __B,
+            (__v4sf) __W,
+            (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_and_ps (__mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_andps128_mask ((__v4sf) __A,
+            (__v4sf) __B,
+            (__v4sf)
+            _mm_setzero_ps (),
+            (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_xor_pd (__m256d __W, __mmask8 __U, __m256d __A,
+        __m256d __B) {
+  return (__m256d) __builtin_ia32_xorpd256_mask ((__v4df) __A,
+             (__v4df) __B,
+             (__v4df) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_xor_pd (__mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d) __builtin_ia32_xorpd256_mask ((__v4df) __A,
+             (__v4df) __B,
+             (__v4df)
+             _mm256_setzero_pd (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_xor_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_xorpd128_mask ((__v2df) __A,
+             (__v2df) __B,
+             (__v2df) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_xor_pd (__mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_xorpd128_mask ((__v2df) __A,
+             (__v2df) __B,
+             (__v2df)
+             _mm_setzero_pd (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_xor_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256) __builtin_ia32_xorps256_mask ((__v8sf) __A,
+            (__v8sf) __B,
+            (__v8sf) __W,
+            (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_xor_ps (__mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256) __builtin_ia32_xorps256_mask ((__v8sf) __A,
+            (__v8sf) __B,
+            (__v8sf)
+            _mm256_setzero_ps (),
+            (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_xor_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_xorps128_mask ((__v4sf) __A,
+            (__v4sf) __B,
+            (__v4sf) __W,
+            (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_xor_ps (__mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_xorps128_mask ((__v4sf) __A,
+            (__v4sf) __B,
+            (__v4sf)
+            _mm_setzero_ps (),
+            (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_or_pd (__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d) __builtin_ia32_orpd256_mask ((__v4df) __A,
+            (__v4df) __B,
+            (__v4df) __W,
+            (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_or_pd (__mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d) __builtin_ia32_orpd256_mask ((__v4df) __A,
+            (__v4df) __B,
+            (__v4df)
+            _mm256_setzero_pd (),
+            (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_or_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_orpd128_mask ((__v2df) __A,
+            (__v2df) __B,
+            (__v2df) __W,
+            (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_or_pd (__mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_orpd128_mask ((__v2df) __A,
+            (__v2df) __B,
+            (__v2df)
+            _mm_setzero_pd (),
+            (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_or_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256) __builtin_ia32_orps256_mask ((__v8sf) __A,
+                 (__v8sf) __B,
+                 (__v8sf) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_or_ps (__mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256) __builtin_ia32_orps256_mask ((__v8sf) __A,
+                 (__v8sf) __B,
+                 (__v8sf)
+                 _mm256_setzero_ps (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_or_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_orps128_mask ((__v4sf) __A,
+                 (__v4sf) __B,
+                 (__v4sf) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_or_ps (__mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_orps128_mask ((__v4sf) __A,
+                 (__v4sf) __B,
+                 (__v4sf)
+                 _mm_setzero_ps (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtpd_epi64 (__m128d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2qq128_mask ((__v2df) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtpd_epi64 (__m128i __W, __mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2qq128_mask ((__v2df) __A,
+                (__v2di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtpd_epi64 (__mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2qq128_mask ((__v2df) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtpd_epi64 (__m256d __A) {
+  return (__m256i) __builtin_ia32_cvtpd2qq256_mask ((__v4df) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtpd_epi64 (__m256i __W, __mmask8 __U, __m256d __A) {
+  return (__m256i) __builtin_ia32_cvtpd2qq256_mask ((__v4df) __A,
+                (__v4di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtpd_epi64 (__mmask8 __U, __m256d __A) {
+  return (__m256i) __builtin_ia32_cvtpd2qq256_mask ((__v4df) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtpd_epu64 (__m128d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2uqq128_mask ((__v2df) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtpd_epu64 (__m128i __W, __mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2uqq128_mask ((__v2df) __A,
+                (__v2di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtpd_epu64 (__mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2uqq128_mask ((__v2df) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtpd_epu64 (__m256d __A) {
+  return (__m256i) __builtin_ia32_cvtpd2uqq256_mask ((__v4df) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtpd_epu64 (__m256i __W, __mmask8 __U, __m256d __A) {
+  return (__m256i) __builtin_ia32_cvtpd2uqq256_mask ((__v4df) __A,
+                (__v4di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtpd_epu64 (__mmask8 __U, __m256d __A) {
+  return (__m256i) __builtin_ia32_cvtpd2uqq256_mask ((__v4df) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtps_epi64 (__m128 __A) {
+  return (__m128i) __builtin_ia32_cvtps2qq128_mask ((__v4sf) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtps_epi64 (__m128i __W, __mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvtps2qq128_mask ((__v4sf) __A,
+                (__v2di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtps_epi64 (__mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvtps2qq128_mask ((__v4sf) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtps_epi64 (__m128 __A) {
+  return (__m256i) __builtin_ia32_cvtps2qq256_mask ((__v4sf) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtps_epi64 (__m256i __W, __mmask8 __U, __m128 __A) {
+  return (__m256i) __builtin_ia32_cvtps2qq256_mask ((__v4sf) __A,
+                (__v4di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtps_epi64 (__mmask8 __U, __m128 __A) {
+  return (__m256i) __builtin_ia32_cvtps2qq256_mask ((__v4sf) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtps_epu64 (__m128 __A) {
+  return (__m128i) __builtin_ia32_cvtps2uqq128_mask ((__v4sf) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtps_epu64 (__m128i __W, __mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvtps2uqq128_mask ((__v4sf) __A,
+                (__v2di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtps_epu64 (__mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvtps2uqq128_mask ((__v4sf) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtps_epu64 (__m128 __A) {
+  return (__m256i) __builtin_ia32_cvtps2uqq256_mask ((__v4sf) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtps_epu64 (__m256i __W, __mmask8 __U, __m128 __A) {
+  return (__m256i) __builtin_ia32_cvtps2uqq256_mask ((__v4sf) __A,
+                (__v4di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtps_epu64 (__mmask8 __U, __m128 __A) {
+  return (__m256i) __builtin_ia32_cvtps2uqq256_mask ((__v4sf) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cvtepi64_pd (__m128i __A) {
+  return (__m128d) __builtin_ia32_cvtqq2pd128_mask ((__v2di) __A,
+                (__v2df) _mm_setzero_pd(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi64_pd (__m128d __W, __mmask8 __U, __m128i __A) {
+  return (__m128d) __builtin_ia32_cvtqq2pd128_mask ((__v2di) __A,
+                (__v2df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepi64_pd (__mmask8 __U, __m128i __A) {
+  return (__m128d) __builtin_ia32_cvtqq2pd128_mask ((__v2di) __A,
+                (__v2df) _mm_setzero_pd(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_cvtepi64_pd (__m256i __A) {
+  return (__m256d) __builtin_ia32_cvtqq2pd256_mask ((__v4di) __A,
+                (__v4df) _mm256_setzero_pd(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi64_pd (__m256d __W, __mmask8 __U, __m256i __A) {
+  return (__m256d) __builtin_ia32_cvtqq2pd256_mask ((__v4di) __A,
+                (__v4df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepi64_pd (__mmask8 __U, __m256i __A) {
+  return (__m256d) __builtin_ia32_cvtqq2pd256_mask ((__v4di) __A,
+                (__v4df) _mm256_setzero_pd(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtepi64_ps (__m128i __A) {
+  return (__m128) __builtin_ia32_cvtqq2ps128_mask ((__v2di) __A,
+                (__v4sf) _mm_setzero_ps(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi64_ps (__m128 __W, __mmask8 __U, __m128i __A) {
+  return (__m128) __builtin_ia32_cvtqq2ps128_mask ((__v2di) __A,
+                (__v4sf) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepi64_ps (__mmask8 __U, __m128i __A) {
+  return (__m128) __builtin_ia32_cvtqq2ps128_mask ((__v2di) __A,
+                (__v4sf) _mm_setzero_ps(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm256_cvtepi64_ps (__m256i __A) {
+  return (__m128) __builtin_ia32_cvtqq2ps256_mask ((__v4di) __A,
+                (__v4sf) _mm_setzero_ps(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi64_ps (__m128 __W, __mmask8 __U, __m256i __A) {
+  return (__m128) __builtin_ia32_cvtqq2ps256_mask ((__v4di) __A,
+                (__v4sf) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepi64_ps (__mmask8 __U, __m256i __A) {
+  return (__m128) __builtin_ia32_cvtqq2ps256_mask ((__v4di) __A,
+                (__v4sf) _mm_setzero_ps(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvttpd_epi64 (__m128d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2qq128_mask ((__v2df) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvttpd_epi64 (__m128i __W, __mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2qq128_mask ((__v2df) __A,
+                (__v2di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvttpd_epi64 (__mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2qq128_mask ((__v2df) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvttpd_epi64 (__m256d __A) {
+  return (__m256i) __builtin_ia32_cvttpd2qq256_mask ((__v4df) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvttpd_epi64 (__m256i __W, __mmask8 __U, __m256d __A) {
+  return (__m256i) __builtin_ia32_cvttpd2qq256_mask ((__v4df) __A,
+                (__v4di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvttpd_epi64 (__mmask8 __U, __m256d __A) {
+  return (__m256i) __builtin_ia32_cvttpd2qq256_mask ((__v4df) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvttpd_epu64 (__m128d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2uqq128_mask ((__v2df) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvttpd_epu64 (__m128i __W, __mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2uqq128_mask ((__v2df) __A,
+                (__v2di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvttpd_epu64 (__mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2uqq128_mask ((__v2df) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvttpd_epu64 (__m256d __A) {
+  return (__m256i) __builtin_ia32_cvttpd2uqq256_mask ((__v4df) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvttpd_epu64 (__m256i __W, __mmask8 __U, __m256d __A) {
+  return (__m256i) __builtin_ia32_cvttpd2uqq256_mask ((__v4df) __A,
+                (__v4di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvttpd_epu64 (__mmask8 __U, __m256d __A) {
+  return (__m256i) __builtin_ia32_cvttpd2uqq256_mask ((__v4df) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvttps_epi64 (__m128 __A) {
+  return (__m128i) __builtin_ia32_cvttps2qq128_mask ((__v4sf) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvttps_epi64 (__m128i __W, __mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvttps2qq128_mask ((__v4sf) __A,
+                (__v2di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvttps_epi64 (__mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvttps2qq128_mask ((__v4sf) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvttps_epi64 (__m128 __A) {
+  return (__m256i) __builtin_ia32_cvttps2qq256_mask ((__v4sf) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvttps_epi64 (__m256i __W, __mmask8 __U, __m128 __A) {
+  return (__m256i) __builtin_ia32_cvttps2qq256_mask ((__v4sf) __A,
+                (__v4di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvttps_epi64 (__mmask8 __U, __m128 __A) {
+  return (__m256i) __builtin_ia32_cvttps2qq256_mask ((__v4sf) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvttps_epu64 (__m128 __A) {
+  return (__m128i) __builtin_ia32_cvttps2uqq128_mask ((__v4sf) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvttps_epu64 (__m128i __W, __mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvttps2uqq128_mask ((__v4sf) __A,
+                (__v2di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvttps_epu64 (__mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvttps2uqq128_mask ((__v4sf) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvttps_epu64 (__m128 __A) {
+  return (__m256i) __builtin_ia32_cvttps2uqq256_mask ((__v4sf) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvttps_epu64 (__m256i __W, __mmask8 __U, __m128 __A) {
+  return (__m256i) __builtin_ia32_cvttps2uqq256_mask ((__v4sf) __A,
+                (__v4di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvttps_epu64 (__mmask8 __U, __m128 __A) {
+  return (__m256i) __builtin_ia32_cvttps2uqq256_mask ((__v4sf) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cvtepu64_pd (__m128i __A) {
+  return (__m128d) __builtin_ia32_cvtuqq2pd128_mask ((__v2di) __A,
+                (__v2df) _mm_setzero_pd(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_cvtepu64_pd (__m128d __W, __mmask8 __U, __m128i __A) {
+  return (__m128d) __builtin_ia32_cvtuqq2pd128_mask ((__v2di) __A,
+                (__v2df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepu64_pd (__mmask8 __U, __m128i __A) {
+  return (__m128d) __builtin_ia32_cvtuqq2pd128_mask ((__v2di) __A,
+                (__v2df) _mm_setzero_pd(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_cvtepu64_pd (__m256i __A) {
+  return (__m256d) __builtin_ia32_cvtuqq2pd256_mask ((__v4di) __A,
+                (__v4df) _mm256_setzero_pd(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepu64_pd (__m256d __W, __mmask8 __U, __m256i __A) {
+  return (__m256d) __builtin_ia32_cvtuqq2pd256_mask ((__v4di) __A,
+                (__v4df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepu64_pd (__mmask8 __U, __m256i __A) {
+  return (__m256d) __builtin_ia32_cvtuqq2pd256_mask ((__v4di) __A,
+                (__v4df) _mm256_setzero_pd(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtepu64_ps (__m128i __A) {
+  return (__m128) __builtin_ia32_cvtuqq2ps128_mask ((__v2di) __A,
+                (__v4sf) _mm_setzero_ps(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_cvtepu64_ps (__m128 __W, __mmask8 __U, __m128i __A) {
+  return (__m128) __builtin_ia32_cvtuqq2ps128_mask ((__v2di) __A,
+                (__v4sf) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepu64_ps (__mmask8 __U, __m128i __A) {
+  return (__m128) __builtin_ia32_cvtuqq2ps128_mask ((__v2di) __A,
+                (__v4sf) _mm_setzero_ps(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm256_cvtepu64_ps (__m256i __A) {
+  return (__m128) __builtin_ia32_cvtuqq2ps256_mask ((__v4di) __A,
+                (__v4sf) _mm_setzero_ps(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepu64_ps (__m128 __W, __mmask8 __U, __m256i __A) {
+  return (__m128) __builtin_ia32_cvtuqq2ps256_mask ((__v4di) __A,
+                (__v4sf) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepu64_ps (__mmask8 __U, __m256i __A) {
+  return (__m128) __builtin_ia32_cvtuqq2ps256_mask ((__v4di) __A,
+                (__v4sf) _mm_setzero_ps(),
+                (__mmask8) __U);
+}
+
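+/* The range/reduce operations take their control field as an immediate, which
+ * must be a compile-time constant, so they are exposed as macros rather than
+ * inline functions. */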
+#define _mm_range_pd(__A, __B, __C) __extension__ ({                         \
+  (__m128d) __builtin_ia32_rangepd128_mask ((__v2df) __A, (__v2df) __B, __C, \
+                (__v2df) _mm_setzero_pd(), (__mmask8) -1); })
+
+#define _mm_mask_range_pd(__W, __U, __A, __B, __C) __extension__ ({          \
+  (__m128d) __builtin_ia32_rangepd128_mask ((__v2df) __A, (__v2df) __B, __C, \
+                (__v2df) __W, (__mmask8) __U); })
+
+#define _mm_maskz_range_pd(__U, __A, __B, __C) __extension__ ({              \
+  (__m128d) __builtin_ia32_rangepd128_mask ((__v2df) __A, (__v2df) __B, __C, \
+                (__v2df) _mm_setzero_pd(), (__mmask8) __U); })
+
+#define _mm256_range_pd(__A, __B, __C) __extension__ ({                      \
+  (__m256d) __builtin_ia32_rangepd256_mask ((__v4df) __A, (__v4df) __B, __C, \
+                (__v4df) _mm256_setzero_pd(), (__mmask8) -1); })
+
+#define _mm256_mask_range_pd(__W, __U, __A, __B, __C) __extension__ ({       \
+  (__m256d) __builtin_ia32_rangepd256_mask ((__v4df) __A, (__v4df) __B, __C, \
+                (__v4df) __W, (__mmask8) __U); })
+
+#define _mm256_maskz_range_pd(__U, __A, __B, __C) __extension__ ({           \
+  (__m256d) __builtin_ia32_rangepd256_mask ((__v4df) __A, (__v4df) __B, __C, \
+                (__v4df) _mm256_setzero_pd(), (__mmask8) __U); })
+
+#define _mm_range_ps(__A, __B, __C) __extension__ ({                         \
+  (__m128) __builtin_ia32_rangeps128_mask ((__v4sf) __A, (__v4sf) __B, __C,  \
+                (__v4sf) _mm_setzero_ps(), (__mmask8) -1); })
+
+#define _mm_mask_range_ps(__W, __U, __A, __B, __C) __extension__ ({          \
+  (__m128) __builtin_ia32_rangeps128_mask ((__v4sf) __A, (__v4sf) __B, __C,  \
+                (__v4sf) __W, (__mmask8) __U); })
+
+#define _mm_maskz_range_ps(__U, __A, __B, __C) __extension__ ({              \
+  (__m128) __builtin_ia32_rangeps128_mask ((__v4sf) __A, (__v4sf) __B, __C,  \
+                (__v4sf) _mm_setzero_ps(), (__mmask8) __U); })
+
+#define _mm256_range_ps(__A, __B, __C) __extension__ ({                      \
+  (__m256) __builtin_ia32_rangeps256_mask ((__v8sf) __A, (__v8sf) __B, __C,  \
+                (__v8sf) _mm256_setzero_ps(), (__mmask8) -1); })
+
+#define _mm256_mask_range_ps(__W, __U, __A, __B, __C) __extension__ ({       \
+  (__m256) __builtin_ia32_rangeps256_mask ((__v8sf) __A, (__v8sf) __B, __C,  \
+                (__v8sf) __W, (__mmask8) __U); })
+
+#define _mm256_maskz_range_ps(__U, __A, __B, __C) __extension__ ({           \
+  (__m256) __builtin_ia32_rangeps256_mask ((__v8sf) __A, (__v8sf) __B, __C,  \
+                (__v8sf) _mm256_setzero_ps(), (__mmask8) __U); })
+
+#define _mm_reduce_pd(__A, __B) __extension__ ({                \
+  (__m128d) __builtin_ia32_reducepd128_mask ((__v2df) __A, __B, \
+                (__v2df) _mm_setzero_pd(), (__mmask8) -1); })
+
+#define _mm_mask_reduce_pd(__W, __U, __A, __B) __extension__ ({ \
+  (__m128d) __builtin_ia32_reducepd128_mask ((__v2df) __A, __B, \
+                (__v2df) __W, (__mmask8) __U); })
+
+#define _mm_maskz_reduce_pd(__U, __A, __B) __extension__ ({     \
+  (__m128d) __builtin_ia32_reducepd128_mask ((__v2df) __A, __B, \
+                (__v2df) _mm_setzero_pd(), (__mmask8) __U); })
+
+#define _mm256_reduce_pd(__A, __B) __extension__ ({                \
+  (__m256d) __builtin_ia32_reducepd256_mask ((__v4df) __A, __B,    \
+                (__v4df) _mm256_setzero_pd(), (__mmask8) -1); })
+
+#define _mm256_mask_reduce_pd(__W, __U, __A, __B) __extension__ ({ \
+  (__m256d) __builtin_ia32_reducepd256_mask ((__v4df) __A, __B,    \
+                (__v4df) __W, (__mmask8) __U); })
+
+#define _mm256_maskz_reduce_pd(__U, __A, __B) __extension__ ({     \
+  (__m256d) __builtin_ia32_reducepd256_mask ((__v4df) __A, __B,    \
+                (__v4df) _mm256_setzero_pd(), (__mmask8) __U); })
+
+#define _mm_reduce_ps(__A, __B) __extension__ ({                   \
+  (__m128) __builtin_ia32_reduceps128_mask ((__v4sf) __A, __B,     \
+                (__v4sf) _mm_setzero_ps(), (__mmask8) -1); })
+
+#define _mm_mask_reduce_ps(__W, __U, __A, __B) __extension__ ({    \
+  (__m128) __builtin_ia32_reduceps128_mask ((__v4sf) __A, __B,     \
+                (__v4sf) __W, (__mmask8) __U); })
+
+#define _mm_maskz_reduce_ps(__U, __A, __B) __extension__ ({        \
+  (__m128) __builtin_ia32_reduceps128_mask ((__v4sf) __A, __B,     \
+                (__v4sf) _mm_setzero_ps(), (__mmask8) __U); })
+
+#define _mm256_reduce_ps(__A, __B) __extension__ ({                \
+  (__m256) __builtin_ia32_reduceps256_mask ((__v8sf) __A, __B,     \
+                (__v8sf) _mm256_setzero_ps(), (__mmask8) -1); })
+
+#define _mm256_mask_reduce_ps(__W, __U, __A, __B) __extension__ ({ \
+  (__m256) __builtin_ia32_reduceps256_mask ((__v8sf) __A, __B,     \
+                (__v8sf) __W, (__mmask8) __U); })
+
+#define _mm256_maskz_reduce_ps(__U, __A, __B) __extension__ ({     \
+  (__m256) __builtin_ia32_reduceps256_mask ((__v8sf) __A, __B,     \
+                (__v8sf) _mm256_setzero_ps(), (__mmask8) __U); })
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
diff --git a/24.0.3/clang-include/avx512vlintrin.h b/24.0.3/clang-include/avx512vlintrin.h
new file mode 100644
index 0000000..8f13536
--- /dev/null
+++ b/24.0.3/clang-include/avx512vlintrin.h
@@ -0,0 +1,4606 @@
+/*===---- avx512vlintrin.h - AVX512VL intrinsics ---------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512vlintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512VLINTRIN_H
+#define __AVX512VLINTRIN_H
+
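+/* Every intrinsic below is forced inline, carries no debug info, and is gated
+ * on AVX512VL (plus AVX512BW for the _BOTH variants) via these attributes. */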
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl")))
+#define __DEFAULT_FN_ATTRS_BOTH __attribute__((__always_inline__, __nodebug__, __target__("avx512vl, avx512bw")))
+
+/* Integer compare */
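+/* The third argument of the cmp/ucmp builtins is an _MM_CMPINT_* predicate:
+ * 0 = EQ, 1 = LT, 2 = LE, 4 = NE, 5 = NLT (>=), 6 = NLE (>). */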
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS_BOTH
+_mm_cmpeq_epi32_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_pcmpeqd128_mask((__v4si)__a, (__v4si)__b,
+                                                  (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS_BOTH
+_mm_mask_cmpeq_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_pcmpeqd128_mask((__v4si)__a, (__v4si)__b,
+                                                  __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpeq_epu32_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 0,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpeq_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 0,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS_BOTH
+_mm256_cmpeq_epi32_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_pcmpeqd256_mask((__v8si)__a, (__v8si)__b,
+                                                  (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS_BOTH
+_mm256_mask_cmpeq_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_pcmpeqd256_mask((__v8si)__a, (__v8si)__b,
+                                                  __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmpeq_epu32_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 0,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpeq_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 0,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS_BOTH
+_mm_cmpeq_epi64_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_pcmpeqq128_mask((__v2di)__a, (__v2di)__b,
+                                                  (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS_BOTH
+_mm_mask_cmpeq_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_pcmpeqq128_mask((__v2di)__a, (__v2di)__b,
+                                                  __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpeq_epu64_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 0,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpeq_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 0,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS_BOTH
+_mm256_cmpeq_epi64_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_pcmpeqq256_mask((__v4di)__a, (__v4di)__b,
+                                                  (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS_BOTH
+_mm256_mask_cmpeq_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_pcmpeqq256_mask((__v4di)__a, (__v4di)__b,
+                                                  __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmpeq_epu64_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 0,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpeq_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 0,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpge_epi32_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 5,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpge_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 5,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpge_epu32_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 5,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpge_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 5,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmpge_epi32_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 5,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpge_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 5,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmpge_epu32_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 5,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpge_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 5,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpge_epi64_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 5,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpge_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 5,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpge_epu64_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 5,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpge_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 5,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmpge_epi64_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 5,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpge_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 5,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmpge_epu64_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 5,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpge_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 5,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS_BOTH
+_mm_cmpgt_epi32_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_pcmpgtd128_mask((__v4si)__a, (__v4si)__b,
+                                                  (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS_BOTH
+_mm_mask_cmpgt_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_pcmpgtd128_mask((__v4si)__a, (__v4si)__b,
+                                                  __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpgt_epu32_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 6,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpgt_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 6,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS_BOTH
+_mm256_cmpgt_epi32_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_pcmpgtd256_mask((__v8si)__a, (__v8si)__b,
+                                                  (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS_BOTH
+_mm256_mask_cmpgt_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_pcmpgtd256_mask((__v8si)__a, (__v8si)__b,
+                                                  __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmpgt_epu32_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 6,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpgt_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 6,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS_BOTH
+_mm_cmpgt_epi64_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_pcmpgtq128_mask((__v2di)__a, (__v2di)__b,
+                                                  (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS_BOTH
+_mm_mask_cmpgt_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_pcmpgtq128_mask((__v2di)__a, (__v2di)__b,
+                                                  __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpgt_epu64_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 6,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpgt_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 6,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS_BOTH
+_mm256_cmpgt_epi64_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_pcmpgtq256_mask((__v4di)__a, (__v4di)__b,
+                                                  (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS_BOTH
+_mm256_mask_cmpgt_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_pcmpgtq256_mask((__v4di)__a, (__v4di)__b,
+                                                  __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmpgt_epu64_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 6,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpgt_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 6,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmple_epi32_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 2,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmple_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 2,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmple_epu32_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 2,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmple_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 2,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmple_epi32_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 2,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmple_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 2,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmple_epu32_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 2,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmple_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 2,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmple_epi64_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 2,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmple_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 2,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmple_epu64_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 2,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmple_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 2,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmple_epi64_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 2,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmple_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 2,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmple_epu64_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 2,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmple_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 2,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmplt_epi32_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 1,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmplt_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 1,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmplt_epu32_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 1,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmplt_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 1,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmplt_epi32_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 1,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmplt_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 1,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmplt_epu32_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 1,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmplt_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 1,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmplt_epi64_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 1,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmplt_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 1,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmplt_epu64_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 1,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmplt_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 1,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmplt_epi64_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 1,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmplt_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 1,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmplt_epu64_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 1,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmplt_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 1,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpneq_epi32_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 4,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpneq_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 4,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpneq_epu32_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 4,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpneq_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 4,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmpneq_epi32_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 4,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpneq_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 4,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmpneq_epu32_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 4,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpneq_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 4,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpneq_epi64_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 4,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpneq_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 4,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpneq_epu64_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 4,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpneq_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 4,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmpneq_epi64_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 4,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpneq_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 4,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmpneq_epu64_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 4,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpneq_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 4,
+                                                __u);
+}
+
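+/* Masked arithmetic: the _mask forms copy elements from the pass-through
+ * operand __W where the corresponding mask bit is clear; the _maskz forms
+ * zero those elements instead. */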
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_add_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+           __m256i __B)
+{
+  return (__m256i) __builtin_ia32_paddd256_mask ((__v8si) __A,
+             (__v8si) __B,
+             (__v8si) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_add_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_paddd256_mask ((__v8si) __A,
+             (__v8si) __B,
+             (__v8si)
+             _mm256_setzero_si256 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_add_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+           __m256i __B)
+{
+  return (__m256i) __builtin_ia32_paddq256_mask ((__v4di) __A,
+             (__v4di) __B,
+             (__v4di) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_add_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_paddq256_mask ((__v4di) __A,
+             (__v4di) __B,
+             (__v4di)
+             _mm256_setzero_si256 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_sub_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+           __m256i __B)
+{
+  return (__m256i) __builtin_ia32_psubd256_mask ((__v8si) __A,
+             (__v8si) __B,
+             (__v8si) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_sub_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_psubd256_mask ((__v8si) __A,
+             (__v8si) __B,
+             (__v8si)
+             _mm256_setzero_si256 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_sub_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+           __m256i __B)
+{
+  return (__m256i) __builtin_ia32_psubq256_mask ((__v4di) __A,
+             (__v4di) __B,
+             (__v4di) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_sub_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_psubq256_mask ((__v4di) __A,
+             (__v4di) __B,
+             (__v4di)
+             _mm256_setzero_si256 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_add_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
+        __m128i __B)
+{
+  return (__m128i) __builtin_ia32_paddd128_mask ((__v4si) __A,
+             (__v4si) __B,
+             (__v4si) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_add_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_paddd128_mask ((__v4si) __A,
+             (__v4si) __B,
+             (__v4si)
+             _mm_setzero_si128 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_add_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+        __m128i __B)
+{
+  return (__m128i) __builtin_ia32_paddq128_mask ((__v2di) __A,
+             (__v2di) __B,
+             (__v2di) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_add_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_paddq128_mask ((__v2di) __A,
+             (__v2di) __B,
+             (__v2di)
+             _mm_setzero_si128 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_sub_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
+        __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psubd128_mask ((__v4si) __A,
+             (__v4si) __B,
+             (__v4si) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_sub_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psubd128_mask ((__v4si) __A,
+             (__v4si) __B,
+             (__v4si)
+             _mm_setzero_si128 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_sub_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+        __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psubq128_mask ((__v2di) __A,
+             (__v2di) __B,
+             (__v2di) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_sub_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psubq128_mask ((__v2di) __A,
+             (__v2di) __B,
+             (__v2di)
+             _mm_setzero_si128 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_mul_epi32 (__m256i __W, __mmask8 __M, __m256i __X,
+           __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_pmuldq256_mask ((__v8si) __X,
+              (__v8si) __Y,
+              (__v4di) __W, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_mul_epi32 (__mmask8 __M, __m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_pmuldq256_mask ((__v8si) __X,
+              (__v8si) __Y,
+              (__v4di)
+              _mm256_setzero_si256 (),
+              __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_mul_epi32 (__m128i __W, __mmask8 __M, __m128i __X,
+        __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_pmuldq128_mask ((__v4si) __X,
+              (__v4si) __Y,
+              (__v2di) __W, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_mul_epi32 (__mmask8 __M, __m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_pmuldq128_mask ((__v4si) __X,
+              (__v4si) __Y,
+              (__v2di)
+              _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_mul_epu32 (__m256i __W, __mmask8 __M, __m256i __X,
+           __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_pmuludq256_mask ((__v8si) __X,
+               (__v8si) __Y,
+               (__v4di) __W, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_mul_epu32 (__mmask8 __M, __m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_pmuludq256_mask ((__v8si) __X,
+               (__v8si) __Y,
+               (__v4di)
+               _mm256_setzero_si256 (),
+               __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_mul_epu32 (__m128i __W, __mmask8 __M, __m128i __X,
+        __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_pmuludq128_mask ((__v4si) __X,
+               (__v4si) __Y,
+               (__v2di) __W, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_mul_epu32 (__mmask8 __M, __m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_pmuludq128_mask ((__v4si) __X,
+               (__v4si) __Y,
+               (__v2di)
+               _mm_setzero_si128 (),
+               __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_mullo_epi32 (__mmask8 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmulld256_mask ((__v8si) __A,
+              (__v8si) __B,
+              (__v8si)
+              _mm256_setzero_si256 (),
+              __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_mullo_epi32 (__m256i __W, __mmask8 __M, __m256i __A,
+       __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmulld256_mask ((__v8si) __A,
+              (__v8si) __B,
+              (__v8si) __W, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_mullo_epi32 (__mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmulld128_mask ((__v4si) __A,
+              (__v4si) __B,
+              (__v4si)
+              _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_mullo_epi32 (__m128i __W, __mmask8 __M, __m128i __A,
+          __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmulld128_mask ((__v4si) __A,
+              (__v4si) __B,
+              (__v4si) __W, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_and_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+           __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pandd256_mask ((__v8si) __A,
+             (__v8si) __B,
+             (__v8si) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_and_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pandd256_mask ((__v8si) __A,
+             (__v8si) __B,
+             (__v8si)
+             _mm256_setzero_si256 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_and_epi32 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pandd128_mask ((__v4si) __A,
+             (__v4si) __B,
+             (__v4si) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_and_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pandd128_mask ((__v4si) __A,
+             (__v4si) __B,
+             (__v4si)
+             _mm_setzero_si128 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_andnot_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+        __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pandnd256_mask ((__v8si) __A,
+              (__v8si) __B,
+              (__v8si) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_andnot_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pandnd256_mask ((__v8si) __A,
+              (__v8si) __B,
+              (__v8si)
+              _mm256_setzero_si256 (),
+              (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_andnot_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
+           __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pandnd128_mask ((__v4si) __A,
+              (__v4si) __B,
+              (__v4si) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_andnot_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pandnd128_mask ((__v4si) __A,
+              (__v4si) __B,
+              (__v4si)
+              _mm_setzero_si128 (),
+              (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_or_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+          __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pord256_mask ((__v8si) __A,
+            (__v8si) __B,
+            (__v8si) __W,
+            (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_or_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pord256_mask ((__v8si) __A,
+            (__v8si) __B,
+            (__v8si)
+            _mm256_setzero_si256 (),
+            (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_or_epi32 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pord128_mask ((__v4si) __A,
+            (__v4si) __B,
+            (__v4si) __W,
+            (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_or_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pord128_mask ((__v4si) __A,
+            (__v4si) __B,
+            (__v4si)
+            _mm_setzero_si128 (),
+            (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_xor_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+           __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pxord256_mask ((__v8si) __A,
+             (__v8si) __B,
+             (__v8si) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_xor_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pxord256_mask ((__v8si) __A,
+             (__v8si) __B,
+             (__v8si)
+             _mm256_setzero_si256 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_xor_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
+        __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pxord128_mask ((__v4si) __A,
+             (__v4si) __B,
+             (__v4si) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_xor_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pxord128_mask ((__v4si) __A,
+             (__v4si) __B,
+             (__v4si)
+             _mm_setzero_si128 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_and_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+           __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pandq256_mask ((__v4di) __A,
+             (__v4di) __B,
+             (__v4di) __W, __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_and_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pandq256_mask ((__v4di) __A,
+             (__v4di) __B,
+             (__v4di)
+             _mm256_setzero_pd (),
+             __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_and_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+        __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pandq128_mask ((__v2di) __A,
+             (__v2di) __B,
+             (__v2di) __W, __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_and_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pandq128_mask ((__v2di) __A,
+             (__v2di) __B,
+             (__v2di)
+             _mm_setzero_pd (),
+             __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_andnot_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+        __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pandnq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di) __W, __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_andnot_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pandnq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di)
+              _mm256_setzero_pd (),
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_andnot_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+           __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pandnq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di) __W, __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_andnot_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pandnq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di)
+              _mm_setzero_pd (),
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_or_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+          __m256i __B)
+{
+  return (__m256i) __builtin_ia32_porq256_mask ((__v4di) __A,
+            (__v4di) __B,
+            (__v4di) __W,
+            (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_or_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_porq256_mask ((__v4di) __A,
+            (__v4di) __B,
+            (__v4di)
+            _mm256_setzero_si256 (),
+            (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_or_epi64 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_porq128_mask ((__v2di) __A,
+            (__v2di) __B,
+            (__v2di) __W,
+            (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_or_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_porq128_mask ((__v2di) __A,
+            (__v2di) __B,
+            (__v2di)
+            _mm_setzero_si128 (),
+            (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_xor_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+           __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pxorq256_mask ((__v4di) __A,
+             (__v4di) __B,
+             (__v4di) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_xor_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pxorq256_mask ((__v4di) __A,
+             (__v4di) __B,
+             (__v4di)
+             _mm256_setzero_si256 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_xor_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+        __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pxorq128_mask ((__v2di) __A,
+             (__v2di) __B,
+             (__v2di) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_xor_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pxorq128_mask ((__v2di) __A,
+             (__v2di) __B,
+             (__v2di)
+             _mm_setzero_si128 (),
+             (__mmask8) __U);
+}
+
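+/* The generic compare forms take the predicate p as an immediate, which must
+ * be a compile-time constant, so they too are macros rather than inline
+ * functions. */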
+#define _mm_cmp_epi32_mask(a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)(__m128i)(a), \
+                                        (__v4si)(__m128i)(b), \
+                                        (p), (__mmask8)-1); })
+
+#define _mm_mask_cmp_epi32_mask(m, a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)(__m128i)(a), \
+                                        (__v4si)(__m128i)(b), \
+                                        (p), (__mmask8)(m)); })
+
+#define _mm_cmp_epu32_mask(a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)(__m128i)(a), \
+                                         (__v4si)(__m128i)(b), \
+                                         (p), (__mmask8)-1); })
+
+#define _mm_mask_cmp_epu32_mask(m, a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)(__m128i)(a), \
+                                         (__v4si)(__m128i)(b), \
+                                         (p), (__mmask8)(m)); })
+
+#define _mm256_cmp_epi32_mask(a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)(__m256i)(a), \
+                                        (__v8si)(__m256i)(b), \
+                                        (p), (__mmask8)-1); })
+
+#define _mm256_mask_cmp_epi32_mask(m, a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)(__m256i)(a), \
+                                        (__v8si)(__m256i)(b), \
+                                        (p), (__mmask8)(m)); })
+
+#define _mm256_cmp_epu32_mask(a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)(__m256i)(a), \
+                                         (__v8si)(__m256i)(b), \
+                                         (p), (__mmask8)-1); })
+
+#define _mm256_mask_cmp_epu32_mask(m, a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)(__m256i)(a), \
+                                         (__v8si)(__m256i)(b), \
+                                         (p), (__mmask8)(m)); })
+
+#define _mm_cmp_epi64_mask(a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)(__m128i)(a), \
+                                        (__v2di)(__m128i)(b), \
+                                        (p), (__mmask8)-1); })
+
+#define _mm_mask_cmp_epi64_mask(m, a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)(__m128i)(a), \
+                                        (__v2di)(__m128i)(b), \
+                                        (p), (__mmask8)(m)); })
+
+#define _mm_cmp_epu64_mask(a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)(__m128i)(a), \
+                                         (__v2di)(__m128i)(b), \
+                                         (p), (__mmask8)-1); })
+
+#define _mm_mask_cmp_epu64_mask(m, a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)(__m128i)(a), \
+                                         (__v2di)(__m128i)(b), \
+                                         (p), (__mmask8)(m)); })
+
+#define _mm256_cmp_epi64_mask(a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)(__m256i)(a), \
+                                        (__v4di)(__m256i)(b), \
+                                        (p), (__mmask8)-1); })
+
+#define _mm256_mask_cmp_epi64_mask(m, a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)(__m256i)(a), \
+                                        (__v4di)(__m256i)(b), \
+                                        (p), (__mmask8)(m)); })
+
+#define _mm256_cmp_epu64_mask(a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)(__m256i)(a), \
+                                         (__v4di)(__m256i)(b), \
+                                         (p), (__mmask8)-1); })
+
+#define _mm256_mask_cmp_epu64_mask(m, a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)(__m256i)(a), \
+                                         (__v4di)(__m256i)(b), \
+                                         (p), (__mmask8)(m)); })
+
+#define _mm256_cmp_ps_mask(a, b, p)  __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpps256_mask((__v8sf)(__m256)(a), \
+                                         (__v8sf)(__m256)(b), \
+                                         (p), (__mmask8)-1); })
+
+#define _mm256_mask_cmp_ps_mask(m, a, b, p)  __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpps256_mask((__v8sf)(__m256)(a), \
+                                         (__v8sf)(__m256)(b), \
+                                         (p), (__mmask8)(m)); })
+
+#define _mm256_cmp_pd_mask(a, b, p)  __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmppd256_mask((__v4df)(__m256d)(a), \
+                                         (__v4df)(__m256d)(b), \
+                                         (p), (__mmask8)-1); })
+
+#define _mm256_mask_cmp_pd_mask(m, a, b, p)  __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmppd256_mask((__v4df)(__m256d)(a), \
+                                         (__v4df)(__m256d)(b), \
+                                         (p), (__mmask8)(m)); })
+
+#define _mm128_cmp_ps_mask(a, b, p)  __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpps128_mask((__v4sf)(__m128)(a), \
+                                         (__v4sf)(__m128)(b), \
+                                         (p), (__mmask8)-1); })
+
+#define _mm128_mask_cmp_ps_mask(m, a, b, p)  __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpps128_mask((__v4sf)(__m128)(a), \
+                                         (__v4sf)(__m128)(b), \
+                                         (p), (__mmask8)(m)); })
+
+#define _mm128_cmp_pd_mask(a, b, p)  __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmppd128_mask((__v2df)(__m128d)(a), \
+                                         (__v2df)(__m128d)(b), \
+                                         (p), (__mmask8)-1); })
+
+#define _mm128_mask_cmp_pd_mask(m, a, b, p)  __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmppd128_mask((__v2df)(__m128d)(a), \
+                                         (__v2df)(__m128d)(b), \
+                                         (p), (__mmask8)(m)); })
+
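+/* FMA masking: _mask blends unselected elements from the first source __A,
+ * _mask3 blends them from __C, and _maskz zeroes them. */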
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_fmadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddpd128_mask ((__v2df) __A,
+                                                    (__v2df) __B,
+                                                    (__v2df) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask3_fmadd_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U)
+{
+  return (__m128d) __builtin_ia32_vfmaddpd128_mask3 ((__v2df) __A,
+                                                     (__v2df) __B,
+                                                     (__v2df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_fmadd_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddpd128_maskz ((__v2df) __A,
+                                                     (__v2df) __B,
+                                                     (__v2df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_fmsub_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddpd128_mask ((__v2df) __A,
+                                                    (__v2df) __B,
+                                                    -(__v2df) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_fmsub_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddpd128_maskz ((__v2df) __A,
+                                                     (__v2df) __B,
+                                                     -(__v2df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask3_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U)
+{
+  return (__m128d) __builtin_ia32_vfmaddpd128_mask3 (-(__v2df) __A,
+                                                     (__v2df) __B,
+                                                     (__v2df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_fnmadd_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddpd128_maskz (-(__v2df) __A,
+                                                     (__v2df) __B,
+                                                     (__v2df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_fnmsub_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddpd128_maskz (-(__v2df) __A,
+                                                     (__v2df) __B,
+                                                     -(__v2df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_fmadd_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddpd256_mask ((__v4df) __A,
+                                                    (__v4df) __B,
+                                                    (__v4df) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask3_fmadd_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U)
+{
+  return (__m256d) __builtin_ia32_vfmaddpd256_mask3 ((__v4df) __A,
+                                                     (__v4df) __B,
+                                                     (__v4df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_fmadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddpd256_maskz ((__v4df) __A,
+                                                     (__v4df) __B,
+                                                     (__v4df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_fmsub_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddpd256_mask ((__v4df) __A,
+                                                    (__v4df) __B,
+                                                    -(__v4df) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_fmsub_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddpd256_maskz ((__v4df) __A,
+                                                     (__v4df) __B,
+                                                     -(__v4df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask3_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U)
+{
+  return (__m256d) __builtin_ia32_vfmaddpd256_mask3 (-(__v4df) __A,
+                                                     (__v4df) __B,
+                                                     (__v4df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_fnmadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddpd256_maskz (-(__v4df) __A,
+                                                     (__v4df) __B,
+                                                     (__v4df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_fnmsub_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddpd256_maskz (-(__v4df) __A,
+                                                     (__v4df) __B,
+                                                     -(__v4df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_fmadd_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddps128_mask ((__v4sf) __A,
+                                                   (__v4sf) __B,
+                                                   (__v4sf) __C,
+                                                   (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask3_fmadd_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
+{
+  return (__m128) __builtin_ia32_vfmaddps128_mask3 ((__v4sf) __A,
+                                                    (__v4sf) __B,
+                                                    (__v4sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_fmadd_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddps128_maskz ((__v4sf) __A,
+                                                    (__v4sf) __B,
+                                                    (__v4sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_fmsub_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddps128_mask ((__v4sf) __A,
+                                                   (__v4sf) __B,
+                                                   -(__v4sf) __C,
+                                                   (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_fmsub_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddps128_maskz ((__v4sf) __A,
+                                                    (__v4sf) __B,
+                                                    -(__v4sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask3_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
+{
+  return (__m128) __builtin_ia32_vfmaddps128_mask3 (-(__v4sf) __A,
+                                                    (__v4sf) __B,
+                                                    (__v4sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_fnmadd_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddps128_maskz (-(__v4sf) __A,
+                                                    (__v4sf) __B,
+                                                    (__v4sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_fnmsub_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddps128_maskz (-(__v4sf) __A,
+                                                    (__v4sf) __B,
+                                                    -(__v4sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_fmadd_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddps256_mask ((__v8sf) __A,
+                                                   (__v8sf) __B,
+                                                   (__v8sf) __C,
+                                                   (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask3_fmadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U)
+{
+  return (__m256) __builtin_ia32_vfmaddps256_mask3 ((__v8sf) __A,
+                                                    (__v8sf) __B,
+                                                    (__v8sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_fmadd_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddps256_maskz ((__v8sf) __A,
+                                                    (__v8sf) __B,
+                                                    (__v8sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_fmsub_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddps256_mask ((__v8sf) __A,
+                                                   (__v8sf) __B,
+                                                   -(__v8sf) __C,
+                                                   (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_fmsub_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddps256_maskz ((__v8sf) __A,
+                                                    (__v8sf) __B,
+                                                    -(__v8sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask3_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U)
+{
+  return (__m256) __builtin_ia32_vfmaddps256_mask3 (-(__v8sf) __A,
+                                                    (__v8sf) __B,
+                                                    (__v8sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_fnmadd_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddps256_maskz (-(__v8sf) __A,
+                                                    (__v8sf) __B,
+                                                    (__v8sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_fnmsub_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddps256_maskz (-(__v8sf) __A,
+                                                    (__v8sf) __B,
+                                                    -(__v8sf) __C,
+                                                    (__mmask8) __U);
+}
+
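+/* fmaddsub alternates the sign of __C across lanes (even-indexed lanes
+ * compute __A*__B - __C, odd-indexed lanes compute __A*__B + __C);
+ * fmsubadd uses the opposite pattern. */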
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_fmaddsub_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddsubpd128_mask ((__v2df) __A,
+                                                       (__v2df) __B,
+                                                       (__v2df) __C,
+                                                       (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask3_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U)
+{
+  return (__m128d) __builtin_ia32_vfmaddsubpd128_mask3 ((__v2df) __A,
+                                                        (__v2df) __B,
+                                                        (__v2df) __C,
+                                                        (__mmask8)
+                                                        __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_fmaddsub_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddsubpd128_maskz ((__v2df) __A,
+                                                        (__v2df) __B,
+                                                        (__v2df) __C,
+                                                        (__mmask8)
+                                                        __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_fmsubadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddsubpd128_mask ((__v2df) __A,
+                                                       (__v2df) __B,
+                                                       -(__v2df) __C,
+                                                       (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_fmsubadd_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddsubpd128_maskz ((__v2df) __A,
+                                                        (__v2df) __B,
+                                                        -(__v2df) __C,
+                                                        (__mmask8)
+                                                        __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_fmaddsub_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddsubpd256_mask ((__v4df) __A,
+                                                       (__v4df) __B,
+                                                       (__v4df) __C,
+                                                       (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask3_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U)
+{
+  return (__m256d) __builtin_ia32_vfmaddsubpd256_mask3 ((__v4df) __A,
+                                                        (__v4df) __B,
+                                                        (__v4df) __C,
+                                                        (__mmask8)
+                                                        __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_fmaddsub_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddsubpd256_maskz ((__v4df) __A,
+                                                        (__v4df) __B,
+                                                        (__v4df) __C,
+                                                        (__mmask8)
+                                                        __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_fmsubadd_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddsubpd256_mask ((__v4df) __A,
+                                                       (__v4df) __B,
+                                                       -(__v4df) __C,
+                                                       (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_fmsubadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddsubpd256_maskz ((__v4df) __A,
+                                                        (__v4df) __B,
+                                                        -(__v4df) __C,
+                                                        (__mmask8)
+                                                        __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_fmaddsub_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddsubps128_mask ((__v4sf) __A,
+                                                      (__v4sf) __B,
+                                                      (__v4sf) __C,
+                                                      (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask3_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
+{
+  return (__m128) __builtin_ia32_vfmaddsubps128_mask3 ((__v4sf) __A,
+                                                       (__v4sf) __B,
+                                                       (__v4sf) __C,
+                                                       (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_fmaddsub_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddsubps128_maskz ((__v4sf) __A,
+                                                       (__v4sf) __B,
+                                                       (__v4sf) __C,
+                                                       (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_fmsubadd_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddsubps128_mask ((__v4sf) __A,
+                                                      (__v4sf) __B,
+                                                      -(__v4sf) __C,
+                                                      (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_fmsubadd_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddsubps128_maskz ((__v4sf) __A,
+                                                       (__v4sf) __B,
+                                                       -(__v4sf) __C,
+                                                       (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_fmaddsub_ps(__m256 __A, __mmask8 __U, __m256 __B,
+                         __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddsubps256_mask ((__v8sf) __A,
+                                                      (__v8sf) __B,
+                                                      (__v8sf) __C,
+                                                      (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask3_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U)
+{
+  return (__m256) __builtin_ia32_vfmaddsubps256_mask3 ((__v8sf) __A,
+                                                       (__v8sf) __B,
+                                                       (__v8sf) __C,
+                                                       (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_fmaddsub_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddsubps256_maskz ((__v8sf) __A,
+                                                       (__v8sf) __B,
+                                                       (__v8sf) __C,
+                                                       (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_fmsubadd_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddsubps256_mask ((__v8sf) __A,
+                                                      (__v8sf) __B,
+                                                      -(__v8sf) __C,
+                                                      (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_fmsubadd_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddsubps256_maskz ((__v8sf) __A,
+                                                       (__v8sf) __B,
+                                                       -(__v8sf) __C,
+                                                       (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask3_fmsub_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U)
+{
+  return (__m128d) __builtin_ia32_vfmsubpd128_mask3 ((__v2df) __A,
+                                                     (__v2df) __B,
+                                                     (__v2df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask3_fmsub_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U)
+{
+  return (__m256d) __builtin_ia32_vfmsubpd256_mask3 ((__v4df) __A,
+                                                     (__v4df) __B,
+                                                     (__v4df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask3_fmsub_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
+{
+  return (__m128) __builtin_ia32_vfmsubps128_mask3 ((__v4sf) __A,
+                                                    (__v4sf) __B,
+                                                    (__v4sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask3_fmsub_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U)
+{
+  return (__m256) __builtin_ia32_vfmsubps256_mask3 ((__v8sf) __A,
+                                                    (__v8sf) __B,
+                                                    (__v8sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask3_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U)
+{
+  return (__m128d) __builtin_ia32_vfmsubaddpd128_mask3 ((__v2df) __A,
+                                                        (__v2df) __B,
+                                                        (__v2df) __C,
+                                                        (__mmask8)
+                                                        __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask3_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U)
+{
+  return (__m256d) __builtin_ia32_vfmsubaddpd256_mask3 ((__v4df) __A,
+                                                        (__v4df) __B,
+                                                        (__v4df) __C,
+                                                        (__mmask8)
+                                                        __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask3_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
+{
+  return (__m128) __builtin_ia32_vfmsubaddps128_mask3 ((__v4sf) __A,
+                                                       (__v4sf) __B,
+                                                       (__v4sf) __C,
+                                                       (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask3_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U)
+{
+  return (__m256) __builtin_ia32_vfmsubaddps256_mask3 ((__v8sf) __A,
+                                                       (__v8sf) __B,
+                                                       (__v8sf) __C,
+                                                       (__mmask8) __U);
+}
+
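+/* fnmadd computes -(__A*__B) + __C and fnmsub computes -(__A*__B) - __C;
+ * these masked forms call the dedicated vfnmadd/vfnmsub builtins instead of
+ * negating an operand as the maskz forms above do. */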
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_fnmadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfnmaddpd128_mask ((__v2df) __A,
+                                                     (__v2df) __B,
+                                                     (__v2df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_fnmadd_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfnmaddpd256_mask ((__v4df) __A,
+                                                     (__v4df) __B,
+                                                     (__v4df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_fnmadd_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfnmaddps128_mask ((__v4sf) __A,
+                                                    (__v4sf) __B,
+                                                    (__v4sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_fnmadd_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfnmaddps256_mask ((__v8sf) __A,
+                                                    (__v8sf) __B,
+                                                    (__v8sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_fnmsub_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfnmsubpd128_mask ((__v2df) __A,
+                                                     (__v2df) __B,
+                                                     (__v2df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask3_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U)
+{
+  return (__m128d) __builtin_ia32_vfnmsubpd128_mask3 ((__v2df) __A,
+                                                      (__v2df) __B,
+                                                      (__v2df) __C,
+                                                      (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_fnmsub_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfnmsubpd256_mask ((__v4df) __A,
+                                                     (__v4df) __B,
+                                                     (__v4df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask3_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U)
+{
+  return (__m256d) __builtin_ia32_vfnmsubpd256_mask3 ((__v4df) __A,
+                                                      (__v4df) __B,
+                                                      (__v4df) __C,
+                                                      (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_fnmsub_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfnmsubps128_mask ((__v4sf) __A,
+                                                    (__v4sf) __B,
+                                                    (__v4sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask3_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
+{
+  return (__m128) __builtin_ia32_vfnmsubps128_mask3 ((__v4sf) __A,
+                                                     (__v4sf) __B,
+                                                     (__v4sf) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_fnmsub_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfnmsubps256_mask ((__v8sf) __A,
+                                                    (__v8sf) __B,
+                                                    (__v8sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask3_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U)
+{
+  return (__m256) __builtin_ia32_vfnmsubps256_mask3 ((__v8sf) __A,
+                                                     (__v8sf) __B,
+                                                     (__v8sf) __C,
+                                                     (__mmask8) __U);
+}
+
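+/* Masked element-wise arithmetic: the _mask forms take masked-off lanes from
+ * the pass-through vector __W, the _maskz forms write zeros instead. */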
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_add_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_addpd128_mask ((__v2df) __A,
+             (__v2df) __B,
+             (__v2df) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_add_pd (__mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_addpd128_mask ((__v2df) __A,
+             (__v2df) __B,
+             (__v2df)
+             _mm_setzero_pd (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_add_pd (__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d) __builtin_ia32_addpd256_mask ((__v4df) __A,
+             (__v4df) __B,
+             (__v4df) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_add_pd (__mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d) __builtin_ia32_addpd256_mask ((__v4df) __A,
+             (__v4df) __B,
+             (__v4df)
+             _mm256_setzero_pd (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_add_ps (__m128 __W, __mmask16 __U, __m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_addps128_mask ((__v4sf) __A,
+            (__v4sf) __B,
+            (__v4sf) __W,
+            (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_add_ps (__mmask16 __U, __m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_addps128_mask ((__v4sf) __A,
+            (__v4sf) __B,
+            (__v4sf)
+            _mm_setzero_ps (),
+            (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_add_ps (__m256 __W, __mmask16 __U, __m256 __A, __m256 __B) {
+  return (__m256) __builtin_ia32_addps256_mask ((__v8sf) __A,
+            (__v8sf) __B,
+            (__v8sf) __W,
+            (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_add_ps (__mmask16 __U, __m256 __A, __m256 __B) {
+  return (__m256) __builtin_ia32_addps256_mask ((__v8sf) __A,
+            (__v8sf) __B,
+            (__v8sf)
+            _mm256_setzero_ps (),
+            (__mmask8) __U);
+}
+
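+/* blendm: each result element is taken from __W where the mask bit is set
+ * and from __A where it is clear. */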
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_blend_epi32 (__mmask8 __U, __m128i __A, __m128i __W) {
+  return (__m128i) __builtin_ia32_blendmd_128_mask ((__v4si) __A,
+                (__v4si) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_blend_epi32 (__mmask8 __U, __m256i __A, __m256i __W) {
+  return (__m256i) __builtin_ia32_blendmd_256_mask ((__v8si) __A,
+                (__v8si) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_blend_pd (__mmask8 __U, __m128d __A, __m128d __W) {
+  return (__m128d) __builtin_ia32_blendmpd_128_mask ((__v2df) __A,
+                 (__v2df) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_blend_pd (__mmask8 __U, __m256d __A, __m256d __W) {
+  return (__m256d) __builtin_ia32_blendmpd_256_mask ((__v4df) __A,
+                 (__v4df) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_blend_ps (__mmask8 __U, __m128 __A, __m128 __W) {
+  return (__m128) __builtin_ia32_blendmps_128_mask ((__v4sf) __A,
+                (__v4sf) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_blend_ps (__mmask8 __U, __m256 __A, __m256 __W) {
+  return (__m256) __builtin_ia32_blendmps_256_mask ((__v8sf) __A,
+                (__v8sf) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_blend_epi64 (__mmask8 __U, __m128i __A, __m128i __W) {
+  return (__m128i) __builtin_ia32_blendmq_128_mask ((__v2di) __A,
+                (__v2di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_blend_epi64 (__mmask8 __U, __m256i __A, __m256i __W) {
+  return (__m256i) __builtin_ia32_blendmq_256_mask ((__v4di) __A,
+                (__v4di) __W,
+                (__mmask8) __U);
+}
+
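+/* compress: mask-selected elements are packed contiguously into the low
+ * lanes of the result; the remaining lanes come from __W or are zeroed. */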
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_compress_pd (__m128d __W, __mmask8 __U, __m128d __A) {
+  return (__m128d) __builtin_ia32_compressdf128_mask ((__v2df) __A,
+                  (__v2df) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_compress_pd (__mmask8 __U, __m128d __A) {
+  return (__m128d) __builtin_ia32_compressdf128_mask ((__v2df) __A,
+                  (__v2df)
+                  _mm_setzero_pd (),
+                  (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_compress_pd (__m256d __W, __mmask8 __U, __m256d __A) {
+  return (__m256d) __builtin_ia32_compressdf256_mask ((__v4df) __A,
+                  (__v4df) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_compress_pd (__mmask8 __U, __m256d __A) {
+  return (__m256d) __builtin_ia32_compressdf256_mask ((__v4df) __A,
+                  (__v4df)
+                  _mm256_setzero_pd (),
+                  (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_compress_epi64 (__m128i __W, __mmask8 __U, __m128i __A) {
+  return (__m128i) __builtin_ia32_compressdi128_mask ((__v2di) __A,
+                  (__v2di) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_compress_epi64 (__mmask8 __U, __m128i __A) {
+  return (__m128i) __builtin_ia32_compressdi128_mask ((__v2di) __A,
+                  (__v2di)
+                  _mm_setzero_si128 (),
+                  (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_compress_epi64 (__m256i __W, __mmask8 __U, __m256i __A) {
+  return (__m256i) __builtin_ia32_compressdi256_mask ((__v4di) __A,
+                  (__v4di) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_compress_epi64 (__mmask8 __U, __m256i __A) {
+  return (__m256i) __builtin_ia32_compressdi256_mask ((__v4di) __A,
+                  (__v4di)
+                  _mm256_setzero_si256 (),
+                  (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_compress_ps (__m128 __W, __mmask8 __U, __m128 __A) {
+  return (__m128) __builtin_ia32_compresssf128_mask ((__v4sf) __A,
+                 (__v4sf) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_compress_ps (__mmask8 __U, __m128 __A) {
+  return (__m128) __builtin_ia32_compresssf128_mask ((__v4sf) __A,
+                 (__v4sf)
+                 _mm_setzero_ps (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_compress_ps (__m256 __W, __mmask8 __U, __m256 __A) {
+  return (__m256) __builtin_ia32_compresssf256_mask ((__v8sf) __A,
+                 (__v8sf) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_compress_ps (__mmask8 __U, __m256 __A) {
+  return (__m256) __builtin_ia32_compresssf256_mask ((__v8sf) __A,
+                 (__v8sf)
+                 _mm256_setzero_ps (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_compress_epi32 (__m128i __W, __mmask8 __U, __m128i __A) {
+  return (__m128i) __builtin_ia32_compresssi128_mask ((__v4si) __A,
+                  (__v4si) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_compress_epi32 (__mmask8 __U, __m128i __A) {
+  return (__m128i) __builtin_ia32_compresssi128_mask ((__v4si) __A,
+                  (__v4si)
+                  _mm_setzero_si128 (),
+                  (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_compress_epi32 (__m256i __W, __mmask8 __U, __m256i __A) {
+  return (__m256i) __builtin_ia32_compresssi256_mask ((__v8si) __A,
+                  (__v8si) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_compress_epi32 (__mmask8 __U, __m256i __A) {
+  return (__m256i) __builtin_ia32_compresssi256_mask ((__v8si) __A,
+                  (__v8si)
+                  _mm256_setzero_si256 (),
+                  (__mmask8) __U);
+}
+
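+/* compressstoreu: the mask-selected elements are stored contiguously to the
+ * (possibly unaligned) memory location __P. */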
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_compressstoreu_pd (void *__P, __mmask8 __U, __m128d __A) {
+  __builtin_ia32_compressstoredf128_mask ((__v2df *) __P,
+            (__v2df) __A,
+            (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_compressstoreu_pd (void *__P, __mmask8 __U, __m256d __A) {
+  __builtin_ia32_compressstoredf256_mask ((__v4df *) __P,
+            (__v4df) __A,
+            (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_compressstoreu_epi64 (void *__P, __mmask8 __U, __m128i __A) {
+  __builtin_ia32_compressstoredi128_mask ((__v2di *) __P,
+            (__v2di) __A,
+            (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_compressstoreu_epi64 (void *__P, __mmask8 __U, __m256i __A) {
+  __builtin_ia32_compressstoredi256_mask ((__v4di *) __P,
+            (__v4di) __A,
+            (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_compressstoreu_ps (void *__P, __mmask8 __U, __m128 __A) {
+  __builtin_ia32_compressstoresf128_mask ((__v4sf *) __P,
+            (__v4sf) __A,
+            (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_compressstoreu_ps (void *__P, __mmask8 __U, __m256 __A) {
+  __builtin_ia32_compressstoresf256_mask ((__v8sf *) __P,
+            (__v8sf) __A,
+            (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_compressstoreu_epi32 (void *__P, __mmask8 __U, __m128i __A) {
+  __builtin_ia32_compressstoresi128_mask ((__v4si *) __P,
+            (__v4si) __A,
+            (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_compressstoreu_epi32 (void *__P, __mmask8 __U, __m256i __A) {
+  __builtin_ia32_compressstoresi256_mask ((__v8si *) __P,
+            (__v8si) __A,
+            (__mmask8) __U);
+}
+
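+/* Masked conversions between 32-bit integer vectors and float/double
+ * vectors. */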
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi32_pd (__m128d __W, __mmask8 __U, __m128i __A) {
+  return (__m128d) __builtin_ia32_cvtdq2pd128_mask ((__v4si) __A,
+                (__v2df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepi32_pd (__mmask8 __U, __m128i __A) {
+  return (__m128d) __builtin_ia32_cvtdq2pd128_mask ((__v4si) __A,
+                (__v2df)
+                _mm_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi32_pd (__m256d __W, __mmask8 __U, __m128i __A) {
+  return (__m256d) __builtin_ia32_cvtdq2pd256_mask ((__v4si) __A,
+                (__v4df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepi32_pd (__mmask8 __U, __m128i __A) {
+  return (__m256d) __builtin_ia32_cvtdq2pd256_mask ((__v4si) __A,
+                (__v4df)
+                _mm256_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi32_ps (__m128 __W, __mmask8 __U, __m128i __A) {
+  return (__m128) __builtin_ia32_cvtdq2ps128_mask ((__v4si) __A,
+               (__v4sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepi32_ps (__mmask16 __U, __m128i __A) {
+  return (__m128) __builtin_ia32_cvtdq2ps128_mask ((__v4si) __A,
+               (__v4sf)
+               _mm_setzero_ps (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi32_ps (__m256 __W, __mmask8 __U, __m256i __A) {
+  return (__m256) __builtin_ia32_cvtdq2ps256_mask ((__v8si) __A,
+               (__v8sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepi32_ps (__mmask16 __U, __m256i __A) {
+  return (__m256) __builtin_ia32_cvtdq2ps256_mask ((__v8si) __A,
+               (__v8sf)
+               _mm256_setzero_ps (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtpd_epi32 (__m128i __W, __mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2dq128_mask ((__v2df) __A,
+                (__v4si) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtpd_epi32 (__mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2dq128_mask ((__v2df) __A,
+                (__v4si)
+                _mm_setzero_si128 (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtpd_epi32 (__m128i __W, __mmask8 __U, __m256d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2dq256_mask ((__v4df) __A,
+                (__v4si) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtpd_epi32 (__mmask8 __U, __m256d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2dq256_mask ((__v4df) __A,
+                (__v4si)
+                _mm_setzero_si128 (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_cvtpd_ps (__m128 __W, __mmask8 __U, __m128d __A) {
+  return (__m128) __builtin_ia32_cvtpd2ps_mask ((__v2df) __A,
+            (__v4sf) __W,
+            (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_cvtpd_ps (__mmask8 __U, __m128d __A) {
+  return (__m128) __builtin_ia32_cvtpd2ps_mask ((__v2df) __A,
+            (__v4sf)
+            _mm_setzero_ps (),
+            (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm256_mask_cvtpd_ps (__m128 __W, __mmask8 __U, __m256d __A) {
+  return (__m128) __builtin_ia32_cvtpd2ps256_mask ((__v4df) __A,
+               (__v4sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtpd_ps (__mmask8 __U, __m256d __A) {
+  return (__m128) __builtin_ia32_cvtpd2ps256_mask ((__v4df) __A,
+               (__v4sf)
+               _mm_setzero_ps (),
+               (__mmask8) __U);
+}
+
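+/* Conversions to unsigned 32-bit integers; the unmasked forms pass an
+ * all-ones mask so every element is converted. */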
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtpd_epu32 (__m128d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2udq128_mask ((__v2df) __A,
+                 (__v4si)
+                 _mm_setzero_si128 (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtpd_epu32 (__m128i __W, __mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2udq128_mask ((__v2df) __A,
+                 (__v4si) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtpd_epu32 (__mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2udq128_mask ((__v2df) __A,
+                 (__v4si)
+                 _mm_setzero_si128 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtpd_epu32 (__m256d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2udq256_mask ((__v4df) __A,
+                 (__v4si)
+                 _mm_setzero_si128 (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtpd_epu32 (__m128i __W, __mmask8 __U, __m256d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2udq256_mask ((__v4df) __A,
+                 (__v4si) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtpd_epu32 (__mmask8 __U, __m256d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2udq256_mask ((__v4df) __A,
+                 (__v4si)
+                 _mm_setzero_si128 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtps_epi32 (__m128i __W, __mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvtps2dq128_mask ((__v4sf) __A,
+                (__v4si) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtps_epi32 (__mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvtps2dq128_mask ((__v4sf) __A,
+                (__v4si)
+                _mm_setzero_si128 (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtps_epi32 (__m256i __W, __mmask8 __U, __m256 __A) {
+  return (__m256i) __builtin_ia32_cvtps2dq256_mask ((__v8sf) __A,
+                (__v8si) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtps_epi32 (__mmask8 __U, __m256 __A) {
+  return (__m256i) __builtin_ia32_cvtps2dq256_mask ((__v8sf) __A,
+                (__v8si)
+                _mm256_setzero_si256 (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_cvtps_pd (__m128d __W, __mmask8 __U, __m128 __A) {
+  return (__m128d) __builtin_ia32_cvtps2pd128_mask ((__v4sf) __A,
+                (__v2df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_cvtps_pd (__mmask8 __U, __m128 __A) {
+  return (__m128d) __builtin_ia32_cvtps2pd128_mask ((__v4sf) __A,
+                (__v2df)
+                _mm_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_cvtps_pd (__m256d __W, __mmask8 __U, __m128 __A) {
+  return (__m256d) __builtin_ia32_cvtps2pd256_mask ((__v4sf) __A,
+                (__v4df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtps_pd (__mmask8 __U, __m128 __A) {
+  return (__m256d) __builtin_ia32_cvtps2pd256_mask ((__v4sf) __A,
+                (__v4df)
+                _mm256_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtps_epu32 (__m128 __A) {
+  return (__m128i) __builtin_ia32_cvtps2udq128_mask ((__v4sf) __A,
+                 (__v4si)
+                 _mm_setzero_si128 (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtps_epu32 (__m128i __W, __mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvtps2udq128_mask ((__v4sf) __A,
+                 (__v4si) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtps_epu32 (__mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvtps2udq128_mask ((__v4sf) __A,
+                 (__v4si)
+                 _mm_setzero_si128 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtps_epu32 (__m256 __A) {
+  return (__m256i) __builtin_ia32_cvtps2udq256_mask ((__v8sf) __A,
+                 (__v8si)
+                 _mm256_setzero_si256 (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtps_epu32 (__m256i __W, __mmask8 __U, __m256 __A) {
+  return (__m256i) __builtin_ia32_cvtps2udq256_mask ((__v8sf) __A,
+                 (__v8si) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtps_epu32 (__mmask8 __U, __m256 __A) {
+  return (__m256i) __builtin_ia32_cvtps2udq256_mask ((__v8sf) __A,
+                 (__v8si)
+                 _mm256_setzero_si256 (),
+                 (__mmask8) __U);
+}
+
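+/* The cvtt variants convert with truncation toward zero instead of the
+ * current rounding mode. */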
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvttpd_epi32 (__m128i __W, __mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2dq128_mask ((__v2df) __A,
+                 (__v4si) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvttpd_epi32 (__mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2dq128_mask ((__v2df) __A,
+                 (__v4si)
+                 _mm_setzero_si128 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvttpd_epi32 (__m128i __W, __mmask8 __U, __m256d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2dq256_mask ((__v4df) __A,
+                 (__v4si) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvttpd_epi32 (__mmask8 __U, __m256d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2dq256_mask ((__v4df) __A,
+                 (__v4si)
+                 _mm_setzero_si128 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvttpd_epu32 (__m128d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2udq128_mask ((__v2df) __A,
+                  (__v4si)
+                  _mm_setzero_si128 (),
+                  (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvttpd_epu32 (__m128i __W, __mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2udq128_mask ((__v2df) __A,
+                  (__v4si) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvttpd_epu32 (__mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2udq128_mask ((__v2df) __A,
+                  (__v4si)
+                  _mm_setzero_si128 (),
+                  (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvttpd_epu32 (__m256d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2udq256_mask ((__v4df) __A,
+                  (__v4si)
+                  _mm_setzero_si128 (),
+                  (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvttpd_epu32 (__m128i __W, __mmask8 __U, __m256d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2udq256_mask ((__v4df) __A,
+                  (__v4si) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvttpd_epu32 (__mmask8 __U, __m256d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2udq256_mask ((__v4df) __A,
+                  (__v4si)
+                  _mm_setzero_si128 (),
+                  (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvttps_epi32 (__m128i __W, __mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvttps2dq128_mask ((__v4sf) __A,
+                 (__v4si) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvttps_epi32 (__mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvttps2dq128_mask ((__v4sf) __A,
+                 (__v4si)
+                 _mm_setzero_si128 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvttps_epi32 (__m256i __W, __mmask8 __U, __m256 __A) {
+  return (__m256i) __builtin_ia32_cvttps2dq256_mask ((__v8sf) __A,
+                 (__v8si) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvttps_epi32 (__mmask8 __U, __m256 __A) {
+  return (__m256i) __builtin_ia32_cvttps2dq256_mask ((__v8sf) __A,
+                 (__v8si)
+                 _mm256_setzero_si256 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvttps_epu32 (__m128 __A) {
+  return (__m128i) __builtin_ia32_cvttps2udq128_mask ((__v4sf) __A,
+                  (__v4si)
+                  _mm_setzero_si128 (),
+                  (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvttps_epu32 (__m128i __W, __mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvttps2udq128_mask ((__v4sf) __A,
+                  (__v4si) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvttps_epu32 (__mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvttps2udq128_mask ((__v4sf) __A,
+                  (__v4si)
+                  _mm_setzero_si128 (),
+                  (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvttps_epu32 (__m256 __A) {
+  return (__m256i) __builtin_ia32_cvttps2udq256_mask ((__v8sf) __A,
+                  (__v8si)
+                  _mm256_setzero_si256 (),
+                  (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvttps_epu32 (__m256i __W, __mmask8 __U, __m256 __A) {
+  return (__m256i) __builtin_ia32_cvttps2udq256_mask ((__v8sf) __A,
+                  (__v8si) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvttps_epu32 (__mmask8 __U, __m256 __A) {
+  return (__m256i) __builtin_ia32_cvttps2udq256_mask ((__v8sf) __A,
+                  (__v8si)
+                  _mm256_setzero_si256 (),
+                  (__mmask8) __U);
+}
+
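+/* cvtepu32 converts unsigned 32-bit integers to floating point. */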
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cvtepu32_pd (__m128i __A) {
+  return (__m128d) __builtin_ia32_cvtudq2pd128_mask ((__v4si) __A,
+                 (__v2df)
+                 _mm_setzero_pd (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_cvtepu32_pd (__m128d __W, __mmask8 __U, __m128i __A) {
+  return (__m128d) __builtin_ia32_cvtudq2pd128_mask ((__v4si) __A,
+                 (__v2df) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepu32_pd (__mmask8 __U, __m128i __A) {
+  return (__m128d) __builtin_ia32_cvtudq2pd128_mask ((__v4si) __A,
+                 (__v2df)
+                 _mm_setzero_pd (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_cvtepu32_pd (__m128i __A) {
+  return (__m256d) __builtin_ia32_cvtudq2pd256_mask ((__v4si) __A,
+                 (__v4df)
+                 _mm256_setzero_pd (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepu32_pd (__m256d __W, __mmask8 __U, __m128i __A) {
+  return (__m256d) __builtin_ia32_cvtudq2pd256_mask ((__v4si) __A,
+                 (__v4df) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepu32_pd (__mmask8 __U, __m128i __A) {
+  return (__m256d) __builtin_ia32_cvtudq2pd256_mask ((__v4si) __A,
+                 (__v4df)
+                 _mm256_setzero_pd (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtepu32_ps (__m128i __A) {
+  return (__m128) __builtin_ia32_cvtudq2ps128_mask ((__v4si) __A,
+                (__v4sf)
+                _mm_setzero_ps (),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_cvtepu32_ps (__m128 __W, __mmask8 __U, __m128i __A) {
+  return (__m128) __builtin_ia32_cvtudq2ps128_mask ((__v4si) __A,
+                (__v4sf) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepu32_ps (__mmask8 __U, __m128i __A) {
+  return (__m128) __builtin_ia32_cvtudq2ps128_mask ((__v4si) __A,
+                (__v4sf)
+                _mm_setzero_ps (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_cvtepu32_ps (__m256i __A) {
+  return (__m256) __builtin_ia32_cvtudq2ps256_mask ((__v8si) __A,
+                (__v8sf)
+                _mm256_setzero_ps (),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepu32_ps (__m256 __W, __mmask8 __U, __m256i __A) {
+  return (__m256) __builtin_ia32_cvtudq2ps256_mask ((__v8si) __A,
+                (__v8sf) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepu32_ps (__mmask8 __U, __m256i __A) {
+  return (__m256) __builtin_ia32_cvtudq2ps256_mask ((__v8si) __A,
+                (__v8sf)
+                _mm256_setzero_ps (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_div_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_divpd_mask ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_div_pd (__mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_divpd_mask ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df)
+                _mm_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_div_pd (__m256d __W, __mmask8 __U, __m256d __A,
+        __m256d __B) {
+  return (__m256d) __builtin_ia32_divpd256_mask ((__v4df) __A,
+             (__v4df) __B,
+             (__v4df) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_div_pd (__mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d) __builtin_ia32_divpd256_mask ((__v4df) __A,
+             (__v4df) __B,
+             (__v4df)
+             _mm256_setzero_pd (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_div_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_divps_mask ((__v4sf) __A,
+               (__v4sf) __B,
+               (__v4sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_div_ps (__mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_divps_mask ((__v4sf) __A,
+               (__v4sf) __B,
+               (__v4sf)
+               _mm_setzero_ps (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_div_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256) __builtin_ia32_divps256_mask ((__v8sf) __A,
+            (__v8sf) __B,
+            (__v8sf) __W,
+            (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_div_ps (__mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256) __builtin_ia32_divps256_mask ((__v8sf) __A,
+            (__v8sf) __B,
+            (__v8sf)
+            _mm256_setzero_ps (),
+            (__mmask8) __U);
+}
+
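+/*
+ * Expand semantics: the low (popcount of the mask) elements of __A are
+ * distributed, in order, into the destination lanes whose mask bits are set;
+ * the remaining lanes come from __W ("_mask_") or are zeroed ("_maskz_").
+ */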
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_expand_pd (__m128d __W, __mmask8 __U, __m128d __A) {
+  return (__m128d) __builtin_ia32_expanddf128_mask ((__v2df) __A,
+                (__v2df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_expand_pd (__mmask8 __U, __m128d __A) {
+  return (__m128d) __builtin_ia32_expanddf128_mask ((__v2df) __A,
+                 (__v2df)
+                 _mm_setzero_pd (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_expand_pd (__m256d __W, __mmask8 __U, __m256d __A) {
+  return (__m256d) __builtin_ia32_expanddf256_mask ((__v4df) __A,
+                (__v4df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_expand_pd (__mmask8 __U, __m256d __A) {
+  return (__m256d) __builtin_ia32_expanddf256_mask ((__v4df) __A,
+                 (__v4df)
+                 _mm256_setzero_pd (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_expand_epi64 (__m128i __W, __mmask8 __U, __m128i __A) {
+  return (__m128i) __builtin_ia32_expanddi128_mask ((__v2di) __A,
+                (__v2di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_expand_epi64 (__mmask8 __U, __m128i __A) {
+  return (__m128i) __builtin_ia32_expanddi128_mask ((__v2di) __A,
+                 (__v2di)
+                 _mm_setzero_si128 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_expand_epi64 (__m256i __W, __mmask8 __U, __m256i __A) {
+  return (__m256i) __builtin_ia32_expanddi256_mask ((__v4di) __A,
+                (__v4di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_expand_epi64 (__mmask8 __U, __m256i __A) {
+  return (__m256i) __builtin_ia32_expanddi256_mask ((__v4di) __A,
+                 (__v4di)
+                 _mm256_setzero_si256 (),
+                 (__mmask8) __U);
+}
+
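+/*
+ * The expandloadu forms are the memory variants: contiguous elements are read
+ * from the (not necessarily aligned) pointer __P and expanded into the lanes
+ * selected by __U, exactly like the register expand operations above.
+ */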
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_expandloadu_pd (__m128d __W, __mmask8 __U, void const *__P) {
+  return (__m128d) __builtin_ia32_expandloaddf128_mask ((__v2df *) __P,
+              (__v2df) __W,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_expandloadu_pd (__mmask8 __U, void const *__P) {
+  return (__m128d) __builtin_ia32_expandloaddf128_mask ((__v2df *) __P,
+               (__v2df)
+               _mm_setzero_pd (),
+               (__mmask8)
+               __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_expandloadu_pd (__m256d __W, __mmask8 __U, void const *__P) {
+  return (__m256d) __builtin_ia32_expandloaddf256_mask ((__v4df *) __P,
+              (__v4df) __W,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_expandloadu_pd (__mmask8 __U, void const *__P) {
+  return (__m256d) __builtin_ia32_expandloaddf256_mask ((__v4df *) __P,
+               (__v4df)
+               _mm256_setzero_pd (),
+               (__mmask8)
+               __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_expandloadu_epi64 (__m128i __W, __mmask8 __U, void const *__P) {
+  return (__m128i) __builtin_ia32_expandloaddi128_mask ((__v2di *) __P,
+              (__v2di) __W,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_expandloadu_epi64 (__mmask8 __U, void const *__P) {
+  return (__m128i) __builtin_ia32_expandloaddi128_mask ((__v2di *) __P,
+               (__v2di)
+               _mm_setzero_si128 (),
+               (__mmask8)
+               __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_expandloadu_epi64 (__m256i __W, __mmask8 __U,
+             void const *__P) {
+  return (__m256i) __builtin_ia32_expandloaddi256_mask ((__v4di *) __P,
+              (__v4di) __W,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_expandloadu_epi64 (__mmask8 __U, void const *__P) {
+  return (__m256i) __builtin_ia32_expandloaddi256_mask ((__v4di *) __P,
+               (__v4di)
+               _mm256_setzero_si256 (),
+               (__mmask8)
+               __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_expandloadu_ps (__m128 __W, __mmask8 __U, void const *__P) {
+  return (__m128) __builtin_ia32_expandloadsf128_mask ((__v4sf *) __P,
+                   (__v4sf) __W,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_expandloadu_ps (__mmask8 __U, void const *__P) {
+  return (__m128) __builtin_ia32_expandloadsf128_mask ((__v4sf *) __P,
+              (__v4sf)
+              _mm_setzero_ps (),
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_expandloadu_ps (__m256 __W, __mmask8 __U, void const *__P) {
+  return (__m256) __builtin_ia32_expandloadsf256_mask ((__v8sf *) __P,
+                   (__v8sf) __W,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_expandloadu_ps (__mmask8 __U, void const *__P) {
+  return (__m256) __builtin_ia32_expandloadsf256_mask ((__v8sf *) __P,
+              (__v8sf)
+              _mm256_setzero_ps (),
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_expandloadu_epi32 (__m128i __W, __mmask8 __U, void const *__P) {
+  return (__m128i) __builtin_ia32_expandloadsi128_mask ((__v4si *) __P,
+              (__v4si) __W,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_expandloadu_epi32 (__mmask8 __U, void const *__P) {
+  return (__m128i) __builtin_ia32_expandloadsi128_mask ((__v4si *) __P,
+               (__v4si)
+               _mm_setzero_si128 (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_expandloadu_epi32 (__m256i __W, __mmask8 __U,
+             void const *__P) {
+  return (__m256i) __builtin_ia32_expandloadsi256_mask ((__v8si *) __P,
+              (__v8si) __W,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_expandloadu_epi32 (__mmask8 __U, void const *__P) {
+  return (__m256i) __builtin_ia32_expandloadsi256_mask ((__v8si *) __P,
+               (__v8si)
+               _mm256_setzero_si256 (),
+               (__mmask8)
+               __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_expand_ps (__m128 __W, __mmask8 __U, __m128 __A) {
+  return (__m128) __builtin_ia32_expandsf128_mask ((__v4sf) __A,
+               (__v4sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_expand_ps (__mmask8 __U, __m128 __A) {
+  return (__m128) __builtin_ia32_expandsf128_mask ((__v4sf) __A,
+                (__v4sf)
+                _mm_setzero_ps (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_expand_ps (__m256 __W, __mmask8 __U, __m256 __A) {
+  return (__m256) __builtin_ia32_expandsf256_mask ((__v8sf) __A,
+               (__v8sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_expand_ps (__mmask8 __U, __m256 __A) {
+  return (__m256) __builtin_ia32_expandsf256_mask ((__v8sf) __A,
+                (__v8sf)
+                _mm256_setzero_ps (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_expand_epi32 (__m128i __W, __mmask8 __U, __m128i __A) {
+  return (__m128i) __builtin_ia32_expandsi128_mask ((__v4si) __A,
+                (__v4si) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_expand_epi32 (__mmask8 __U, __m128i __A) {
+  return (__m128i) __builtin_ia32_expandsi128_mask ((__v4si) __A,
+                 (__v4si)
+                 _mm_setzero_si128 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_expand_epi32 (__m256i __W, __mmask8 __U, __m256i __A) {
+  return (__m256i) __builtin_ia32_expandsi256_mask ((__v8si) __A,
+                (__v8si) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_expand_epi32 (__mmask8 __U, __m256i __A) {
+  return (__m256i) __builtin_ia32_expandsi256_mask ((__v8si) __A,
+                 (__v8si)
+                 _mm256_setzero_si256 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_getexp_pd (__m128d __A) {
+  return (__m128d) __builtin_ia32_getexppd128_mask ((__v2df) __A,
+                (__v2df)
+                _mm_setzero_pd (),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_getexp_pd (__m128d __W, __mmask8 __U, __m128d __A) {
+  return (__m128d) __builtin_ia32_getexppd128_mask ((__v2df) __A,
+                (__v2df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_getexp_pd (__mmask8 __U, __m128d __A) {
+  return (__m128d) __builtin_ia32_getexppd128_mask ((__v2df) __A,
+                (__v2df)
+                _mm_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_getexp_pd (__m256d __A) {
+  return (__m256d) __builtin_ia32_getexppd256_mask ((__v4df) __A,
+                (__v4df)
+                _mm256_setzero_pd (),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_getexp_pd (__m256d __W, __mmask8 __U, __m256d __A) {
+  return (__m256d) __builtin_ia32_getexppd256_mask ((__v4df) __A,
+                (__v4df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_getexp_pd (__mmask8 __U, __m256d __A) {
+  return (__m256d) __builtin_ia32_getexppd256_mask ((__v4df) __A,
+                (__v4df)
+                _mm256_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_getexp_ps (__m128 __A) {
+  return (__m128) __builtin_ia32_getexpps128_mask ((__v4sf) __A,
+               (__v4sf)
+               _mm_setzero_ps (),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_getexp_ps (__m128 __W, __mmask8 __U, __m128 __A) {
+  return (__m128) __builtin_ia32_getexpps128_mask ((__v4sf) __A,
+               (__v4sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_getexp_ps (__mmask8 __U, __m128 __A) {
+  return (__m128) __builtin_ia32_getexpps128_mask ((__v4sf) __A,
+               (__v4sf)
+               _mm_setzero_ps (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_getexp_ps (__m256 __A) {
+  return (__m256) __builtin_ia32_getexpps256_mask ((__v8sf) __A,
+               (__v8sf)
+               _mm256_setzero_ps (),
+               (__mmask8) -1);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_getexp_ps (__m256 __W, __mmask8 __U, __m256 __A) {
+  return (__m256) __builtin_ia32_getexpps256_mask ((__v8sf) __A,
+               (__v8sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_getexp_ps (__mmask8 __U, __m256 __A) {
+  return (__m256) __builtin_ia32_getexpps256_mask ((__v8sf) __A,
+               (__v8sf)
+               _mm256_setzero_ps (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_max_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_maxpd_mask ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_max_pd (__mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_maxpd_mask ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df)
+                _mm_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_max_pd (__m256d __W, __mmask8 __U, __m256d __A,
+        __m256d __B) {
+  return (__m256d) __builtin_ia32_maxpd256_mask ((__v4df) __A,
+             (__v4df) __B,
+             (__v4df) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_max_pd (__mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d) __builtin_ia32_maxpd256_mask ((__v4df) __A,
+             (__v4df) __B,
+             (__v4df)
+             _mm256_setzero_pd (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_max_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_maxps_mask ((__v4sf) __A,
+               (__v4sf) __B,
+               (__v4sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_max_ps (__mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_maxps_mask ((__v4sf) __A,
+               (__v4sf) __B,
+               (__v4sf)
+               _mm_setzero_ps (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_max_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256) __builtin_ia32_maxps256_mask ((__v8sf) __A,
+            (__v8sf) __B,
+            (__v8sf) __W,
+            (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_max_ps (__mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256) __builtin_ia32_maxps256_mask ((__v8sf) __A,
+            (__v8sf) __B,
+            (__v8sf)
+            _mm256_setzero_ps (),
+            (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_min_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_minpd_mask ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_min_pd (__mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_minpd_mask ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df)
+                _mm_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_min_pd (__m256d __W, __mmask8 __U, __m256d __A,
+        __m256d __B) {
+  return (__m256d) __builtin_ia32_minpd256_mask ((__v4df) __A,
+             (__v4df) __B,
+             (__v4df) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_min_pd (__mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d) __builtin_ia32_minpd256_mask ((__v4df) __A,
+             (__v4df) __B,
+             (__v4df)
+             _mm256_setzero_pd (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_min_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_minps_mask ((__v4sf) __A,
+               (__v4sf) __B,
+               (__v4sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_min_ps (__mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_minps_mask ((__v4sf) __A,
+               (__v4sf) __B,
+               (__v4sf)
+               _mm_setzero_ps (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_min_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256) __builtin_ia32_minps256_mask ((__v8sf) __A,
+            (__v8sf) __B,
+            (__v8sf) __W,
+            (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_min_ps (__mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256) __builtin_ia32_minps256_mask ((__v8sf) __A,
+            (__v8sf) __B,
+            (__v8sf)
+            _mm256_setzero_ps (),
+            (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_mul_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_mulpd_mask ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_mul_pd (__mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_mulpd_mask ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df)
+                _mm_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_mul_pd (__m256d __W, __mmask8 __U, __m256d __A,
+        __m256d __B) {
+  return (__m256d) __builtin_ia32_mulpd256_mask ((__v4df) __A,
+             (__v4df) __B,
+             (__v4df) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_mul_pd (__mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d) __builtin_ia32_mulpd256_mask ((__v4df) __A,
+             (__v4df) __B,
+             (__v4df)
+             _mm256_setzero_pd (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_mul_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_mulps_mask ((__v4sf) __A,
+               (__v4sf) __B,
+               (__v4sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_mul_ps (__mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_mulps_mask ((__v4sf) __A,
+               (__v4sf) __B,
+               (__v4sf)
+               _mm_setzero_ps (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_mul_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256) __builtin_ia32_mulps256_mask ((__v8sf) __A,
+            (__v8sf) __B,
+            (__v8sf) __W,
+            (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_mul_ps (__mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256) __builtin_ia32_mulps256_mask ((__v8sf) __A,
+            (__v8sf) __B,
+            (__v8sf)
+            _mm256_setzero_ps (),
+            (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_abs_epi32 (__m128i __W, __mmask8 __U, __m128i __A) {
+  return (__m128i) __builtin_ia32_pabsd128_mask ((__v4si) __A,
+             (__v4si) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_abs_epi32 (__mmask8 __U, __m128i __A) {
+  return (__m128i) __builtin_ia32_pabsd128_mask ((__v4si) __A,
+             (__v4si)
+             _mm_setzero_si128 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_abs_epi32 (__m256i __W, __mmask8 __U, __m256i __A) {
+  return (__m256i) __builtin_ia32_pabsd256_mask ((__v8si) __A,
+             (__v8si) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_abs_epi32 (__mmask8 __U, __m256i __A) {
+  return (__m256i) __builtin_ia32_pabsd256_mask ((__v8si) __A,
+             (__v8si)
+             _mm256_setzero_si256 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_abs_epi64 (__m128i __A) {
+  return (__m128i) __builtin_ia32_pabsq128_mask ((__v2di) __A,
+             (__v2di)
+             _mm_setzero_si128 (),
+             (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_abs_epi64 (__m128i __W, __mmask8 __U, __m128i __A) {
+  return (__m128i) __builtin_ia32_pabsq128_mask ((__v2di) __A,
+             (__v2di) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_abs_epi64 (__mmask8 __U, __m128i __A) {
+  return (__m128i) __builtin_ia32_pabsq128_mask ((__v2di) __A,
+             (__v2di)
+             _mm_setzero_si128 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_abs_epi64 (__m256i __A) {
+  return (__m256i) __builtin_ia32_pabsq256_mask ((__v4di) __A,
+             (__v4di)
+             _mm256_setzero_si256 (),
+             (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_abs_epi64 (__m256i __W, __mmask8 __U, __m256i __A) {
+  return (__m256i) __builtin_ia32_pabsq256_mask ((__v4di) __A,
+             (__v4di) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_abs_epi64 (__mmask8 __U, __m256i __A) {
+  return (__m256i) __builtin_ia32_pabsq256_mask ((__v4di) __A,
+             (__v4di)
+             _mm256_setzero_si256 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_max_epi32 (__mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pmaxsd128_mask ((__v4si) __A,
+              (__v4si) __B,
+              (__v4si)
+              _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_max_epi32 (__m128i __W, __mmask8 __M, __m128i __A,
+        __m128i __B) {
+  return (__m128i) __builtin_ia32_pmaxsd128_mask ((__v4si) __A,
+              (__v4si) __B,
+              (__v4si) __W, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_max_epi32 (__mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pmaxsd256_mask ((__v8si) __A,
+              (__v8si) __B,
+              (__v8si)
+              _mm256_setzero_si256 (),
+              __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_max_epi32 (__m256i __W, __mmask8 __M, __m256i __A,
+           __m256i __B) {
+  return (__m256i) __builtin_ia32_pmaxsd256_mask ((__v8si) __A,
+              (__v8si) __B,
+              (__v8si) __W, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_max_epi64 (__mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pmaxsq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di)
+              _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_max_epi64 (__m128i __W, __mmask8 __M, __m128i __A,
+        __m128i __B) {
+  return (__m128i) __builtin_ia32_pmaxsq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di) __W, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_max_epi64 (__m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pmaxsq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di)
+              _mm_setzero_si128 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_max_epi64 (__mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pmaxsq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di)
+              _mm256_setzero_si256 (),
+              __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_max_epi64 (__m256i __W, __mmask8 __M, __m256i __A,
+           __m256i __B) {
+  return (__m256i) __builtin_ia32_pmaxsq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di) __W, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_max_epi64 (__m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pmaxsq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di)
+              _mm256_setzero_si256 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_max_epu32 (__mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pmaxud128_mask ((__v4si) __A,
+              (__v4si) __B,
+              (__v4si)
+              _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_max_epu32 (__m128i __W, __mmask8 __M, __m128i __A,
+        __m128i __B) {
+  return (__m128i) __builtin_ia32_pmaxud128_mask ((__v4si) __A,
+              (__v4si) __B,
+              (__v4si) __W, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_max_epu32 (__mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pmaxud256_mask ((__v8si) __A,
+              (__v8si) __B,
+              (__v8si)
+              _mm256_setzero_si256 (),
+              __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_max_epu32 (__m256i __W, __mmask8 __M, __m256i __A,
+           __m256i __B) {
+  return (__m256i) __builtin_ia32_pmaxud256_mask ((__v8si) __A,
+              (__v8si) __B,
+              (__v8si) __W, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_max_epu64 (__mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pmaxuq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di)
+              _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_max_epu64 (__m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pmaxuq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di)
+              _mm_setzero_si128 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_max_epu64 (__m128i __W, __mmask8 __M, __m128i __A,
+        __m128i __B) {
+  return (__m128i) __builtin_ia32_pmaxuq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di) __W, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_max_epu64 (__mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pmaxuq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di)
+              _mm256_setzero_si256 (),
+              __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_max_epu64 (__m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pmaxuq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di)
+              _mm256_setzero_si256 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_max_epu64 (__m256i __W, __mmask8 __M, __m256i __A,
+           __m256i __B) {
+  return (__m256i) __builtin_ia32_pmaxuq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di) __W, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_min_epi32 (__mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pminsd128_mask ((__v4si) __A,
+              (__v4si) __B,
+              (__v4si)
+              _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_min_epi32 (__m128i __W, __mmask8 __M, __m128i __A,
+        __m128i __B) {
+  return (__m128i) __builtin_ia32_pminsd128_mask ((__v4si) __A,
+              (__v4si) __B,
+              (__v4si) __W, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_min_epi32 (__mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pminsd256_mask ((__v8si) __A,
+              (__v8si) __B,
+              (__v8si)
+              _mm256_setzero_si256 (),
+              __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_min_epi32 (__m256i __W, __mmask8 __M, __m256i __A,
+           __m256i __B) {
+  return (__m256i) __builtin_ia32_pminsd256_mask ((__v8si) __A,
+              (__v8si) __B,
+              (__v8si) __W, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_min_epi64 (__m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pminsq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di)
+              _mm_setzero_si128 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_min_epi64 (__m128i __W, __mmask8 __M, __m128i __A,
+        __m128i __B) {
+  return (__m128i) __builtin_ia32_pminsq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di) __W, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_min_epi64 (__mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pminsq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di)
+              _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_min_epi64 (__m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pminsq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di)
+              _mm256_setzero_si256 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_min_epi64 (__m256i __W, __mmask8 __M, __m256i __A,
+           __m256i __B) {
+  return (__m256i) __builtin_ia32_pminsq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di) __W, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_min_epi64 (__mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pminsq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di)
+              _mm256_setzero_si256 (),
+              __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_min_epu32 (__mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pminud128_mask ((__v4si) __A,
+              (__v4si) __B,
+              (__v4si)
+              _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_min_epu32 (__m128i __W, __mmask8 __M, __m128i __A,
+        __m128i __B) {
+  return (__m128i) __builtin_ia32_pminud128_mask ((__v4si) __A,
+              (__v4si) __B,
+              (__v4si) __W, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_min_epu32 (__mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pminud256_mask ((__v8si) __A,
+              (__v8si) __B,
+              (__v8si)
+              _mm256_setzero_si256 (),
+              __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_min_epu32 (__m256i __W, __mmask8 __M, __m256i __A,
+           __m256i __B) {
+  return (__m256i) __builtin_ia32_pminud256_mask ((__v8si) __A,
+              (__v8si) __B,
+              (__v8si) __W, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_min_epu64 (__m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pminuq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di)
+              _mm_setzero_si128 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_min_epu64 (__m128i __W, __mmask8 __M, __m128i __A,
+        __m128i __B) {
+  return (__m128i) __builtin_ia32_pminuq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di) __W, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_min_epu64 (__mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pminuq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di)
+              _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_min_epu64 (__m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pminuq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di)
+              _mm256_setzero_si256 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_min_epu64 (__m256i __W, __mmask8 __M, __m256i __A,
+           __m256i __B) {
+  return (__m256i) __builtin_ia32_pminuq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di) __W, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_min_epu64 (__mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pminuq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di)
+              _mm256_setzero_si256 (),
+              __M);
+}
+
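+/*
+ * The roundscale macros wrap VRNDSCALEPD/VRNDSCALEPS: each element is rounded
+ * to a fixed number of binary fraction bits taken from the upper bits of
+ * __imm, with the rounding mode encoded in its low bits (see the instruction
+ * reference for the exact imm8 layout).
+ */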
+#define _mm_roundscale_pd(__A, __imm) __extension__ ({ \
+  (__m128d) __builtin_ia32_rndscalepd_128_mask ((__v2df) __A, \
+                   __imm, (__v2df) _mm_setzero_pd (), (__mmask8) -1); })
+
+
+#define _mm_mask_roundscale_pd(__W, __U, __A, __imm) __extension__ ({ \
+  (__m128d) __builtin_ia32_rndscalepd_128_mask ((__v2df) __A, __imm, \
+                   (__v2df) __W, (__mmask8) __U); })
+
+
+#define _mm_maskz_roundscale_pd(__U, __A, __imm) __extension__ ({ \
+  (__m128d) __builtin_ia32_rndscalepd_128_mask ((__v2df) __A, __imm, \
+                   (__v2df) _mm_setzero_pd (), (__mmask8) __U); })
+
+
+#define _mm256_roundscale_pd(__A, __imm) __extension__ ({ \
+  (__m256d) __builtin_ia32_rndscalepd_256_mask ((__v4df) __A, __imm, \
+                   (__v4df) _mm256_setzero_pd (), (__mmask8) -1); })
+
+
+#define _mm256_mask_roundscale_pd(__W, __U, __A, __imm) __extension__ ({ \
+  (__m256d) __builtin_ia32_rndscalepd_256_mask ((__v4df) __A, __imm, \
+                   (__v4df) __W, (__mmask8) __U); })
+
+
+#define _mm256_maskz_roundscale_pd(__U, __A, __imm)  __extension__ ({ \
+  (__m256d) __builtin_ia32_rndscalepd_256_mask ((__v4df) __A, __imm, \
+                   (__v4df) _mm256_setzero_pd(), (__mmask8) __U); })
+
+#define _mm_roundscale_ps(__A, __imm)  __extension__ ({ \
+  (__m128) __builtin_ia32_rndscaleps_128_mask ((__v4sf) __A, __imm, \
+                  (__v4sf) _mm_setzero_ps(), (__mmask8) -1); })
+
+
+#define _mm_mask_roundscale_ps(__W, __U, __A, __imm)  __extension__ ({ \
+  (__m128) __builtin_ia32_rndscaleps_128_mask ((__v4sf) __A, __imm, \
+                  (__v4sf) __W, (__mmask8) __U); })
+
+
+#define _mm_maskz_roundscale_ps(__U, __A, __imm)  __extension__ ({ \
+  (__m128) __builtin_ia32_rndscaleps_128_mask ((__v4sf) __A, __imm, \
+                  (__v4sf) _mm_setzero_ps(), (__mmask8) __U); })
+
+#define _mm256_roundscale_ps(__A, __imm)  __extension__ ({ \
+  (__m256) __builtin_ia32_rndscaleps_256_mask ((__v8sf) __A,__imm, \
+                  (__v8sf) _mm256_setzero_ps(), (__mmask8) -1); })
+
+#define _mm256_mask_roundscale_ps(__W, __U, __A,__imm)  __extension__ ({ \
+  (__m256) __builtin_ia32_rndscaleps_256_mask ((__v8sf) __A, __imm, \
+                  (__v8sf) __W, (__mmask8) __U); })
+
+
+#define _mm256_maskz_roundscale_ps(__U, __A, __imm)  __extension__ ({ \
+  (__m256) __builtin_ia32_rndscaleps_256_mask ((__v8sf) __A, __imm, \
+                  (__v8sf) _mm256_setzero_ps(), (__mmask8) __U); })
+
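+/*
+ * scalef computes __A * 2^floor(__B) element-wise (VSCALEFPD/VSCALEFPS), i.e.
+ * it scales each element of __A by an integral power of two taken from the
+ * corresponding element of __B.
+ */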
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_scalef_pd (__m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_scalefpd128_mask ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df)
+                _mm_setzero_pd (),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_scalef_pd (__m128d __W, __mmask8 __U, __m128d __A,
+        __m128d __B) {
+  return (__m128d) __builtin_ia32_scalefpd128_mask ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_scalef_pd (__mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_scalefpd128_mask ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df)
+                _mm_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_scalef_pd (__m256d __A, __m256d __B) {
+  return (__m256d) __builtin_ia32_scalefpd256_mask ((__v4df) __A,
+                (__v4df) __B,
+                (__v4df)
+                _mm256_setzero_pd (),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_scalef_pd (__m256d __W, __mmask8 __U, __m256d __A,
+           __m256d __B) {
+  return (__m256d) __builtin_ia32_scalefpd256_mask ((__v4df) __A,
+                (__v4df) __B,
+                (__v4df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_scalef_pd (__mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d) __builtin_ia32_scalefpd256_mask ((__v4df) __A,
+                (__v4df) __B,
+                (__v4df)
+                _mm256_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_scalef_ps (__m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_scalefps128_mask ((__v4sf) __A,
+               (__v4sf) __B,
+               (__v4sf)
+               _mm_setzero_ps (),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_scalef_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_scalefps128_mask ((__v4sf) __A,
+               (__v4sf) __B,
+               (__v4sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_scalef_ps (__mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_scalefps128_mask ((__v4sf) __A,
+               (__v4sf) __B,
+               (__v4sf)
+               _mm_setzero_ps (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_scalef_ps (__m256 __A, __m256 __B) {
+  return (__m256) __builtin_ia32_scalefps256_mask ((__v8sf) __A,
+               (__v8sf) __B,
+               (__v8sf)
+               _mm256_setzero_ps (),
+               (__mmask8) -1);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_scalef_ps (__m256 __W, __mmask8 __U, __m256 __A,
+           __m256 __B) {
+  return (__m256) __builtin_ia32_scalefps256_mask ((__v8sf) __A,
+               (__v8sf) __B,
+               (__v8sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256) __builtin_ia32_scalefps256_mask ((__v8sf) __A,
+               (__v8sf) __B,
+               (__v8sf)
+               _mm256_setzero_ps (),
+               (__mmask8) __U);
+}
+
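+/*
+ * The scatter macros store the mask-selected elements of __v1 to
+ * __addr + __index[i] * __scale; __scale must be 1, 2, 4 or 8.  For example
+ * (illustrative only):
+ *
+ *   double buf[8];
+ *   __m128i idx = _mm_set_epi64x(6, 1);    // element indices 1 and 6
+ *   __m128d v   = _mm_set_pd(3.0, 2.0);
+ *   _mm_i64scatter_pd(buf, idx, v, 8);     // buf[1] = 2.0, buf[6] = 3.0
+ */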
+#define _mm_i64scatter_pd(__addr,__index, __v1, __scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv2df(__addr, (__mmask8) 0xFF, (__v2di) __index, \
+                              (__v2df) __v1, __scale); })
+
+#define _mm_mask_i64scatter_pd(__addr, __mask, __index, __v1, \
+                               __scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv2df (__addr, __mask, (__v2di) __index, \
+                               (__v2df) __v1, __scale); })
+
+
+#define _mm_i64scatter_epi64(__addr, __index, __v1, __scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv2di (__addr, (__mmask8) 0xFF, \
+        (__v2di) __index, (__v2di) __v1, __scale); })
+
+#define _mm_mask_i64scatter_epi64(__addr, __mask, __index, __v1,\
+                                  __scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv2di (__addr, __mask, (__v2di) __index,\
+        (__v2di) __v1, __scale); })
+
+#define _mm256_i64scatter_pd(__addr, __index, __v1, __scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv4df (__addr, (__mmask8) 0xFF,\
+        (__v4di) __index, (__v4df) __v1, __scale); })
+
+#define _mm256_mask_i64scatter_pd(__addr, __mask, __index, __v1,\
+                                   __scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv4df (__addr, __mask, (__v4di) __index,\
+        (__v4df) __v1, __scale); })
+
+#define _mm256_i64scatter_epi64(__addr, __index, __v1, __scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv4di (__addr, (__mmask8) 0xFF, (__v4di) __index,\
+                               (__v4di) __v1, __scale); })
+
+#define _mm256_mask_i64scatter_epi64(__addr, __mask, __index, __v1,\
+                                      __scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv4di (__addr, __mask, (__v4di) __index,\
+        (__v4di) __v1, __scale); })
+
+#define _mm_i64scatter_ps(__addr, __index, __v1, __scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv4sf (__addr, (__mmask8) 0xFF,\
+        (__v2di) __index, (__v4sf) __v1, __scale); })
+
+#define _mm_mask_i64scatter_ps(__addr, __mask, __index, __v1, \
+                                __scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv4sf (__addr, __mask, (__v2di) __index,\
+        (__v4sf) __v1, __scale); })
+
+#define _mm_i64scatter_epi32(__addr, __index, __v1, \
+                              __scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv4si (__addr, (__mmask8) 0xFF,\
+        (__v2di) __index, (__v4si) __v1, __scale); })
+
+#define _mm_mask_i64scatter_epi32(__addr, __mask, __index, __v1,\
+         __scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv4si (__addr, __mask, (__v2di) __index,\
+        (__v4si) __v1, __scale); })
+
+#define _mm256_i64scatter_ps(__addr, __index, __v1, __scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv8sf (__addr, (__mmask8) 0xFF, (__v4di) __index, \
+                              (__v4sf) __v1, __scale); })
+
+#define _mm256_mask_i64scatter_ps(__addr, __mask, __index, __v1, \
+                                   __scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv8sf (__addr, __mask, (__v4di) __index, \
+        (__v4sf) __v1, __scale); })
+
+#define _mm256_i64scatter_epi32(__addr, __index, __v1, __scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv8si (__addr, (__mmask8) 0xFF, \
+        (__v4di) __index, (__v4si) __v1, __scale); })
+
+#define _mm256_mask_i64scatter_epi32(__addr, __mask, __index, __v1, \
+                                      __scale) __extension__ ({  \
+  __builtin_ia32_scatterdiv8si(__addr, __mask, (__v4di) __index, \
+        (__v4si) __v1, __scale); })
+
+#define _mm_i32scatter_pd(__addr, __index, __v1,         \
+                          __scale) __extension__ ({      \
+  __builtin_ia32_scattersiv2df (__addr, (__mmask8) 0xFF, \
+        (__v4si) __index, (__v2df) __v1, __scale); })
+
+#define _mm_mask_i32scatter_pd(__addr, __mask, __index, __v1,    \
+                                __scale) __extension__ ({        \
+  __builtin_ia32_scattersiv2df (__addr, __mask, (__v4si) __index,\
+         (__v2df) __v1, __scale); })
+
+#define _mm_i32scatter_epi64(__addr, __index, __v1, __scale) __extension__ ({ \
+  __builtin_ia32_scattersiv2di (__addr, (__mmask8) 0xFF,                       \
+        (__v4si) __index, (__v2di) __v1, __scale); })
+
+#define _mm_mask_i32scatter_epi64(__addr, __mask, __index, __v1, \
+         __scale) __extension__ ({                                \
+  __builtin_ia32_scattersiv2di (__addr, __mask, (__v4si) __index, \
+        (__v2di) __v1, __scale); })
+
+#define _mm256_i32scatter_pd(__addr, __index, __v1, __scale) __extension__ ({ \
+  __builtin_ia32_scattersiv4df (__addr, (__mmask8) 0xFF,                      \
+        (__v4si) __index, (__v4df) __v1, __scale); })
+
+#define _mm256_mask_i32scatter_pd(__addr, __mask, __index, __v1, \
+         __scale) __extension__ ({                                \
+  __builtin_ia32_scattersiv4df (__addr, __mask, (__v4si) __index, \
+        (__v4df) __v1, __scale); })
+
+#define _mm256_i32scatter_epi64(__addr, __index, __v1,    \
+                                __scale) __extension__ ({ \
+  __builtin_ia32_scattersiv4di (__addr, (__mmask8) 0xFF,  \
+        (__v4si) __index, (__v4di) __v1, __scale); })
+
+#define _mm256_mask_i32scatter_epi64(__addr, __mask, __index, __v1, \
+            __scale) __extension__ ({                               \
+  __builtin_ia32_scattersiv4di (__addr, __mask, (__v4si) __index,   \
+        (__v4di) __v1, __scale); })
+
+#define _mm_i32scatter_ps(__addr, __index, __v1, __scale) __extension__ ({ \
+  __builtin_ia32_scattersiv4sf (__addr, (__mmask8) 0xFF,                   \
+        (__v4si) __index, (__v4sf) __v1, __scale); })
+
+#define _mm_mask_i32scatter_ps(__addr, __mask, __index, __v1,     \
+                               __scale) __extension__ ({          \
+  __builtin_ia32_scattersiv4sf (__addr, __mask, (__v4si) __index, \
+        (__v4sf) __v1, __scale); })
+
+#define _mm_i32scatter_epi32(__addr, __index, __v1, __scale) __extension__ ({ \
+  __builtin_ia32_scattersiv4si (__addr, (__mmask8) 0xFF,                       \
+        (__v4si) __index, (__v4si) __v1, __scale); })
+
+#define _mm_mask_i32scatter_epi32(__addr, __mask, __index, __v1, \
+                                  __scale) __extension__ ({      \
+  __builtin_ia32_scattersiv4si (__addr, __mask, (__v4si) __index,\
+        (__v4si) __v1, __scale); })
+
+#define _mm256_i32scatter_ps(__addr, __index, __v1, __scale) __extension__ ({ \
+  __builtin_ia32_scattersiv8sf (__addr, (__mmask8) 0xFF,                      \
+        (__v8si) __index, (__v8sf) __v1, __scale); })
+
+#define _mm256_mask_i32scatter_ps(__addr, __mask, __index, __v1, \
+                                   __scale) __extension__ ({     \
+  __builtin_ia32_scattersiv8sf (__addr, __mask, (__v8si) __index,\
+        (__v8sf) __v1, __scale); })
+
+#define _mm256_i32scatter_epi32(__addr, __index, __v1, __scale) __extension__ ({ \
+  __builtin_ia32_scattersiv8si (__addr, (__mmask8) 0xFF,                         \
+        (__v8si) __index, (__v8si) __v1, __scale); })
+
+#define _mm256_mask_i32scatter_epi32(__addr, __mask, __index, __v1, \
+            __scale) __extension__ ({                                \
+  __builtin_ia32_scattersiv8si (__addr, __mask, (__v8si) __index,    \
+        (__v8si) __v1, __scale); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_sqrt_pd (__m128d __W, __mmask8 __U, __m128d __A) {
+  return (__m128d) __builtin_ia32_sqrtpd128_mask ((__v2df) __A,
+              (__v2df) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_sqrt_pd (__mmask8 __U, __m128d __A) {
+  return (__m128d) __builtin_ia32_sqrtpd128_mask ((__v2df) __A,
+              (__v2df)
+              _mm_setzero_pd (),
+              (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_sqrt_pd (__m256d __W, __mmask8 __U, __m256d __A) {
+  return (__m256d) __builtin_ia32_sqrtpd256_mask ((__v4df) __A,
+              (__v4df) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_sqrt_pd (__mmask8 __U, __m256d __A) {
+  return (__m256d) __builtin_ia32_sqrtpd256_mask ((__v4df) __A,
+              (__v4df)
+              _mm256_setzero_pd (),
+              (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_sqrt_ps (__m128 __W, __mmask8 __U, __m128 __A) {
+  return (__m128) __builtin_ia32_sqrtps128_mask ((__v4sf) __A,
+             (__v4sf) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_sqrt_ps (__mmask8 __U, __m128 __A) {
+  return (__m128) __builtin_ia32_sqrtps128_mask ((__v4sf) __A,
+             (__v4sf)
+             _mm_setzero_ps (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_sqrt_ps (__m256 __W, __mmask8 __U, __m256 __A) {
+  return (__m256) __builtin_ia32_sqrtps256_mask ((__v8sf) __A,
+             (__v8sf) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_sqrt_ps (__mmask8 __U, __m256 __A) {
+  return (__m256) __builtin_ia32_sqrtps256_mask ((__v8sf) __A,
+             (__v8sf)
+             _mm256_setzero_ps (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_sub_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_subpd128_mask ((__v2df) __A,
+             (__v2df) __B,
+             (__v2df) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_sub_pd (__mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_subpd128_mask ((__v2df) __A,
+             (__v2df) __B,
+             (__v2df)
+             _mm_setzero_pd (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_sub_pd (__m256d __W, __mmask8 __U, __m256d __A,
+        __m256d __B) {
+  return (__m256d) __builtin_ia32_subpd256_mask ((__v4df) __A,
+             (__v4df) __B,
+             (__v4df) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_sub_pd (__mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d) __builtin_ia32_subpd256_mask ((__v4df) __A,
+             (__v4df) __B,
+             (__v4df)
+             _mm256_setzero_pd (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_sub_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_subps128_mask ((__v4sf) __A,
+            (__v4sf) __B,
+            (__v4sf) __W,
+            (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_sub_ps (__mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_subps128_mask ((__v4sf) __A,
+            (__v4sf) __B,
+            (__v4sf)
+            _mm_setzero_ps (),
+            (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_sub_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256) __builtin_ia32_subps256_mask ((__v8sf) __A,
+            (__v8sf) __B,
+            (__v8sf) __W,
+            (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_sub_ps (__mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256) __builtin_ia32_subps256_mask ((__v8sf) __A,
+            (__v8sf) __B,
+            (__v8sf)
+            _mm256_setzero_ps (),
+            (__mmask8) __U);
+}
+
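+/*
+ * permutex2var is a two-source permute: for each destination lane, the index
+ * vector __I selects one element out of the concatenation of the two data
+ * sources.  In the "mask2" variants the index operand __I also provides the
+ * pass-through values for lanes the mask leaves unselected, whereas the plain
+ * "mask" variants further below pass through the first data operand instead.
+ */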
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask2_permutex2var_epi32 (__m128i __A, __m128i __I, __mmask8 __U,
+            __m128i __B) {
+  return (__m128i) __builtin_ia32_vpermi2vard128_mask ((__v4si) __A,
+                   (__v4si) __I
+                   /* idx */ ,
+                   (__v4si) __B,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask2_permutex2var_epi32 (__m256i __A, __m256i __I,
+         __mmask8 __U, __m256i __B) {
+  return (__m256i) __builtin_ia32_vpermi2vard256_mask ((__v8si) __A,
+                   (__v8si) __I
+                   /* idx */ ,
+                   (__v8si) __B,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask2_permutex2var_pd (__m128d __A, __m128i __I, __mmask8 __U,
+         __m128d __B) {
+  return (__m128d) __builtin_ia32_vpermi2varpd128_mask ((__v2df) __A,
+              (__v2di) __I
+              /* idx */ ,
+              (__v2df) __B,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask2_permutex2var_pd (__m256d __A, __m256i __I, __mmask8 __U,
+            __m256d __B) {
+  return (__m256d) __builtin_ia32_vpermi2varpd256_mask ((__v4df) __A,
+              (__v4di) __I
+              /* idx */ ,
+              (__v4df) __B,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask2_permutex2var_ps (__m128 __A, __m128i __I, __mmask8 __U,
+         __m128 __B) {
+  return (__m128) __builtin_ia32_vpermi2varps128_mask ((__v4sf) __A,
+                   (__v4si) __I
+                   /* idx */ ,
+                   (__v4sf) __B,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask2_permutex2var_ps (__m256 __A, __m256i __I, __mmask8 __U,
+            __m256 __B) {
+  return (__m256) __builtin_ia32_vpermi2varps256_mask ((__v8sf) __A,
+                   (__v8si) __I
+                   /* idx */ ,
+                   (__v8sf) __B,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask2_permutex2var_epi64 (__m128i __A, __m128i __I, __mmask8 __U,
+            __m128i __B) {
+  return (__m128i) __builtin_ia32_vpermi2varq128_mask ((__v2di) __A,
+                   (__v2di) __I
+                   /* idx */ ,
+                   (__v2di) __B,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask2_permutex2var_epi64 (__m256i __A, __m256i __I,
+         __mmask8 __U, __m256i __B) {
+  return (__m256i) __builtin_ia32_vpermi2varq256_mask ((__v4di) __A,
+                   (__v4di) __I
+                   /* idx */ ,
+                   (__v4di) __B,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_permutex2var_epi32 (__m128i __A, __m128i __I, __m128i __B) {
+  return (__m128i) __builtin_ia32_vpermt2vard128_mask ((__v4si) __I
+                   /* idx */ ,
+                   (__v4si) __A,
+                   (__v4si) __B,
+                   (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_permutex2var_epi32 (__m128i __A, __mmask8 __U, __m128i __I,
+           __m128i __B) {
+  return (__m128i) __builtin_ia32_vpermt2vard128_mask ((__v4si) __I
+                   /* idx */ ,
+                   (__v4si) __A,
+                   (__v4si) __B,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_permutex2var_epi32 (__mmask8 __U, __m128i __A, __m128i __I,
+            __m128i __B) {
+  return (__m128i) __builtin_ia32_vpermt2vard128_maskz ((__v4si) __I
+              /* idx */ ,
+              (__v4si) __A,
+              (__v4si) __B,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_permutex2var_epi32 (__m256i __A, __m256i __I, __m256i __B) {
+  return (__m256i) __builtin_ia32_vpermt2vard256_mask ((__v8si) __I
+                   /* idx */ ,
+                   (__v8si) __A,
+                   (__v8si) __B,
+                   (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_permutex2var_epi32 (__m256i __A, __mmask8 __U, __m256i __I,
+        __m256i __B) {
+  return (__m256i) __builtin_ia32_vpermt2vard256_mask ((__v8si) __I
+                   /* idx */ ,
+                   (__v8si) __A,
+                   (__v8si) __B,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_permutex2var_epi32 (__mmask8 __U, __m256i __A,
+         __m256i __I, __m256i __B) {
+  return (__m256i) __builtin_ia32_vpermt2vard256_maskz ((__v8si) __I
+              /* idx */ ,
+              (__v8si) __A,
+              (__v8si) __B,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_permutex2var_pd (__m128d __A, __m128i __I, __m128d __B) {
+  return (__m128d) __builtin_ia32_vpermt2varpd128_mask ((__v2di) __I
+              /* idx */ ,
+              (__v2df) __A,
+              (__v2df) __B,
+              (__mmask8) -1);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_permutex2var_pd (__m128d __A, __mmask8 __U, __m128i __I,
+        __m128d __B) {
+  return (__m128d) __builtin_ia32_vpermt2varpd128_mask ((__v2di) __I
+              /* idx */ ,
+              (__v2df) __A,
+              (__v2df) __B,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_permutex2var_pd (__mmask8 __U, __m128d __A, __m128i __I,
+         __m128d __B) {
+  return (__m128d) __builtin_ia32_vpermt2varpd128_maskz ((__v2di) __I
+               /* idx */ ,
+               (__v2df) __A,
+               (__v2df) __B,
+               (__mmask8)
+               __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_permutex2var_pd (__m256d __A, __m256i __I, __m256d __B) {
+  return (__m256d) __builtin_ia32_vpermt2varpd256_mask ((__v4di) __I
+              /* idx */ ,
+              (__v4df) __A,
+              (__v4df) __B,
+              (__mmask8) -1);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_permutex2var_pd (__m256d __A, __mmask8 __U, __m256i __I,
+           __m256d __B) {
+  return (__m256d) __builtin_ia32_vpermt2varpd256_mask ((__v4di) __I
+              /* idx */ ,
+              (__v4df) __A,
+              (__v4df) __B,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_permutex2var_pd (__mmask8 __U, __m256d __A, __m256i __I,
+            __m256d __B) {
+  return (__m256d) __builtin_ia32_vpermt2varpd256_maskz ((__v4di) __I
+               /* idx */ ,
+               (__v4df) __A,
+               (__v4df) __B,
+               (__mmask8)
+               __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_permutex2var_ps (__m128 __A, __m128i __I, __m128 __B) {
+  return (__m128) __builtin_ia32_vpermt2varps128_mask ((__v4si) __I
+                   /* idx */ ,
+                   (__v4sf) __A,
+                   (__v4sf) __B,
+                   (__mmask8) -1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_permutex2var_ps (__m128 __A, __mmask8 __U, __m128i __I,
+        __m128 __B) {
+  return (__m128) __builtin_ia32_vpermt2varps128_mask ((__v4si) __I
+                   /* idx */ ,
+                   (__v4sf) __A,
+                   (__v4sf) __B,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_permutex2var_ps (__mmask8 __U, __m128 __A, __m128i __I,
+         __m128 __B) {
+  return (__m128) __builtin_ia32_vpermt2varps128_maskz ((__v4si) __I
+              /* idx */ ,
+              (__v4sf) __A,
+              (__v4sf) __B,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_permutex2var_ps (__m256 __A, __m256i __I, __m256 __B) {
+  return (__m256) __builtin_ia32_vpermt2varps256_mask ((__v8si) __I
+                   /* idx */ ,
+                   (__v8sf) __A,
+                   (__v8sf) __B,
+                   (__mmask8) -1);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_permutex2var_ps (__m256 __A, __mmask8 __U, __m256i __I,
+           __m256 __B) {
+  return (__m256) __builtin_ia32_vpermt2varps256_mask ((__v8si) __I
+                   /* idx */ ,
+                   (__v8sf) __A,
+                   (__v8sf) __B,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_permutex2var_ps (__mmask8 __U, __m256 __A, __m256i __I,
+            __m256 __B) {
+  return (__m256) __builtin_ia32_vpermt2varps256_maskz ((__v8si) __I
+              /* idx */ ,
+              (__v8sf) __A,
+              (__v8sf) __B,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_permutex2var_epi64 (__m128i __A, __m128i __I, __m128i __B) {
+  return (__m128i) __builtin_ia32_vpermt2varq128_mask ((__v2di) __I
+                   /* idx */ ,
+                   (__v2di) __A,
+                   (__v2di) __B,
+                   (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_permutex2var_epi64 (__m128i __A, __mmask8 __U, __m128i __I,
+           __m128i __B) {
+  return (__m128i) __builtin_ia32_vpermt2varq128_mask ((__v2di) __I
+                   /* idx */ ,
+                   (__v2di) __A,
+                   (__v2di) __B,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_permutex2var_epi64 (__mmask8 __U, __m128i __A, __m128i __I,
+            __m128i __B) {
+  return (__m128i) __builtin_ia32_vpermt2varq128_maskz ((__v2di) __I
+              /* idx */ ,
+              (__v2di) __A,
+              (__v2di) __B,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_permutex2var_epi64 (__m256i __A, __m256i __I, __m256i __B) {
+  return (__m256i) __builtin_ia32_vpermt2varq256_mask ((__v4di) __I
+                   /* idx */ ,
+                   (__v4di) __A,
+                   (__v4di) __B,
+                   (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_permutex2var_epi64 (__m256i __A, __mmask8 __U, __m256i __I,
+        __m256i __B) {
+  return (__m256i) __builtin_ia32_vpermt2varq256_mask ((__v4di) __I
+                   /* idx */ ,
+                   (__v4di) __A,
+                   (__v4di) __B,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_permutex2var_epi64 (__mmask8 __U, __m256i __A,
+         __m256i __I, __m256i __B) {
+  return (__m256i) __builtin_ia32_vpermt2varq256_maskz ((__v4di) __I
+              /* idx */ ,
+              (__v4di) __A,
+              (__v4di) __B,
+              (__mmask8)
+              __U);
+}
+
+#undef __DEFAULT_FN_ATTRS
+#undef __DEFAULT_FN_ATTRS_BOTH
+
+#endif /* __AVX512VLINTRIN_H */
diff --git a/24.0.3/clang-include/avxintrin.h b/24.0.3/clang-include/avxintrin.h
new file mode 100644
index 0000000..6d1ca54
--- /dev/null
+++ b/24.0.3/clang-include/avxintrin.h
@@ -0,0 +1,1318 @@
+/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVXINTRIN_H
+#define __AVXINTRIN_H
+
+typedef double __v4df __attribute__ ((__vector_size__ (32)));
+typedef float __v8sf __attribute__ ((__vector_size__ (32)));
+typedef long long __v4di __attribute__ ((__vector_size__ (32)));
+typedef int __v8si __attribute__ ((__vector_size__ (32)));
+typedef short __v16hi __attribute__ ((__vector_size__ (32)));
+typedef char __v32qi __attribute__ ((__vector_size__ (32)));
+
+/* We need an explicitly signed variant for char. Note that this shouldn't
+ * appear in the interface though. */
+typedef signed char __v32qs __attribute__((__vector_size__(32)));
+
+typedef float __m256 __attribute__ ((__vector_size__ (32)));
+typedef double __m256d __attribute__((__vector_size__(32)));
+typedef long long __m256i __attribute__((__vector_size__(32)));
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx")))
+
+/* Arithmetic */
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_add_pd(__m256d __a, __m256d __b)
+{
+  return __a+__b;
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_add_ps(__m256 __a, __m256 __b)
+{
+  return __a+__b;
+}
+
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_sub_pd(__m256d __a, __m256d __b)
+{
+  return __a-__b;
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_sub_ps(__m256 __a, __m256 __b)
+{
+  return __a-__b;
+}
+
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_addsub_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b);
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_addsub_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b);
+}
+
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_div_pd(__m256d __a, __m256d __b)
+{
+  return __a / __b;
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_div_ps(__m256 __a, __m256 __b)
+{
+  return __a / __b;
+}
+
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_max_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b);
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_max_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b);
+}
+
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_min_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b);
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_min_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b);
+}
+
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_mul_pd(__m256d __a, __m256d __b)
+{
+  return __a * __b;
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_mul_ps(__m256 __a, __m256 __b)
+{
+  return __a * __b;
+}
+
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_sqrt_pd(__m256d __a)
+{
+  return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_sqrt_ps(__m256 __a)
+{
+  return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_rsqrt_ps(__m256 __a)
+{
+  return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a);
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_rcp_ps(__m256 __a)
+{
+  return (__m256)__builtin_ia32_rcpps256((__v8sf)__a);
+}
+
+#define _mm256_round_pd(V, M) __extension__ ({ \
+    (__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)); })
+
+#define _mm256_round_ps(V, M) __extension__ ({ \
+  (__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)); })
+
+#define _mm256_ceil_pd(V)  _mm256_round_pd((V), _MM_FROUND_CEIL)
+#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)
+#define _mm256_ceil_ps(V)  _mm256_round_ps((V), _MM_FROUND_CEIL)
+#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)
+
+/* Logical */
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_and_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)((__v4di)__a & (__v4di)__b);
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_and_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)((__v8si)__a & (__v8si)__b);
+}
+
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_andnot_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)(~(__v4di)__a & (__v4di)__b);
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_andnot_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)(~(__v8si)__a & (__v8si)__b);
+}
+
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_or_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)((__v4di)__a | (__v4di)__b);
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_or_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)((__v8si)__a | (__v8si)__b);
+}
+
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_xor_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)((__v4di)__a ^ (__v4di)__b);
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_xor_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)((__v8si)__a ^ (__v8si)__b);
+}
+
+/* Horizontal arithmetic */
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_hadd_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b);
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_hadd_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
+}
+
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_hsub_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_hsub_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b);
+}
+
+/* Vector permutations */
+static __inline __m128d __DEFAULT_FN_ATTRS
+_mm_permutevar_pd(__m128d __a, __m128i __c)
+{
+  return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c);
+}
+
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_permutevar_pd(__m256d __a, __m256i __c)
+{
+  return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c);
+}
+
+static __inline __m128 __DEFAULT_FN_ATTRS
+_mm_permutevar_ps(__m128 __a, __m128i __c)
+{
+  return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c);
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_permutevar_ps(__m256 __a, __m256i __c)
+{
+  return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c);
+}
+
+#define _mm_permute_pd(A, C) __extension__ ({ \
+  (__m128d)__builtin_shufflevector((__v2df)(__m128d)(A), \
+                                   (__v2df)_mm_setzero_pd(), \
+                                   (C) & 0x1, ((C) & 0x2) >> 1); })
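+
+/* Illustrative note (added for clarity, not part of the upstream header):
+ * bit 0 of C selects the source element for result lane 0 and bit 1 selects
+ * it for lane 1, so e.g. _mm_permute_pd(a, 1) yields { a[1], a[0] }. */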
+
+#define _mm256_permute_pd(A, C) __extension__ ({ \
+  (__m256d)__builtin_shufflevector((__v4df)(__m256d)(A), \
+                                   (__v4df)_mm256_setzero_pd(), \
+                                   (C) & 0x1, ((C) & 0x2) >> 1, \
+                                   2 + (((C) & 0x4) >> 2), \
+                                   2 + (((C) & 0x8) >> 3)); })
+
+#define _mm_permute_ps(A, C) __extension__ ({ \
+  (__m128)__builtin_shufflevector((__v4sf)(__m128)(A), \
+                                  (__v4sf)_mm_setzero_ps(), \
+                                   (C) & 0x3, ((C) & 0xc) >> 2, \
+                                   ((C) & 0x30) >> 4, ((C) & 0xc0) >> 6); })
+
+#define _mm256_permute_ps(A, C) __extension__ ({ \
+  (__m256)__builtin_shufflevector((__v8sf)(__m256)(A), \
+                                  (__v8sf)_mm256_setzero_ps(), \
+                                  (C) & 0x3, ((C) & 0xc) >> 2, \
+                                  ((C) & 0x30) >> 4, ((C) & 0xc0) >> 6, \
+                                  4 + (((C) & 0x03) >> 0), \
+                                  4 + (((C) & 0x0c) >> 2), \
+                                  4 + (((C) & 0x30) >> 4), \
+                                  4 + (((C) & 0xc0) >> 6)); })
+
+#define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ \
+  (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \
+                                           (__v4df)(__m256d)(V2), (M)); })
+
+#define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \
+  (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
+                                          (__v8sf)(__m256)(V2), (M)); })
+
+#define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \
+  (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
+                                           (__v8si)(__m256i)(V2), (M)); })
+
+/* Vector Blend */
+#define _mm256_blend_pd(V1, V2, M) __extension__ ({ \
+  (__m256d)__builtin_shufflevector((__v4df)(__m256d)(V1), \
+                                   (__v4df)(__m256d)(V2), \
+                                   (((M) & 0x01) ? 4 : 0), \
+                                   (((M) & 0x02) ? 5 : 1), \
+                                   (((M) & 0x04) ? 6 : 2), \
+                                   (((M) & 0x08) ? 7 : 3)); })
+
+#define _mm256_blend_ps(V1, V2, M) __extension__ ({ \
+  (__m256)__builtin_shufflevector((__v8sf)(__m256)(V1), \
+                                  (__v8sf)(__m256)(V2), \
+                                  (((M) & 0x01) ?  8 : 0), \
+                                  (((M) & 0x02) ?  9 : 1), \
+                                  (((M) & 0x04) ? 10 : 2), \
+                                  (((M) & 0x08) ? 11 : 3), \
+                                  (((M) & 0x10) ? 12 : 4), \
+                                  (((M) & 0x20) ? 13 : 5), \
+                                  (((M) & 0x40) ? 14 : 6), \
+                                  (((M) & 0x80) ? 15 : 7)); })
+
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
+{
+  return (__m256d)__builtin_ia32_blendvpd256(
+    (__v4df)__a, (__v4df)__b, (__v4df)__c);
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
+{
+  return (__m256)__builtin_ia32_blendvps256(
+    (__v8sf)__a, (__v8sf)__b, (__v8sf)__c);
+}
+
+/* Vector Dot Product */
+#define _mm256_dp_ps(V1, V2, M) __extension__ ({ \
+  (__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \
+                                 (__v8sf)(__m256)(V2), (M)); })
+
+/* Vector shuffle */
+#define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \
+        (__m256)__builtin_shufflevector((__v8sf)(__m256)(a), \
+                                        (__v8sf)(__m256)(b), \
+                                        (mask) & 0x3, \
+                                        ((mask) & 0xc) >> 2, \
+                                        (((mask) & 0x30) >> 4) + 8, \
+                                        (((mask) & 0xc0) >> 6) + 8, \
+                                        ((mask) & 0x3) + 4, \
+                                        (((mask) & 0xc) >> 2) + 4, \
+                                        (((mask) & 0x30) >> 4) + 12, \
+                                        (((mask) & 0xc0) >> 6) + 12); })
+
+#define _mm256_shuffle_pd(a, b, mask) __extension__ ({ \
+        (__m256d)__builtin_shufflevector((__v4df)(__m256d)(a), \
+                                         (__v4df)(__m256d)(b), \
+                                         (mask) & 0x1, \
+                                         (((mask) & 0x2) >> 1) + 4, \
+                                         (((mask) & 0x4) >> 2) + 2, \
+                                         (((mask) & 0x8) >> 3) + 6); })
+
+/* Compare */
+#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
+#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling)  */
+#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling)  */
+#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling)  */
+#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling)  */
+#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling)  */
+#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling)  */
+#define _CMP_ORD_Q    0x07 /* Ordered (non-signaling)  */
+#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling)  */
+#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unord, signaling)  */
+#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling)  */
+#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling)  */
+#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling)  */
+#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling)  */
+#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling)  */
+#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling)  */
+#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling)  */
+#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling)  */
+#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling)  */
+#define _CMP_UNORD_S  0x13 /* Unordered (signaling)  */
+#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling)  */
+#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling)  */
+#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unord, non-signaling)  */
+#define _CMP_ORD_S    0x17 /* Ordered (signaling)  */
+#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling)  */
+#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unord, non-sign)  */
+#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling)  */
+#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling)  */
+#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling)  */
+#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling)  */
+#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling)  */
+#define _CMP_TRUE_US  0x1f /* True (unordered, signaling)  */
+
+#define _mm_cmp_pd(a, b, c) __extension__ ({ \
+  (__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \
+                                (__v2df)(__m128d)(b), (c)); })
+
+#define _mm_cmp_ps(a, b, c) __extension__ ({ \
+  (__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \
+                               (__v4sf)(__m128)(b), (c)); })
+
+#define _mm256_cmp_pd(a, b, c) __extension__ ({ \
+  (__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
+                                   (__v4df)(__m256d)(b), (c)); })
+
+#define _mm256_cmp_ps(a, b, c) __extension__ ({ \
+  (__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
+                                  (__v8sf)(__m256)(b), (c)); })
+
+#define _mm_cmp_sd(a, b, c) __extension__ ({ \
+  (__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \
+                                (__v2df)(__m128d)(b), (c)); })
+
+#define _mm_cmp_ss(a, b, c) __extension__ ({ \
+  (__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \
+                               (__v4sf)(__m128)(b), (c)); })
+
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_extract_epi32(__m256i __a, const int __imm)
+{
+  __v8si __b = (__v8si)__a;
+  return __b[__imm & 7];
+}
+
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_extract_epi16(__m256i __a, const int __imm)
+{
+  __v16hi __b = (__v16hi)__a;
+  return __b[__imm & 15];
+}
+
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_extract_epi8(__m256i __a, const int __imm)
+{
+  __v32qi __b = (__v32qi)__a;
+  return __b[__imm & 31];
+}
+
+#ifdef __x86_64__
+static __inline long long  __DEFAULT_FN_ATTRS
+_mm256_extract_epi64(__m256i __a, const int __imm)
+{
+  __v4di __b = (__v4di)__a;
+  return __b[__imm & 3];
+}
+#endif
+
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_insert_epi32(__m256i __a, int __b, int const __imm)
+{
+  __v8si __c = (__v8si)__a;
+  __c[__imm & 7] = __b;
+  return (__m256i)__c;
+}
+
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_insert_epi16(__m256i __a, int __b, int const __imm)
+{
+  __v16hi __c = (__v16hi)__a;
+  __c[__imm & 15] = __b;
+  return (__m256i)__c;
+}
+
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_insert_epi8(__m256i __a, int __b, int const __imm)
+{
+  __v32qi __c = (__v32qi)__a;
+  __c[__imm & 31] = __b;
+  return (__m256i)__c;
+}
+
+#ifdef __x86_64__
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_insert_epi64(__m256i __a, long long __b, int const __imm)
+{
+  __v4di __c = (__v4di)__a;
+  __c[__imm & 3] = __b;
+  return (__m256i)__c;
+}
+#endif
+
+/* Conversion */
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_cvtepi32_pd(__m128i __a)
+{
+  return (__m256d)__builtin_ia32_cvtdq2pd256((__v4si) __a);
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_cvtepi32_ps(__m256i __a)
+{
+  return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) __a);
+}
+
+static __inline __m128 __DEFAULT_FN_ATTRS
+_mm256_cvtpd_ps(__m256d __a)
+{
+  return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a);
+}
+
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtps_epi32(__m256 __a)
+{
+  return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a);
+}
+
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_cvtps_pd(__m128 __a)
+{
+  return (__m256d)__builtin_ia32_cvtps2pd256((__v4sf) __a);
+}
+
+static __inline __m128i __DEFAULT_FN_ATTRS
+_mm256_cvttpd_epi32(__m256d __a)
+{
+  return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a);
+}
+
+static __inline __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtpd_epi32(__m256d __a)
+{
+  return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);
+}
+
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_cvttps_epi32(__m256 __a)
+{
+  return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a);
+}
+
+/* Vector replicate */
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_movehdup_ps(__m256 __a)
+{
+  return __builtin_shufflevector(__a, __a, 1, 1, 3, 3, 5, 5, 7, 7);
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_moveldup_ps(__m256 __a)
+{
+  return __builtin_shufflevector(__a, __a, 0, 0, 2, 2, 4, 4, 6, 6);
+}
+
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_movedup_pd(__m256d __a)
+{
+  return __builtin_shufflevector(__a, __a, 0, 0, 2, 2);
+}
+
+/* Unpack and Interleave */
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_unpackhi_pd(__m256d __a, __m256d __b)
+{
+  return __builtin_shufflevector(__a, __b, 1, 5, 1+2, 5+2);
+}
+
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_unpacklo_pd(__m256d __a, __m256d __b)
+{
+  return __builtin_shufflevector(__a, __b, 0, 4, 0+2, 4+2);
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_unpackhi_ps(__m256 __a, __m256 __b)
+{
+  return __builtin_shufflevector(__a, __b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_unpacklo_ps(__m256 __a, __m256 __b)
+{
+  return __builtin_shufflevector(__a, __b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
+}
+
+/* Bit Test */
+static __inline int __DEFAULT_FN_ATTRS
+_mm_testz_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b);
+}
+
+static __inline int __DEFAULT_FN_ATTRS
+_mm_testc_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b);
+}
+
+static __inline int __DEFAULT_FN_ATTRS
+_mm_testnzc_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b);
+}
+
+static __inline int __DEFAULT_FN_ATTRS
+_mm_testz_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b);
+}
+
+static __inline int __DEFAULT_FN_ATTRS
+_mm_testc_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b);
+}
+
+static __inline int __DEFAULT_FN_ATTRS
+_mm_testnzc_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b);
+}
+
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_testz_pd(__m256d __a, __m256d __b)
+{
+  return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b);
+}
+
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_testc_pd(__m256d __a, __m256d __b)
+{
+  return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b);
+}
+
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_testnzc_pd(__m256d __a, __m256d __b)
+{
+  return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b);
+}
+
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_testz_ps(__m256 __a, __m256 __b)
+{
+  return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b);
+}
+
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_testc_ps(__m256 __a, __m256 __b)
+{
+  return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b);
+}
+
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_testnzc_ps(__m256 __a, __m256 __b)
+{
+  return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b);
+}
+
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_testz_si256(__m256i __a, __m256i __b)
+{
+  return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b);
+}
+
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_testc_si256(__m256i __a, __m256i __b)
+{
+  return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b);
+}
+
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_testnzc_si256(__m256i __a, __m256i __b)
+{
+  return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b);
+}
+
+/* Vector extract sign mask */
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_movemask_pd(__m256d __a)
+{
+  return __builtin_ia32_movmskpd256((__v4df)__a);
+}
+
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_movemask_ps(__m256 __a)
+{
+  return __builtin_ia32_movmskps256((__v8sf)__a);
+}
+
+/* Vector zero */
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_zeroall(void)
+{
+  __builtin_ia32_vzeroall();
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_zeroupper(void)
+{
+  __builtin_ia32_vzeroupper();
+}
+
+/* Vector load with broadcast */
+static __inline __m128 __DEFAULT_FN_ATTRS
+_mm_broadcast_ss(float const *__a)
+{
+  float __f = *__a;
+  return (__m128)(__v4sf){ __f, __f, __f, __f };
+}
+
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_broadcast_sd(double const *__a)
+{
+  double __d = *__a;
+  return (__m256d)(__v4df){ __d, __d, __d, __d };
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_broadcast_ss(float const *__a)
+{
+  float __f = *__a;
+  return (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };
+}
+
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_broadcast_pd(__m128d const *__a)
+{
+  return (__m256d)__builtin_ia32_vbroadcastf128_pd256(__a);
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_broadcast_ps(__m128 const *__a)
+{
+  return (__m256)__builtin_ia32_vbroadcastf128_ps256(__a);
+}
+
+/* SIMD load ops */
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_load_pd(double const *__p)
+{
+  return *(__m256d *)__p;
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_load_ps(float const *__p)
+{
+  return *(__m256 *)__p;
+}
+
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_loadu_pd(double const *__p)
+{
+  struct __loadu_pd {
+    __m256d __v;
+  } __attribute__((__packed__, __may_alias__));
+  return ((struct __loadu_pd*)__p)->__v;
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_loadu_ps(float const *__p)
+{
+  struct __loadu_ps {
+    __m256 __v;
+  } __attribute__((__packed__, __may_alias__));
+  return ((struct __loadu_ps*)__p)->__v;
+}
+
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_load_si256(__m256i const *__p)
+{
+  return *__p;
+}
+
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_loadu_si256(__m256i const *__p)
+{
+  struct __loadu_si256 {
+    __m256i __v;
+  } __attribute__((__packed__, __may_alias__));
+  return ((struct __loadu_si256*)__p)->__v;
+}
+
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_lddqu_si256(__m256i const *__p)
+{
+  return (__m256i)__builtin_ia32_lddqu256((char const *)__p);
+}
+
+/* SIMD store ops */
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_store_pd(double *__p, __m256d __a)
+{
+  *(__m256d *)__p = __a;
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_store_ps(float *__p, __m256 __a)
+{
+  *(__m256 *)__p = __a;
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_storeu_pd(double *__p, __m256d __a)
+{
+  __builtin_ia32_storeupd256(__p, (__v4df)__a);
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_storeu_ps(float *__p, __m256 __a)
+{
+  __builtin_ia32_storeups256(__p, (__v8sf)__a);
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_store_si256(__m256i *__p, __m256i __a)
+{
+  *__p = __a;
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_storeu_si256(__m256i *__p, __m256i __a)
+{
+  __builtin_ia32_storedqu256((char *)__p, (__v32qi)__a);
+}
+
+/* Conditional load ops */
+static __inline __m128d __DEFAULT_FN_ATTRS
+_mm_maskload_pd(double const *__p, __m128i __m)
+{
+  return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m);
+}
+
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_maskload_pd(double const *__p, __m256i __m)
+{
+  return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p,
+                                               (__v4di)__m);
+}
+
+static __inline __m128 __DEFAULT_FN_ATTRS
+_mm_maskload_ps(float const *__p, __m128i __m)
+{
+  return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m);
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_maskload_ps(float const *__p, __m256i __m)
+{
+  return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8si)__m);
+}
+
+/* Conditional store ops */
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
+{
+  __builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a);
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
+{
+  __builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a);
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
+{
+  __builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a);
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
+{
+  __builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a);
+}
+
+/* Cacheability support ops */
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_stream_si256(__m256i *__a, __m256i __b)
+{
+  __builtin_ia32_movntdq256((__v4di *)__a, (__v4di)__b);
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_stream_pd(double *__a, __m256d __b)
+{
+  __builtin_ia32_movntpd256(__a, (__v4df)__b);
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_stream_ps(float *__p, __m256 __a)
+{
+  __builtin_ia32_movntps256(__p, (__v8sf)__a);
+}
+
+/* Create vectors */
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_undefined_pd()
+{
+  return (__m256d)__builtin_ia32_undef256();
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_undefined_ps()
+{
+  return (__m256)__builtin_ia32_undef256();
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_undefined_si256()
+{
+  return (__m256i)__builtin_ia32_undef256();
+}
+
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_set_pd(double __a, double __b, double __c, double __d)
+{
+  return (__m256d){ __d, __c, __b, __a };
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_set_ps(float __a, float __b, float __c, float __d,
+              float __e, float __f, float __g, float __h)
+{
+  return (__m256){ __h, __g, __f, __e, __d, __c, __b, __a };
+}
+
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
+                 int __i4, int __i5, int __i6, int __i7)
+{
+  return (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 };
+}
+
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
+                 short __w11, short __w10, short __w09, short __w08,
+                 short __w07, short __w06, short __w05, short __w04,
+                 short __w03, short __w02, short __w01, short __w00)
+{
+  return (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06,
+    __w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 };
+}
+
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
+                char __b27, char __b26, char __b25, char __b24,
+                char __b23, char __b22, char __b21, char __b20,
+                char __b19, char __b18, char __b17, char __b16,
+                char __b15, char __b14, char __b13, char __b12,
+                char __b11, char __b10, char __b09, char __b08,
+                char __b07, char __b06, char __b05, char __b04,
+                char __b03, char __b02, char __b01, char __b00)
+{
+  return (__m256i)(__v32qi){
+    __b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
+    __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
+    __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
+    __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31
+  };
+}
+
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
+{
+  return (__m256i)(__v4di){ __d, __c, __b, __a };
+}
+
+/* Create vectors with elements in reverse order */
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_setr_pd(double __a, double __b, double __c, double __d)
+{
+  return (__m256d){ __a, __b, __c, __d };
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_setr_ps(float __a, float __b, float __c, float __d,
+               float __e, float __f, float __g, float __h)
+{
+  return (__m256){ __a, __b, __c, __d, __e, __f, __g, __h };
+}
+
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
+                  int __i4, int __i5, int __i6, int __i7)
+{
+  return (__m256i)(__v8si){ __i0, __i1, __i2, __i3, __i4, __i5, __i6, __i7 };
+}
+
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
+       short __w11, short __w10, short __w09, short __w08,
+       short __w07, short __w06, short __w05, short __w04,
+       short __w03, short __w02, short __w01, short __w00)
+{
+  return (__m256i)(__v16hi){ __w15, __w14, __w13, __w12, __w11, __w10, __w09,
+    __w08, __w07, __w06, __w05, __w04, __w03, __w02, __w01, __w00 };
+}
+
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
+                 char __b27, char __b26, char __b25, char __b24,
+                 char __b23, char __b22, char __b21, char __b20,
+                 char __b19, char __b18, char __b17, char __b16,
+                 char __b15, char __b14, char __b13, char __b12,
+                 char __b11, char __b10, char __b09, char __b08,
+                 char __b07, char __b06, char __b05, char __b04,
+                 char __b03, char __b02, char __b01, char __b00)
+{
+  return (__m256i)(__v32qi){
+    __b31, __b30, __b29, __b28, __b27, __b26, __b25, __b24,
+    __b23, __b22, __b21, __b20, __b19, __b18, __b17, __b16,
+    __b15, __b14, __b13, __b12, __b11, __b10, __b09, __b08,
+    __b07, __b06, __b05, __b04, __b03, __b02, __b01, __b00 };
+}
+
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
+{
+  return (__m256i)(__v4di){ __a, __b, __c, __d };
+}
+
+/* Create vectors with repeated elements */
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_set1_pd(double __w)
+{
+  return (__m256d){ __w, __w, __w, __w };
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_set1_ps(float __w)
+{
+  return (__m256){ __w, __w, __w, __w, __w, __w, __w, __w };
+}
+
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_set1_epi32(int __i)
+{
+  return (__m256i)(__v8si){ __i, __i, __i, __i, __i, __i, __i, __i };
+}
+
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_set1_epi16(short __w)
+{
+  return (__m256i)(__v16hi){ __w, __w, __w, __w, __w, __w, __w, __w, __w, __w,
+    __w, __w, __w, __w, __w, __w };
+}
+
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_set1_epi8(char __b)
+{
+  return (__m256i)(__v32qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
+    __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
+    __b, __b, __b, __b, __b, __b, __b };
+}
+
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_set1_epi64x(long long __q)
+{
+  return (__m256i)(__v4di){ __q, __q, __q, __q };
+}
+
+/* Create zeroed vectors */
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_setzero_pd(void)
+{
+  return (__m256d){ 0, 0, 0, 0 };
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_setzero_ps(void)
+{
+  return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
+}
+
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_setzero_si256(void)
+{
+  return (__m256i){ 0LL, 0LL, 0LL, 0LL };
+}
+
+/* Cast between vector types */
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_castpd_ps(__m256d __a)
+{
+  return (__m256)__a;
+}
+
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_castpd_si256(__m256d __a)
+{
+  return (__m256i)__a;
+}
+
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_castps_pd(__m256 __a)
+{
+  return (__m256d)__a;
+}
+
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_castps_si256(__m256 __a)
+{
+  return (__m256i)__a;
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_castsi256_ps(__m256i __a)
+{
+  return (__m256)__a;
+}
+
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_castsi256_pd(__m256i __a)
+{
+  return (__m256d)__a;
+}
+
+static __inline __m128d __DEFAULT_FN_ATTRS
+_mm256_castpd256_pd128(__m256d __a)
+{
+  return __builtin_shufflevector(__a, __a, 0, 1);
+}
+
+static __inline __m128 __DEFAULT_FN_ATTRS
+_mm256_castps256_ps128(__m256 __a)
+{
+  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3);
+}
+
+static __inline __m128i __DEFAULT_FN_ATTRS
+_mm256_castsi256_si128(__m256i __a)
+{
+  return __builtin_shufflevector(__a, __a, 0, 1);
+}
+
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_castpd128_pd256(__m128d __a)
+{
+  return __builtin_shufflevector(__a, __a, 0, 1, -1, -1);
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_castps128_ps256(__m128 __a)
+{
+  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, -1, -1, -1, -1);
+}
+
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_castsi128_si256(__m128i __a)
+{
+  return __builtin_shufflevector(__a, __a, 0, 1, -1, -1);
+}
+
+/*
+   Vector insert.
+   We use macros rather than inlines because we only want to accept
+   invocations where the immediate M is a constant expression.
+*/
+#define _mm256_insertf128_ps(V1, V2, M) __extension__ ({ \
+  (__m256)__builtin_shufflevector( \
+    (__v8sf)(__m256)(V1), \
+    (__v8sf)_mm256_castps128_ps256((__m128)(V2)), \
+    (((M) & 1) ?  0 :  8), \
+    (((M) & 1) ?  1 :  9), \
+    (((M) & 1) ?  2 : 10), \
+    (((M) & 1) ?  3 : 11), \
+    (((M) & 1) ?  8 :  4), \
+    (((M) & 1) ?  9 :  5), \
+    (((M) & 1) ? 10 :  6), \
+    (((M) & 1) ? 11 :  7) );})
+
+#define _mm256_insertf128_pd(V1, V2, M) __extension__ ({ \
+  (__m256d)__builtin_shufflevector( \
+    (__v4df)(__m256d)(V1), \
+    (__v4df)_mm256_castpd128_pd256((__m128d)(V2)), \
+    (((M) & 1) ? 0 : 4), \
+    (((M) & 1) ? 1 : 5), \
+    (((M) & 1) ? 4 : 2), \
+    (((M) & 1) ? 5 : 3) );})
+
+#define _mm256_insertf128_si256(V1, V2, M) __extension__ ({ \
+  (__m256i)__builtin_shufflevector( \
+    (__v4di)(__m256i)(V1), \
+    (__v4di)_mm256_castsi128_si256((__m128i)(V2)), \
+    (((M) & 1) ? 0 : 4), \
+    (((M) & 1) ? 1 : 5), \
+    (((M) & 1) ? 4 : 2), \
+    (((M) & 1) ? 5 : 3) );})
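+
+/* Illustrative usage (added note, not from the upstream header; lo128 and
+ * hi128 are assumed __m128 locals): because these are macros, the lane
+ * selector M must be an integer constant expression, e.g.
+ *
+ *   __m256 v256 = _mm256_insertf128_ps(_mm256_setzero_ps(), lo128, 0);
+ *   v256        = _mm256_insertf128_ps(v256, hi128, 1);
+ */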
+
+/*
+   Vector extract.
+   We use macros rather than inlines because we only want to accept
+   invocations where the immediate M is a constant expression.
+*/
+#define _mm256_extractf128_ps(V, M) __extension__ ({ \
+  (__m128)__builtin_shufflevector( \
+    (__v8sf)(__m256)(V), \
+    (__v8sf)(_mm256_setzero_ps()), \
+    (((M) & 1) ? 4 : 0), \
+    (((M) & 1) ? 5 : 1), \
+    (((M) & 1) ? 6 : 2), \
+    (((M) & 1) ? 7 : 3) );})
+
+#define _mm256_extractf128_pd(V, M) __extension__ ({ \
+  (__m128d)__builtin_shufflevector( \
+    (__v4df)(__m256d)(V), \
+    (__v4df)(_mm256_setzero_pd()), \
+    (((M) & 1) ? 2 : 0), \
+    (((M) & 1) ? 3 : 1) );})
+
+#define _mm256_extractf128_si256(V, M) __extension__ ({ \
+  (__m128i)__builtin_shufflevector( \
+    (__v4di)(__m256i)(V), \
+    (__v4di)(_mm256_setzero_si256()), \
+    (((M) & 1) ? 2 : 0), \
+    (((M) & 1) ? 3 : 1) );})
+
+/* SIMD load ops (unaligned) */
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
+{
+  struct __loadu_ps {
+    __m128 __v;
+  } __attribute__((__packed__, __may_alias__));
+
+  __m256 __v256 = _mm256_castps128_ps256(((struct __loadu_ps*)__addr_lo)->__v);
+  return _mm256_insertf128_ps(__v256, ((struct __loadu_ps*)__addr_hi)->__v, 1);
+}
+
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
+{
+  struct __loadu_pd {
+    __m128d __v;
+  } __attribute__((__packed__, __may_alias__));
+
+  __m256d __v256 = _mm256_castpd128_pd256(((struct __loadu_pd*)__addr_lo)->__v);
+  return _mm256_insertf128_pd(__v256, ((struct __loadu_pd*)__addr_hi)->__v, 1);
+}
+
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_loadu2_m128i(__m128i const *__addr_hi, __m128i const *__addr_lo)
+{
+  struct __loadu_si128 {
+    __m128i __v;
+  } __attribute__((__packed__, __may_alias__));
+  __m256i __v256 = _mm256_castsi128_si256(
+    ((struct __loadu_si128*)__addr_lo)->__v);
+  return _mm256_insertf128_si256(__v256,
+                                 ((struct __loadu_si128*)__addr_hi)->__v, 1);
+}
+
+/* SIMD store ops (unaligned) */
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
+{
+  __m128 __v128;
+
+  __v128 = _mm256_castps256_ps128(__a);
+  __builtin_ia32_storeups(__addr_lo, __v128);
+  __v128 = _mm256_extractf128_ps(__a, 1);
+  __builtin_ia32_storeups(__addr_hi, __v128);
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
+{
+  __m128d __v128;
+
+  __v128 = _mm256_castpd256_pd128(__a);
+  __builtin_ia32_storeupd(__addr_lo, __v128);
+  __v128 = _mm256_extractf128_pd(__a, 1);
+  __builtin_ia32_storeupd(__addr_hi, __v128);
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_storeu2_m128i(__m128i *__addr_hi, __m128i *__addr_lo, __m256i __a)
+{
+  __m128i __v128;
+
+  __v128 = _mm256_castsi256_si128(__a);
+  __builtin_ia32_storedqu((char *)__addr_lo, (__v16qi)__v128);
+  __v128 = _mm256_extractf128_si256(__a, 1);
+  __builtin_ia32_storedqu((char *)__addr_hi, (__v16qi)__v128);
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_set_m128 (__m128 __hi, __m128 __lo) {
+  return (__m256) __builtin_shufflevector(__lo, __hi, 0, 1, 2, 3, 4, 5, 6, 7);
+}
+
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_set_m128d (__m128d __hi, __m128d __lo) {
+  return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo);
+}
+
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_set_m128i (__m128i __hi, __m128i __lo) {
+  return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo);
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_setr_m128 (__m128 __lo, __m128 __hi) {
+  return _mm256_set_m128(__hi, __lo);
+}
+
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_setr_m128d (__m128d __lo, __m128d __hi) {
+  return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo);
+}
+
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_setr_m128i (__m128i __lo, __m128i __hi) {
+  return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __AVXINTRIN_H */
diff --git a/24.0.3/clang-include/bmi2intrin.h b/24.0.3/clang-include/bmi2intrin.h
new file mode 100644
index 0000000..fdae82c
--- /dev/null
+++ b/24.0.3/clang-include/bmi2intrin.h
@@ -0,0 +1,95 @@
+/*===---- bmi2intrin.h - BMI2 intrinsics -----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
+#error "Never use <bmi2intrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __BMI2INTRIN_H
+#define __BMI2INTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("bmi2")))
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_bzhi_u32(unsigned int __X, unsigned int __Y)
+{
+  return __builtin_ia32_bzhi_si(__X, __Y);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_pdep_u32(unsigned int __X, unsigned int __Y)
+{
+  return __builtin_ia32_pdep_si(__X, __Y);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_pext_u32(unsigned int __X, unsigned int __Y)
+{
+  return __builtin_ia32_pext_si(__X, __Y);
+}
+
+#ifdef  __x86_64__
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_bzhi_u64(unsigned long long __X, unsigned long long __Y)
+{
+  return __builtin_ia32_bzhi_di(__X, __Y);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_pdep_u64(unsigned long long __X, unsigned long long __Y)
+{
+  return __builtin_ia32_pdep_di(__X, __Y);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_pext_u64(unsigned long long __X, unsigned long long __Y)
+{
+  return __builtin_ia32_pext_di(__X, __Y);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_mulx_u64 (unsigned long long __X, unsigned long long __Y,
+	   unsigned long long *__P)
+{
+  unsigned __int128 __res = (unsigned __int128) __X * __Y;
+  *__P = (unsigned long long) (__res >> 64);
+  return (unsigned long long) __res;
+}
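+
+/* Illustrative usage (added note, not from the upstream header; a and b are
+ * assumed unsigned long long operands): the full 128-bit product is split so
+ * that
+ *   unsigned long long hi;
+ *   unsigned long long lo = _mulx_u64(a, b, &hi);
+ * satisfies (((unsigned __int128)hi << 64) | lo) == (unsigned __int128)a * b.
+ */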
+
+#else /* !__x86_64__ */
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_mulx_u32 (unsigned int __X, unsigned int __Y, unsigned int *__P)
+{
+  unsigned long long __res = (unsigned long long) __X * __Y;
+  *__P = (unsigned int) (__res >> 32);
+  return (unsigned int) __res;
+}
+
+#endif /* !__x86_64__  */
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __BMI2INTRIN_H */
diff --git a/24.0.3/clang-include/bmiintrin.h b/24.0.3/clang-include/bmiintrin.h
new file mode 100644
index 0000000..da98792
--- /dev/null
+++ b/24.0.3/clang-include/bmiintrin.h
@@ -0,0 +1,155 @@
+/*===---- bmiintrin.h - BMI intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
+#error "Never use <bmiintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __BMIINTRIN_H
+#define __BMIINTRIN_H
+
+#define _tzcnt_u16(a)     (__tzcnt_u16((a)))
+#define _andn_u32(a, b)   (__andn_u32((a), (b)))
+/* _bextr_u32 != __bextr_u32 */
+#define _blsi_u32(a)      (__blsi_u32((a)))
+#define _blsmsk_u32(a)    (__blsmsk_u32((a)))
+#define _blsr_u32(a)      (__blsr_u32((a)))
+#define _tzcnt_u32(a)     (__tzcnt_u32((a)))
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("bmi")))
+
+/* Allow using the tzcnt intrinsics even for non-BMI targets. Since the TZCNT
+   instruction behaves as BSF on non-BMI targets, there is code that expects
+   to use it as a potentially faster version of BSF. */
+#define __RELAXED_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
+
+static __inline__ unsigned short __RELAXED_FN_ATTRS
+__tzcnt_u16(unsigned short __X)
+{
+  return __X ? __builtin_ctzs(__X) : 16;
+}
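+
+/* Illustrative behavior (added note, not from the upstream header): unlike a
+ * bare BSF, these tzcnt helpers are fully defined for a zero input, e.g.
+ *   __tzcnt_u16(0x8) == 3   and   __tzcnt_u16(0) == 16
+ */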
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__andn_u32(unsigned int __X, unsigned int __Y)
+{
+  return ~__X & __Y;
+}
+
+/* AMD-specified, double-leading-underscore version of BEXTR */
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__bextr_u32(unsigned int __X, unsigned int __Y)
+{
+  return __builtin_ia32_bextr_u32(__X, __Y);
+}
+
+/* Intel-specified, single-leading-underscore version of BEXTR */
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_bextr_u32(unsigned int __X, unsigned int __Y, unsigned int __Z)
+{
+  return __builtin_ia32_bextr_u32 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
+}
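+
+/* Illustrative relation between the two spellings (added note, not from the
+ * upstream header): _bextr_u32(x, 4, 8) extracts 8 bits starting at bit 4,
+ * which equals __bextr_u32(x, 4 | (8 << 8)) and (x >> 4) & 0xff. */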
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__blsi_u32(unsigned int __X)
+{
+  return __X & -__X;
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__blsmsk_u32(unsigned int __X)
+{
+  return __X ^ (__X - 1);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__blsr_u32(unsigned int __X)
+{
+  return __X & (__X - 1);
+}
+
+static __inline__ unsigned int __RELAXED_FN_ATTRS
+__tzcnt_u32(unsigned int __X)
+{
+  return __X ? __builtin_ctz(__X) : 32;
+}
+
+#ifdef __x86_64__
+
+#define _andn_u64(a, b)   (__andn_u64((a), (b)))
+/* _bextr_u64 != __bextr_u64 */
+#define _blsi_u64(a)      (__blsi_u64((a)))
+#define _blsmsk_u64(a)    (__blsmsk_u64((a)))
+#define _blsr_u64(a)      (__blsr_u64((a)))
+#define _tzcnt_u64(a)     (__tzcnt_u64((a)))
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__andn_u64 (unsigned long long __X, unsigned long long __Y)
+{
+  return ~__X & __Y;
+}
+
+/* AMD-specified, double-leading-underscore version of BEXTR */
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__bextr_u64(unsigned long long __X, unsigned long long __Y)
+{
+  return __builtin_ia32_bextr_u64(__X, __Y);
+}
+
+/* Intel-specified, single-leading-underscore version of BEXTR */
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_bextr_u64(unsigned long long __X, unsigned int __Y, unsigned int __Z)
+{
+  return __builtin_ia32_bextr_u64 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__blsi_u64(unsigned long long __X)
+{
+  return __X & -__X;
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__blsmsk_u64(unsigned long long __X)
+{
+  return __X ^ (__X - 1);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__blsr_u64(unsigned long long __X)
+{
+  return __X & (__X - 1);
+}
+
+static __inline__ unsigned long long __RELAXED_FN_ATTRS
+__tzcnt_u64(unsigned long long __X)
+{
+  return __X ? __builtin_ctzll(__X) : 64;
+}
+
+#endif /* __x86_64__ */
+
+#undef __DEFAULT_FN_ATTRS
+#undef __RELAXED_FN_ATTRS
+
+#endif /* __BMIINTRIN_H */
diff --git a/24.0.3/clang-include/cpuid.h b/24.0.3/clang-include/cpuid.h
new file mode 100644
index 0000000..5da02e0
--- /dev/null
+++ b/24.0.3/clang-include/cpuid.h
@@ -0,0 +1,209 @@
+/*===---- cpuid.h - X86 cpu model detection --------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !(__x86_64__ || __i386__)
+#error this header is for x86 only
+#endif
+
+/* Responses to an identification request made with %eax == 0 */
+/* AMD:     "AuthenticAMD" */
+#define signature_AMD_ebx 0x68747541
+#define signature_AMD_edx 0x69746e65
+#define signature_AMD_ecx 0x444d4163
+/* CENTAUR: "CentaurHauls" */
+#define signature_CENTAUR_ebx 0x746e6543
+#define signature_CENTAUR_edx 0x48727561
+#define signature_CENTAUR_ecx 0x736c7561
+/* CYRIX:   "CyrixInstead" */
+#define signature_CYRIX_ebx 0x69727943
+#define signature_CYRIX_edx 0x736e4978
+#define signature_CYRIX_ecx 0x64616574
+/* INTEL:   "GenuineIntel" */
+#define signature_INTEL_ebx 0x756e6547
+#define signature_INTEL_edx 0x49656e69
+#define signature_INTEL_ecx 0x6c65746e
+/* TM1:     "TransmetaCPU" */
+#define signature_TM1_ebx 0x6e617254
+#define signature_TM1_edx 0x74656d73
+#define signature_TM1_ecx 0x55504361
+/* TM2:     "GenuineTMx86" */
+#define signature_TM2_ebx 0x756e6547
+#define signature_TM2_edx 0x54656e69
+#define signature_TM2_ecx 0x3638784d
+/* NSC:     "Geode by NSC" */
+#define signature_NSC_ebx 0x646f6547
+#define signature_NSC_edx 0x43534e20
+#define signature_NSC_ecx 0x79622065
+/* NEXGEN:  "NexGenDriven" */
+#define signature_NEXGEN_ebx 0x4778654e
+#define signature_NEXGEN_edx 0x72446e65
+#define signature_NEXGEN_ecx 0x6e657669
+/* RISE:    "RiseRiseRise" */
+#define signature_RISE_ebx 0x65736952
+#define signature_RISE_edx 0x65736952
+#define signature_RISE_ecx 0x65736952
+/* SIS:     "SiS SiS SiS " */
+#define signature_SIS_ebx 0x20536953
+#define signature_SIS_edx 0x20536953
+#define signature_SIS_ecx 0x20536953
+/* UMC:     "UMC UMC UMC " */
+#define signature_UMC_ebx 0x20434d55
+#define signature_UMC_edx 0x20434d55
+#define signature_UMC_ecx 0x20434d55
+/* VIA:     "VIA VIA VIA " */
+#define signature_VIA_ebx 0x20414956
+#define signature_VIA_edx 0x20414956
+#define signature_VIA_ecx 0x20414956
+/* VORTEX:  "Vortex86 SoC" */
+#define signature_VORTEX_ebx 0x74726f56
+#define signature_VORTEX_edx 0x36387865
+#define signature_VORTEX_ecx 0x436f5320
+
+/* Features in %ecx for level 1 */
+#define bit_SSE3        0x00000001
+#define bit_PCLMULQDQ   0x00000002
+#define bit_DTES64      0x00000004
+#define bit_MONITOR     0x00000008
+#define bit_DSCPL       0x00000010
+#define bit_VMX         0x00000020
+#define bit_SMX         0x00000040
+#define bit_EIST        0x00000080
+#define bit_TM2         0x00000100
+#define bit_SSSE3       0x00000200
+#define bit_CNXTID      0x00000400
+#define bit_FMA         0x00001000
+#define bit_CMPXCHG16B  0x00002000
+#define bit_xTPR        0x00004000
+#define bit_PDCM        0x00008000
+#define bit_PCID        0x00020000
+#define bit_DCA         0x00040000
+#define bit_SSE41       0x00080000
+#define bit_SSE42       0x00100000
+#define bit_x2APIC      0x00200000
+#define bit_MOVBE       0x00400000
+#define bit_POPCNT      0x00800000
+#define bit_TSCDeadline 0x01000000
+#define bit_AESNI       0x02000000
+#define bit_XSAVE       0x04000000
+#define bit_OSXSAVE     0x08000000
+#define bit_AVX         0x10000000
+#define bit_RDRND       0x40000000
+
+/* Features in %edx for level 1 */
+#define bit_FPU         0x00000001
+#define bit_VME         0x00000002
+#define bit_DE          0x00000004
+#define bit_PSE         0x00000008
+#define bit_TSC         0x00000010
+#define bit_MSR         0x00000020
+#define bit_PAE         0x00000040
+#define bit_MCE         0x00000080
+#define bit_CX8         0x00000100
+#define bit_APIC        0x00000200
+#define bit_SEP         0x00000800
+#define bit_MTRR        0x00001000
+#define bit_PGE         0x00002000
+#define bit_MCA         0x00004000
+#define bit_CMOV        0x00008000
+#define bit_PAT         0x00010000
+#define bit_PSE36       0x00020000
+#define bit_PSN         0x00040000
+#define bit_CLFSH       0x00080000
+#define bit_DS          0x00200000
+#define bit_ACPI        0x00400000
+#define bit_MMX         0x00800000
+#define bit_FXSR        0x01000000
+#define bit_FXSAVE      bit_FXSR    /* for gcc compat */
+#define bit_SSE         0x02000000
+#define bit_SSE2        0x04000000
+#define bit_SS          0x08000000
+#define bit_HTT         0x10000000
+#define bit_TM          0x20000000
+#define bit_PBE         0x80000000
+
+/* Features in %ebx for level 7 sub-leaf 0 */
+#define bit_FSGSBASE    0x00000001
+#define bit_SMEP        0x00000080
+#define bit_ENH_MOVSB   0x00000200
+
+#if __i386__
+#define __cpuid(__level, __eax, __ebx, __ecx, __edx) \
+    __asm("cpuid" : "=a"(__eax), "=b" (__ebx), "=c"(__ecx), "=d"(__edx) \
+                  : "0"(__level))
+
+#define __cpuid_count(__level, __count, __eax, __ebx, __ecx, __edx) \
+    __asm("cpuid" : "=a"(__eax), "=b" (__ebx), "=c"(__ecx), "=d"(__edx) \
+                  : "0"(__level), "2"(__count))
+#else
+/* x86-64 uses %rbx as the base register, so preserve it. */
+#define __cpuid(__level, __eax, __ebx, __ecx, __edx) \
+    __asm("  xchgq  %%rbx,%q1\n" \
+          "  cpuid\n" \
+          "  xchgq  %%rbx,%q1" \
+        : "=a"(__eax), "=r" (__ebx), "=c"(__ecx), "=d"(__edx) \
+        : "0"(__level))
+
+#define __cpuid_count(__level, __count, __eax, __ebx, __ecx, __edx) \
+    __asm("  xchgq  %%rbx,%q1\n" \
+          "  cpuid\n" \
+          "  xchgq  %%rbx,%q1" \
+        : "=a"(__eax), "=r" (__ebx), "=c"(__ecx), "=d"(__edx) \
+        : "0"(__level), "2"(__count))
+#endif
+
+static __inline int __get_cpuid (unsigned int __level, unsigned int *__eax,
+                                 unsigned int *__ebx, unsigned int *__ecx,
+                                 unsigned int *__edx) {
+    __cpuid(__level, *__eax, *__ebx, *__ecx, *__edx);
+    return 1;
+}
+
+static __inline int __get_cpuid_max (unsigned int __level, unsigned int *__sig)
+{
+    unsigned int __eax, __ebx, __ecx, __edx;
+#if __i386__
+    int __cpuid_supported;
+
+    __asm("  pushfl\n"
+          "  popl   %%eax\n"
+          "  movl   %%eax,%%ecx\n"
+          "  xorl   $0x00200000,%%eax\n"
+          "  pushl  %%eax\n"
+          "  popfl\n"
+          "  pushfl\n"
+          "  popl   %%eax\n"
+          "  movl   $0,%0\n"
+          "  cmpl   %%eax,%%ecx\n"
+          "  je     1f\n"
+          "  movl   $1,%0\n"
+          "1:"
+        : "=r" (__cpuid_supported) : : "eax", "ecx");
+    if (!__cpuid_supported)
+        return 0;
+#endif
+
+    __cpuid(__level, __eax, __ebx, __ecx, __edx);
+    if (__sig)
+        *__sig = __ebx;
+    return __eax;
+}
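
Illustrative only, not part of the patch: a minimal sketch of using the helpers above to
read the vendor signature and test two of the level-1 feature bits. It assumes an x86 host;
the output format is invented for the example.

    #include <cpuid.h>
    #include <stdio.h>

    int main(void) {
      unsigned int sig = 0, eax, ebx, ecx, edx;
      if (__get_cpuid_max(0, &sig) == 0)
        return 1;                               /* CPUID not available (very old 32-bit CPUs) */
      printf("vendor %%ebx = 0x%08x, Intel: %d\n",
             sig, sig == signature_INTEL_ebx);
      __get_cpuid(1, &eax, &ebx, &ecx, &edx);
      printf("SSE2: %d, SSE4.2: %d\n",
             !!(edx & bit_SSE2), !!(ecx & bit_SSE42));
      return 0;
    }
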
diff --git a/24.0.3/clang-include/cuda_builtin_vars.h b/24.0.3/clang-include/cuda_builtin_vars.h
new file mode 100644
index 0000000..901356b
--- /dev/null
+++ b/24.0.3/clang-include/cuda_builtin_vars.h
@@ -0,0 +1,110 @@
+/*===---- cuda_builtin_vars.h - CUDA built-in variables ---------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CUDA_BUILTIN_VARS_H
+#define __CUDA_BUILTIN_VARS_H
+
+// The file implements built-in CUDA variables using __declspec(property).
+// https://msdn.microsoft.com/en-us/library/yhfk0thd.aspx
+// All read accesses of built-in variable fields get converted into calls to a
+// getter function which in turn calls the appropriate builtin to fetch the
+// value.
+//
+// Example:
+//    int x = threadIdx.x;
+// IR output:
+//  %0 = call i32 @llvm.ptx.read.tid.x() #3
+// PTX output:
+//  mov.u32     %r2, %tid.x;
+
+#define __CUDA_DEVICE_BUILTIN(FIELD, INTRINSIC)                                \
+  __declspec(property(get = __fetch_builtin_##FIELD)) unsigned int FIELD;      \
+  static inline __attribute__((always_inline))                                 \
+      __attribute__((device)) unsigned int __fetch_builtin_##FIELD(void) {     \
+    return INTRINSIC;                                                          \
+  }
+
+#if __cplusplus >= 201103L
+#define __DELETE =delete
+#else
+#define __DELETE
+#endif
+
+// Make sure nobody can create instances of the special variable types.  nvcc
+// also disallows taking the address of special variables, so we disable address-of
+// operator as well.
+#define __CUDA_DISALLOW_BUILTINVAR_ACCESS(TypeName)                            \
+  __attribute__((device)) TypeName() __DELETE;                                 \
+  __attribute__((device)) TypeName(const TypeName &) __DELETE;                 \
+  __attribute__((device)) void operator=(const TypeName &) const __DELETE;     \
+  __attribute__((device)) TypeName *operator&() const __DELETE
+
+struct __cuda_builtin_threadIdx_t {
+  __CUDA_DEVICE_BUILTIN(x,__builtin_ptx_read_tid_x());
+  __CUDA_DEVICE_BUILTIN(y,__builtin_ptx_read_tid_y());
+  __CUDA_DEVICE_BUILTIN(z,__builtin_ptx_read_tid_z());
+private:
+  __CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_threadIdx_t);
+};
+
+struct __cuda_builtin_blockIdx_t {
+  __CUDA_DEVICE_BUILTIN(x,__builtin_ptx_read_ctaid_x());
+  __CUDA_DEVICE_BUILTIN(y,__builtin_ptx_read_ctaid_y());
+  __CUDA_DEVICE_BUILTIN(z,__builtin_ptx_read_ctaid_z());
+private:
+  __CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_blockIdx_t);
+};
+
+struct __cuda_builtin_blockDim_t {
+  __CUDA_DEVICE_BUILTIN(x,__builtin_ptx_read_ntid_x());
+  __CUDA_DEVICE_BUILTIN(y,__builtin_ptx_read_ntid_y());
+  __CUDA_DEVICE_BUILTIN(z,__builtin_ptx_read_ntid_z());
+private:
+  __CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_blockDim_t);
+};
+
+struct __cuda_builtin_gridDim_t {
+  __CUDA_DEVICE_BUILTIN(x,__builtin_ptx_read_nctaid_x());
+  __CUDA_DEVICE_BUILTIN(y,__builtin_ptx_read_nctaid_y());
+  __CUDA_DEVICE_BUILTIN(z,__builtin_ptx_read_nctaid_z());
+private:
+  __CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_gridDim_t);
+};
+
+#define __CUDA_BUILTIN_VAR                                                     \
+  extern const __attribute__((device)) __attribute__((weak))
+__CUDA_BUILTIN_VAR __cuda_builtin_threadIdx_t threadIdx;
+__CUDA_BUILTIN_VAR __cuda_builtin_blockIdx_t blockIdx;
+__CUDA_BUILTIN_VAR __cuda_builtin_blockDim_t blockDim;
+__CUDA_BUILTIN_VAR __cuda_builtin_gridDim_t gridDim;
+
+// warpSize should translate to a read of %WARP_SZ, but there's currently no
+// builtin to do so. According to the PTX v4.2 docs, 'to date, all target
+// architectures have a WARP_SZ value of 32'.
+__attribute__((device)) const int warpSize = 32;
+
+#undef __CUDA_DEVICE_BUILTIN
+#undef __CUDA_BUILTIN_VAR
+#undef __CUDA_DISALLOW_BUILTINVAR_ACCESS
+
+#endif /* __CUDA_BUILTIN_VARS_H */
diff --git a/24.0.3/clang-include/emmintrin.h b/24.0.3/clang-include/emmintrin.h
new file mode 100644
index 0000000..cfc2c71
--- /dev/null
+++ b/24.0.3/clang-include/emmintrin.h
@@ -0,0 +1,1491 @@
+/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __EMMINTRIN_H
+#define __EMMINTRIN_H
+
+#include <xmmintrin.h>
+
+typedef double __m128d __attribute__((__vector_size__(16)));
+typedef long long __m128i __attribute__((__vector_size__(16)));
+
+/* Type defines.  */
+typedef double __v2df __attribute__ ((__vector_size__ (16)));
+typedef long long __v2di __attribute__ ((__vector_size__ (16)));
+typedef short __v8hi __attribute__((__vector_size__(16)));
+typedef char __v16qi __attribute__((__vector_size__(16)));
+
+/* We need an explicitly signed variant for char. Note that this shouldn't
+ * appear in the interface though. */
+typedef signed char __v16qs __attribute__((__vector_size__(16)));
+
+#include <f16cintrin.h>
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_add_sd(__m128d __a, __m128d __b)
+{
+  __a[0] += __b[0];
+  return __a;
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_add_pd(__m128d __a, __m128d __b)
+{
+  return __a + __b;
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_sub_sd(__m128d __a, __m128d __b)
+{
+  __a[0] -= __b[0];
+  return __a;
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_sub_pd(__m128d __a, __m128d __b)
+{
+  return __a - __b;
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mul_sd(__m128d __a, __m128d __b)
+{
+  __a[0] *= __b[0];
+  return __a;
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mul_pd(__m128d __a, __m128d __b)
+{
+  return __a * __b;
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_div_sd(__m128d __a, __m128d __b)
+{
+  __a[0] /= __b[0];
+  return __a;
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_div_pd(__m128d __a, __m128d __b)
+{
+  return __a / __b;
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_sqrt_sd(__m128d __a, __m128d __b)
+{
+  __m128d __c = __builtin_ia32_sqrtsd(__b);
+  return (__m128d) { __c[0], __a[1] };
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_sqrt_pd(__m128d __a)
+{
+  return __builtin_ia32_sqrtpd(__a);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_min_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_minsd(__a, __b);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_min_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_minpd(__a, __b);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_max_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_maxsd(__a, __b);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_max_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_maxpd(__a, __b);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_and_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)((__v4si)__a & (__v4si)__b);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_andnot_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)(~(__v4si)__a & (__v4si)__b);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_or_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)((__v4si)__a | (__v4si)__b);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_xor_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)((__v4si)__a ^ (__v4si)__b);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpeq_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpeqpd(__a, __b);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmplt_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpltpd(__a, __b);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmple_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmplepd(__a, __b);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpgt_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpltpd(__b, __a);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpge_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmplepd(__b, __a);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpord_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpordpd(__a, __b);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpunord_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpunordpd(__a, __b);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpneq_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpneqpd(__a, __b);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpnlt_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpnltpd(__a, __b);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpnle_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpnlepd(__a, __b);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpngt_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpnltpd(__b, __a);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpnge_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpnlepd(__b, __a);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpeq_sd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpeqsd(__a, __b);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmplt_sd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpltsd(__a, __b);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmple_sd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmplesd(__a, __b);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpgt_sd(__m128d __a, __m128d __b)
+{
+  __m128d __c = __builtin_ia32_cmpltsd(__b, __a);
+  return (__m128d) { __c[0], __a[1] };
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpge_sd(__m128d __a, __m128d __b)
+{
+  __m128d __c = __builtin_ia32_cmplesd(__b, __a);
+  return (__m128d) { __c[0], __a[1] };
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpord_sd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpordsd(__a, __b);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpunord_sd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpunordsd(__a, __b);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpneq_sd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpneqsd(__a, __b);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpnlt_sd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpnltsd(__a, __b);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpnle_sd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpnlesd(__a, __b);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpngt_sd(__m128d __a, __m128d __b)
+{
+  __m128d __c = __builtin_ia32_cmpnltsd(__b, __a);
+  return (__m128d) { __c[0], __a[1] };
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpnge_sd(__m128d __a, __m128d __b)
+{
+  __m128d __c = __builtin_ia32_cmpnlesd(__b, __a);
+  return (__m128d) { __c[0], __a[1] };
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_comieq_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_comisdeq(__a, __b);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_comilt_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_comisdlt(__a, __b);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_comile_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_comisdle(__a, __b);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_comigt_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_comisdgt(__a, __b);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_comige_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_comisdge(__a, __b);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_comineq_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_comisdneq(__a, __b);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_ucomieq_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_ucomisdeq(__a, __b);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_ucomilt_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_ucomisdlt(__a, __b);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_ucomile_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_ucomisdle(__a, __b);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_ucomigt_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_ucomisdgt(__a, __b);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_ucomige_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_ucomisdge(__a, __b);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_ucomineq_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_ucomisdneq(__a, __b);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtpd_ps(__m128d __a)
+{
+  return __builtin_ia32_cvtpd2ps(__a);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cvtps_pd(__m128 __a)
+{
+  return __builtin_ia32_cvtps2pd(__a);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cvtepi32_pd(__m128i __a)
+{
+  return __builtin_ia32_cvtdq2pd((__v4si)__a);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtpd_epi32(__m128d __a)
+{
+  return __builtin_ia32_cvtpd2dq(__a);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_cvtsd_si32(__m128d __a)
+{
+  return __builtin_ia32_cvtsd2si(__a);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtsd_ss(__m128 __a, __m128d __b)
+{
+  __a[0] = __b[0];
+  return __a;
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cvtsi32_sd(__m128d __a, int __b)
+{
+  __a[0] = __b;
+  return __a;
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cvtss_sd(__m128d __a, __m128 __b)
+{
+  __a[0] = __b[0];
+  return __a;
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvttpd_epi32(__m128d __a)
+{
+  return (__m128i)__builtin_ia32_cvttpd2dq(__a);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_cvttsd_si32(__m128d __a)
+{
+  return __a[0];
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cvtpd_pi32(__m128d __a)
+{
+  return (__m64)__builtin_ia32_cvtpd2pi(__a);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cvttpd_pi32(__m128d __a)
+{
+  return (__m64)__builtin_ia32_cvttpd2pi(__a);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cvtpi32_pd(__m64 __a)
+{
+  return __builtin_ia32_cvtpi2pd((__v2si)__a);
+}
+
+static __inline__ double __DEFAULT_FN_ATTRS
+_mm_cvtsd_f64(__m128d __a)
+{
+  return __a[0];
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_load_pd(double const *__dp)
+{
+  return *(__m128d*)__dp;
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_load1_pd(double const *__dp)
+{
+  struct __mm_load1_pd_struct {
+    double __u;
+  } __attribute__((__packed__, __may_alias__));
+  double __u = ((struct __mm_load1_pd_struct*)__dp)->__u;
+  return (__m128d){ __u, __u };
+}
+
+#define _mm_load_pd1(dp) _mm_load1_pd(dp)
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_loadr_pd(double const *__dp)
+{
+  __m128d __u = *(__m128d*)__dp;
+  return __builtin_shufflevector(__u, __u, 1, 0);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_loadu_pd(double const *__dp)
+{
+  struct __loadu_pd {
+    __m128d __v;
+  } __attribute__((__packed__, __may_alias__));
+  return ((struct __loadu_pd*)__dp)->__v;
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_load_sd(double const *__dp)
+{
+  struct __mm_load_sd_struct {
+    double __u;
+  } __attribute__((__packed__, __may_alias__));
+  double __u = ((struct __mm_load_sd_struct*)__dp)->__u;
+  return (__m128d){ __u, 0 };
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_loadh_pd(__m128d __a, double const *__dp)
+{
+  struct __mm_loadh_pd_struct {
+    double __u;
+  } __attribute__((__packed__, __may_alias__));
+  double __u = ((struct __mm_loadh_pd_struct*)__dp)->__u;
+  return (__m128d){ __a[0], __u };
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_loadl_pd(__m128d __a, double const *__dp)
+{
+  struct __mm_loadl_pd_struct {
+    double __u;
+  } __attribute__((__packed__, __may_alias__));
+  double __u = ((struct __mm_loadl_pd_struct*)__dp)->__u;
+  return (__m128d){ __u, __a[1] };
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_undefined_pd()
+{
+  return (__m128d)__builtin_ia32_undef128();
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_set_sd(double __w)
+{
+  return (__m128d){ __w, 0 };
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_set1_pd(double __w)
+{
+  return (__m128d){ __w, __w };
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_set_pd(double __w, double __x)
+{
+  return (__m128d){ __x, __w };
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_setr_pd(double __w, double __x)
+{
+  return (__m128d){ __w, __x };
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_setzero_pd(void)
+{
+  return (__m128d){ 0, 0 };
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_move_sd(__m128d __a, __m128d __b)
+{
+  return (__m128d){ __b[0], __a[1] };
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_store_sd(double *__dp, __m128d __a)
+{
+  struct __mm_store_sd_struct {
+    double __u;
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __mm_store_sd_struct*)__dp)->__u = __a[0];
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_store1_pd(double *__dp, __m128d __a)
+{
+  struct __mm_store1_pd_struct {
+    double __u[2];
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __mm_store1_pd_struct*)__dp)->__u[0] = __a[0];
+  ((struct __mm_store1_pd_struct*)__dp)->__u[1] = __a[0];
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_store_pd(double *__dp, __m128d __a)
+{
+  *(__m128d *)__dp = __a;
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_storeu_pd(double *__dp, __m128d __a)
+{
+  __builtin_ia32_storeupd(__dp, __a);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_storer_pd(double *__dp, __m128d __a)
+{
+  __a = __builtin_shufflevector(__a, __a, 1, 0);
+  *(__m128d *)__dp = __a;
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_storeh_pd(double *__dp, __m128d __a)
+{
+  struct __mm_storeh_pd_struct {
+    double __u;
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1];
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_storel_pd(double *__dp, __m128d __a)
+{
+  struct __mm_storeh_pd_struct {
+    double __u;
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0];
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_add_epi8(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v16qi)__a + (__v16qi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_add_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v8hi)__a + (__v8hi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_add_epi32(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v4si)__a + (__v4si)__b);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_add_si64(__m64 __a, __m64 __b)
+{
+  return (__m64)__builtin_ia32_paddq(__a, __b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_add_epi64(__m128i __a, __m128i __b)
+{
+  return __a + __b;
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_adds_epi8(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_adds_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_adds_epu8(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_adds_epu16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_avg_epu8(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_avg_epu16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_madd_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_max_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_max_epu8(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_min_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_min_epu8(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mulhi_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mulhi_epu16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mullo_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v8hi)__a * (__v8hi)__b);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_mul_su32(__m64 __a, __m64 __b)
+{
+  return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mul_epu32(__m128i __a, __m128i __b)
+{
+  return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sad_epu8(__m128i __a, __m128i __b)
+{
+  return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sub_epi8(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v16qi)__a - (__v16qi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sub_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v8hi)__a - (__v8hi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sub_epi32(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v4si)__a - (__v4si)__b);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_sub_si64(__m64 __a, __m64 __b)
+{
+  return (__m64)__builtin_ia32_psubq(__a, __b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sub_epi64(__m128i __a, __m128i __b)
+{
+  return __a - __b;
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_subs_epi8(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_psubsb128((__v16qi)__a, (__v16qi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_subs_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_psubsw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_subs_epu8(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_psubusb128((__v16qi)__a, (__v16qi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_subs_epu16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_psubusw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_and_si128(__m128i __a, __m128i __b)
+{
+  return __a & __b;
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_andnot_si128(__m128i __a, __m128i __b)
+{
+  return ~__a & __b;
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_or_si128(__m128i __a, __m128i __b)
+{
+  return __a | __b;
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_xor_si128(__m128i __a, __m128i __b)
+{
+  return __a ^ __b;
+}
+
+#define _mm_slli_si128(a, imm) __extension__ ({                         \
+  (__m128i)__builtin_shufflevector((__v16qi)_mm_setzero_si128(),        \
+                                   (__v16qi)(__m128i)(a),               \
+                                   ((imm)&0xF0) ? 0 : 16 - ((imm)&0xF), \
+                                   ((imm)&0xF0) ? 0 : 17 - ((imm)&0xF), \
+                                   ((imm)&0xF0) ? 0 : 18 - ((imm)&0xF), \
+                                   ((imm)&0xF0) ? 0 : 19 - ((imm)&0xF), \
+                                   ((imm)&0xF0) ? 0 : 20 - ((imm)&0xF), \
+                                   ((imm)&0xF0) ? 0 : 21 - ((imm)&0xF), \
+                                   ((imm)&0xF0) ? 0 : 22 - ((imm)&0xF), \
+                                   ((imm)&0xF0) ? 0 : 23 - ((imm)&0xF), \
+                                   ((imm)&0xF0) ? 0 : 24 - ((imm)&0xF), \
+                                   ((imm)&0xF0) ? 0 : 25 - ((imm)&0xF), \
+                                   ((imm)&0xF0) ? 0 : 26 - ((imm)&0xF), \
+                                   ((imm)&0xF0) ? 0 : 27 - ((imm)&0xF), \
+                                   ((imm)&0xF0) ? 0 : 28 - ((imm)&0xF), \
+                                   ((imm)&0xF0) ? 0 : 29 - ((imm)&0xF), \
+                                   ((imm)&0xF0) ? 0 : 30 - ((imm)&0xF), \
+                                   ((imm)&0xF0) ? 0 : 31 - ((imm)&0xF)); })
+
+#define _mm_bslli_si128(a, imm) \
+  _mm_slli_si128((a), (imm))
+
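
Illustrative only, not part of the patch: _mm_slli_si128 (and _mm_srli_si128 further down)
shift the whole 128-bit value by bytes, not bits, shifting in zeros; an immediate above 15
yields all zeros. A small round-trip sketch, assuming SSE2 is enabled:

    #include <emmintrin.h>
    #include <stdio.h>

    int main(void) {
      __m128i v = _mm_cvtsi32_si128(0x12345678);      /* value lives in bytes 0..3 */
      __m128i s = _mm_slli_si128(v, 4);               /* move it up to bytes 4..7 */
      printf("0x%08x\n", _mm_cvtsi128_si32(s));                     /* 0x00000000 */
      printf("0x%08x\n", _mm_cvtsi128_si32(_mm_srli_si128(s, 4)));  /* 0x12345678 */
      return 0;
    }
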
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_slli_epi16(__m128i __a, int __count)
+{
+  return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sll_epi16(__m128i __a, __m128i __count)
+{
+  return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_slli_epi32(__m128i __a, int __count)
+{
+  return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sll_epi32(__m128i __a, __m128i __count)
+{
+  return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_slli_epi64(__m128i __a, int __count)
+{
+  return __builtin_ia32_psllqi128(__a, __count);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sll_epi64(__m128i __a, __m128i __count)
+{
+  return __builtin_ia32_psllq128(__a, __count);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_srai_epi16(__m128i __a, int __count)
+{
+  return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sra_epi16(__m128i __a, __m128i __count)
+{
+  return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_srai_epi32(__m128i __a, int __count)
+{
+  return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sra_epi32(__m128i __a, __m128i __count)
+{
+  return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
+}
+
+#define _mm_srli_si128(a, imm) __extension__ ({                          \
+  (__m128i)__builtin_shufflevector((__v16qi)(__m128i)(a),                \
+                                   (__v16qi)_mm_setzero_si128(),         \
+                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 0,  \
+                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 1,  \
+                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 2,  \
+                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 3,  \
+                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 4,  \
+                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 5,  \
+                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 6,  \
+                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 7,  \
+                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 8,  \
+                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 9,  \
+                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 10, \
+                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 11, \
+                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 12, \
+                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 13, \
+                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 14, \
+                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 15); })
+
+#define _mm_bsrli_si128(a, imm) \
+  _mm_srli_si128((a), (imm))
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_srli_epi16(__m128i __a, int __count)
+{
+  return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_srl_epi16(__m128i __a, __m128i __count)
+{
+  return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_srli_epi32(__m128i __a, int __count)
+{
+  return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_srl_epi32(__m128i __a, __m128i __count)
+{
+  return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_srli_epi64(__m128i __a, int __count)
+{
+  return __builtin_ia32_psrlqi128(__a, __count);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_srl_epi64(__m128i __a, __m128i __count)
+{
+  return __builtin_ia32_psrlq128(__a, __count);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cmpeq_epi8(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v16qi)__a == (__v16qi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cmpeq_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v8hi)__a == (__v8hi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cmpeq_epi32(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v4si)__a == (__v4si)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cmpgt_epi8(__m128i __a, __m128i __b)
+{
+  /* This function always performs a signed comparison, but __v16qi is a char
+     which may be signed or unsigned, so use __v16qs. */
+  return (__m128i)((__v16qs)__a > (__v16qs)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cmpgt_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v8hi)__a > (__v8hi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cmpgt_epi32(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v4si)__a > (__v4si)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cmplt_epi8(__m128i __a, __m128i __b)
+{
+  return _mm_cmpgt_epi8(__b, __a);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cmplt_epi16(__m128i __a, __m128i __b)
+{
+  return _mm_cmpgt_epi16(__b, __a);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cmplt_epi32(__m128i __a, __m128i __b)
+{
+  return _mm_cmpgt_epi32(__b, __a);
+}
+
+#ifdef __x86_64__
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cvtsi64_sd(__m128d __a, long long __b)
+{
+  __a[0] = __b;
+  return __a;
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm_cvtsd_si64(__m128d __a)
+{
+  return __builtin_ia32_cvtsd2si64(__a);
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm_cvttsd_si64(__m128d __a)
+{
+  return __a[0];
+}
+#endif
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtepi32_ps(__m128i __a)
+{
+  return __builtin_ia32_cvtdq2ps((__v4si)__a);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtps_epi32(__m128 __a)
+{
+  return (__m128i)__builtin_ia32_cvtps2dq(__a);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvttps_epi32(__m128 __a)
+{
+  return (__m128i)__builtin_ia32_cvttps2dq(__a);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtsi32_si128(int __a)
+{
+  return (__m128i)(__v4si){ __a, 0, 0, 0 };
+}
+
+#ifdef __x86_64__
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtsi64_si128(long long __a)
+{
+  return (__m128i){ __a, 0 };
+}
+#endif
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_cvtsi128_si32(__m128i __a)
+{
+  __v4si __b = (__v4si)__a;
+  return __b[0];
+}
+
+#ifdef __x86_64__
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm_cvtsi128_si64(__m128i __a)
+{
+  return __a[0];
+}
+#endif
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_load_si128(__m128i const *__p)
+{
+  return *__p;
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_loadu_si128(__m128i const *__p)
+{
+  struct __loadu_si128 {
+    __m128i __v;
+  } __attribute__((__packed__, __may_alias__));
+  return ((struct __loadu_si128*)__p)->__v;
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_loadl_epi64(__m128i const *__p)
+{
+  struct __mm_loadl_epi64_struct {
+    long long __u;
+  } __attribute__((__packed__, __may_alias__));
+  return (__m128i) { ((struct __mm_loadl_epi64_struct*)__p)->__u, 0};
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_undefined_si128()
+{
+  return (__m128i)__builtin_ia32_undef128();
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_set_epi64x(long long __q1, long long __q0)
+{
+  return (__m128i){ __q0, __q1 };
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_set_epi64(__m64 __q1, __m64 __q0)
+{
+  return (__m128i){ (long long)__q0, (long long)__q1 };
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_set_epi32(int __i3, int __i2, int __i1, int __i0)
+{
+  return (__m128i)(__v4si){ __i0, __i1, __i2, __i3};
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, short __w2, short __w1, short __w0)
+{
+  return (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 };
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0)
+{
+  return (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 };
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_set1_epi64x(long long __q)
+{
+  return (__m128i){ __q, __q };
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_set1_epi64(__m64 __q)
+{
+  return (__m128i){ (long long)__q, (long long)__q };
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_set1_epi32(int __i)
+{
+  return (__m128i)(__v4si){ __i, __i, __i, __i };
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_set1_epi16(short __w)
+{
+  return (__m128i)(__v8hi){ __w, __w, __w, __w, __w, __w, __w, __w };
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_set1_epi8(char __b)
+{
+  return (__m128i)(__v16qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b };
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_setr_epi64(__m64 __q0, __m64 __q1)
+{
+  return (__m128i){ (long long)__q0, (long long)__q1 };
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_setr_epi32(int __i0, int __i1, int __i2, int __i3)
+{
+  return (__m128i)(__v4si){ __i0, __i1, __i2, __i3};
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, short __w5, short __w6, short __w7)
+{
+  return (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 };
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7, char __b8, char __b9, char __b10, char __b11, char __b12, char __b13, char __b14, char __b15)
+{
+  return (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 };
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_setzero_si128(void)
+{
+  return (__m128i){ 0LL, 0LL };
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_store_si128(__m128i *__p, __m128i __b)
+{
+  *__p = __b;
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_storeu_si128(__m128i *__p, __m128i __b)
+{
+  __builtin_ia32_storedqu((char *)__p, (__v16qi)__b);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
+{
+  __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_storel_epi64(__m128i *__p, __m128i __a)
+{
+  struct __mm_storel_epi64_struct {
+    long long __u;
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __mm_storel_epi64_struct*)__p)->__u = __a[0];
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_stream_pd(double *__p, __m128d __a)
+{
+  __builtin_ia32_movntpd(__p, __a);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_stream_si128(__m128i *__p, __m128i __a)
+{
+  __builtin_ia32_movntdq(__p, __a);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_stream_si32(int *__p, int __a)
+{
+  __builtin_ia32_movnti(__p, __a);
+}
+
+#ifdef __x86_64__
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_stream_si64(long long *__p, long long __a)
+{
+  __builtin_ia32_movnti64(__p, __a);
+}
+#endif
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_clflush(void const *__p)
+{
+  __builtin_ia32_clflush(__p);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_lfence(void)
+{
+  __builtin_ia32_lfence();
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mfence(void)
+{
+  __builtin_ia32_mfence();
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_packs_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_packs_epi32(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_packus_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_extract_epi16(__m128i __a, int __imm)
+{
+  __v8hi __b = (__v8hi)__a;
+  return (unsigned short)__b[__imm & 7];
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_insert_epi16(__m128i __a, int __b, int __imm)
+{
+  __v8hi __c = (__v8hi)__a;
+  __c[__imm & 7] = __b;
+  return (__m128i)__c;
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_movemask_epi8(__m128i __a)
+{
+  return __builtin_ia32_pmovmskb128((__v16qi)__a);
+}
+
+#define _mm_shuffle_epi32(a, imm) __extension__ ({ \
+  (__m128i)__builtin_shufflevector((__v4si)(__m128i)(a), \
+                                   (__v4si)_mm_setzero_si128(), \
+                                   (imm) & 0x3, ((imm) & 0xc) >> 2, \
+                                   ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6); })
+
+#define _mm_shufflelo_epi16(a, imm) __extension__ ({ \
+  (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \
+                                   (__v8hi)_mm_setzero_si128(), \
+                                   (imm) & 0x3, ((imm) & 0xc) >> 2, \
+                                   ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \
+                                   4, 5, 6, 7); })
+
+#define _mm_shufflehi_epi16(a, imm) __extension__ ({ \
+  (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \
+                                   (__v8hi)_mm_setzero_si128(), \
+                                   0, 1, 2, 3, \
+                                   4 + (((imm) & 0x03) >> 0), \
+                                   4 + (((imm) & 0x0c) >> 2), \
+                                   4 + (((imm) & 0x30) >> 4), \
+                                   4 + (((imm) & 0xc0) >> 6)); })
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_unpackhi_epi8(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_unpackhi_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_unpackhi_epi32(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_unpackhi_epi64(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_shufflevector(__a, __b, 1, 2+1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_unpacklo_epi8(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_unpacklo_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_unpacklo_epi32(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_unpacklo_epi64(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_shufflevector(__a, __b, 0, 2+0);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_movepi64_pi64(__m128i __a)
+{
+  return (__m64)__a[0];
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_movpi64_epi64(__m64 __a)
+{
+  return (__m128i){ (long long)__a, 0 };
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_move_epi64(__m128i __a)
+{
+  return __builtin_shufflevector(__a, (__m128i){ 0 }, 0, 2);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_unpackhi_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_shufflevector(__a, __b, 1, 2+1);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_unpacklo_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_shufflevector(__a, __b, 0, 2+0);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_movemask_pd(__m128d __a)
+{
+  return __builtin_ia32_movmskpd(__a);
+}
+
+#define _mm_shuffle_pd(a, b, i) __extension__ ({ \
+  (__m128d)__builtin_shufflevector((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \
+                                   (i) & 1, (((i) & 2) >> 1) + 2); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_castpd_ps(__m128d __a)
+{
+  return (__m128)__a;
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_castpd_si128(__m128d __a)
+{
+  return (__m128i)__a;
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_castps_pd(__m128 __a)
+{
+  return (__m128d)__a;
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_castps_si128(__m128 __a)
+{
+  return (__m128i)__a;
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_castsi128_ps(__m128i __a)
+{
+  return (__m128)__a;
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_castsi128_pd(__m128i __a)
+{
+  return (__m128d)__a;
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_pause(void)
+{
+  __builtin_ia32_pause();
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
+
+#endif /* __EMMINTRIN_H */
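
Illustrative only, not part of the patch: a short sketch exercising a few of the SSE2
intrinsics defined above. The values and output are invented for the example, and SSE2 is
assumed to be enabled (it is by default on x86-64).

    #include <emmintrin.h>
    #include <stdio.h>

    int main(void) {
      __m128d a = _mm_set_pd(2.0, 1.0);        /* memory order: {1.0, 2.0} */
      __m128d b = _mm_set1_pd(10.0);
      double out[2];
      _mm_storeu_pd(out, _mm_add_pd(a, b));
      printf("%.1f %.1f\n", out[0], out[1]);   /* 11.0 12.0 */

      __m128i v = _mm_set_epi32(3, 2, 1, 0);   /* elements 0..3 hold 0,1,2,3 */
      __m128i r = _mm_shuffle_epi32(v, 0x1B);  /* 0x1B selects 3,2,1,0: reversed */
      printf("%d\n", _mm_cvtsi128_si32(r));    /* 3 */
      return 0;
    }
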
diff --git a/24.0.3/clang-include/f16cintrin.h b/24.0.3/clang-include/f16cintrin.h
new file mode 100644
index 0000000..c655d98
--- /dev/null
+++ b/24.0.3/clang-include/f16cintrin.h
@@ -0,0 +1,45 @@
+/*===---- f16cintrin.h - F16C intrinsics -----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined __X86INTRIN_H && !defined __EMMINTRIN_H && !defined __IMMINTRIN_H
+#error "Never use <f16cintrin.h> directly; include <emmintrin.h> instead."
+#endif
+
+#ifndef __F16CINTRIN_H
+#define __F16CINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("f16c")))
+
+#define _mm_cvtps_ph(a, imm) __extension__ ({ \
+ (__m128i)__builtin_ia32_vcvtps2ph((__v4sf)(__m128)(a), (imm)); })
+
+static __inline __m128 __DEFAULT_FN_ATTRS
+_mm_cvtph_ps(__m128i __a)
+{
+  return (__m128)__builtin_ia32_vcvtph2ps((__v8hi)__a);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __F16CINTRIN_H */
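
Illustrative only, not part of the patch: a sketch of a float -> half -> float round trip
with the two intrinsics above. It assumes a target compiled with -mf16c and uses immediate
0 (round to nearest even) for the conversion; emmintrin.h is used as the entry point since
this header rejects direct inclusion.

    #include <emmintrin.h>
    #include <stdio.h>

    int main(void) {
      __m128 f = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
      __m128i h = _mm_cvtps_ph(f, 0);    /* 4 half-precision values in the low 64 bits */
      __m128 g = _mm_cvtph_ps(h);        /* back to single precision */
      float out[4];
      _mm_storeu_ps(out, g);
      printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  /* 1 2 3 4 */
      return 0;
    }
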
diff --git a/24.0.3/clang-include/float.h b/24.0.3/clang-include/float.h
new file mode 100644
index 0000000..238cf76
--- /dev/null
+++ b/24.0.3/clang-include/float.h
@@ -0,0 +1,124 @@
+/*===---- float.h - Characteristics of floating point types ----------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __FLOAT_H
+#define __FLOAT_H
+
+/* If we're on MinGW, fall back to the system's float.h, which might have
+ * additional definitions provided for Windows.
+ * For more details see http://msdn.microsoft.com/en-us/library/y0ybw9fy.aspx
+ */
+#if (defined(__MINGW32__) || defined(_MSC_VER)) && __STDC_HOSTED__ && \
+    __has_include_next(<float.h>)
+#  include_next <float.h>
+
+/* Undefine anything that we'll be redefining below. */
+#  undef FLT_EVAL_METHOD
+#  undef FLT_ROUNDS
+#  undef FLT_RADIX
+#  undef FLT_MANT_DIG
+#  undef DBL_MANT_DIG
+#  undef LDBL_MANT_DIG
+#  undef DECIMAL_DIG
+#  undef FLT_DIG
+#  undef DBL_DIG
+#  undef LDBL_DIG
+#  undef FLT_MIN_EXP
+#  undef DBL_MIN_EXP
+#  undef LDBL_MIN_EXP
+#  undef FLT_MIN_10_EXP
+#  undef DBL_MIN_10_EXP
+#  undef LDBL_MIN_10_EXP
+#  undef FLT_MAX_EXP
+#  undef DBL_MAX_EXP
+#  undef LDBL_MAX_EXP
+#  undef FLT_MAX_10_EXP
+#  undef DBL_MAX_10_EXP
+#  undef LDBL_MAX_10_EXP
+#  undef FLT_MAX
+#  undef DBL_MAX
+#  undef LDBL_MAX
+#  undef FLT_EPSILON
+#  undef DBL_EPSILON
+#  undef LDBL_EPSILON
+#  undef FLT_MIN
+#  undef DBL_MIN
+#  undef LDBL_MIN
+#  if __STDC_VERSION__ >= 201112L || !defined(__STRICT_ANSI__)
+#    undef FLT_TRUE_MIN
+#    undef DBL_TRUE_MIN
+#    undef LDBL_TRUE_MIN
+#  endif
+#endif
+
+/* Characteristics of floating point types, C99 5.2.4.2.2 */
+
+#define FLT_EVAL_METHOD __FLT_EVAL_METHOD__
+#define FLT_ROUNDS (__builtin_flt_rounds())
+#define FLT_RADIX __FLT_RADIX__
+
+#define FLT_MANT_DIG __FLT_MANT_DIG__
+#define DBL_MANT_DIG __DBL_MANT_DIG__
+#define LDBL_MANT_DIG __LDBL_MANT_DIG__
+
+#define DECIMAL_DIG __DECIMAL_DIG__
+
+#define FLT_DIG __FLT_DIG__
+#define DBL_DIG __DBL_DIG__
+#define LDBL_DIG __LDBL_DIG__
+
+#define FLT_MIN_EXP __FLT_MIN_EXP__
+#define DBL_MIN_EXP __DBL_MIN_EXP__
+#define LDBL_MIN_EXP __LDBL_MIN_EXP__
+
+#define FLT_MIN_10_EXP __FLT_MIN_10_EXP__
+#define DBL_MIN_10_EXP __DBL_MIN_10_EXP__
+#define LDBL_MIN_10_EXP __LDBL_MIN_10_EXP__
+
+#define FLT_MAX_EXP __FLT_MAX_EXP__
+#define DBL_MAX_EXP __DBL_MAX_EXP__
+#define LDBL_MAX_EXP __LDBL_MAX_EXP__
+
+#define FLT_MAX_10_EXP __FLT_MAX_10_EXP__
+#define DBL_MAX_10_EXP __DBL_MAX_10_EXP__
+#define LDBL_MAX_10_EXP __LDBL_MAX_10_EXP__
+
+#define FLT_MAX __FLT_MAX__
+#define DBL_MAX __DBL_MAX__
+#define LDBL_MAX __LDBL_MAX__
+
+#define FLT_EPSILON __FLT_EPSILON__
+#define DBL_EPSILON __DBL_EPSILON__
+#define LDBL_EPSILON __LDBL_EPSILON__
+
+#define FLT_MIN __FLT_MIN__
+#define DBL_MIN __DBL_MIN__
+#define LDBL_MIN __LDBL_MIN__
+
+#if __STDC_VERSION__ >= 201112L || !defined(__STRICT_ANSI__)
+#  define FLT_TRUE_MIN __FLT_DENORM_MIN__
+#  define DBL_TRUE_MIN __DBL_DENORM_MIN__
+#  define LDBL_TRUE_MIN __LDBL_DENORM_MIN__
+#endif
+
+#endif /* __FLOAT_H */
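
The constants defined above map directly onto the compiler's built-in macros. As a rough illustration (not part of the patched header, assuming a standard hosted C compiler), FLT_EPSILON is the gap between 1.0f and the next representable float, which is why it is a common base for approximate comparisons:

    /* Illustrative sketch only; not part of float.h above. */
    #include <float.h>
    #include <stdio.h>

    int main(void) {
      printf("FLT_EPSILON = %g\n", (double)FLT_EPSILON);
      /* 1.0f + FLT_EPSILON is the next float after 1.0f, so this prints 1. */
      printf("1.0f + FLT_EPSILON > 1.0f ? %d\n", (1.0f + FLT_EPSILON) > 1.0f);
      return 0;
    }
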
diff --git a/24.0.3/clang-include/fma4intrin.h b/24.0.3/clang-include/fma4intrin.h
new file mode 100644
index 0000000..f117887
--- /dev/null
+++ b/24.0.3/clang-include/fma4intrin.h
@@ -0,0 +1,230 @@
+/*===---- fma4intrin.h - FMA4 intrinsics -----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __X86INTRIN_H
+#error "Never use <fma4intrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __FMA4INTRIN_H
+#define __FMA4INTRIN_H
+
+#include <pmmintrin.h>
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("fma4")))
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_macc_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddps(__A, __B, __C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_macc_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddpd(__A, __B, __C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_macc_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddss(__A, __B, __C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_macc_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddsd(__A, __B, __C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_msub_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmsubps(__A, __B, __C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_msub_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmsubpd(__A, __B, __C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_msub_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmsubss(__A, __B, __C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_msub_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmsubsd(__A, __B, __C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_nmacc_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfnmaddps(__A, __B, __C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_nmacc_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfnmaddpd(__A, __B, __C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_nmacc_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfnmaddss(__A, __B, __C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_nmacc_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfnmaddsd(__A, __B, __C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_nmsub_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfnmsubps(__A, __B, __C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_nmsub_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfnmsubpd(__A, __B, __C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_nmsub_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfnmsubss(__A, __B, __C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_nmsub_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfnmsubsd(__A, __B, __C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maddsub_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddsubps(__A, __B, __C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maddsub_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddsubpd(__A, __B, __C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_msubadd_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmsubaddps(__A, __B, __C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_msubadd_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmsubaddpd(__A, __B, __C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_macc_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmaddps256(__A, __B, __C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_macc_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmaddpd256(__A, __B, __C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_msub_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmsubps256(__A, __B, __C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_msub_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmsubpd256(__A, __B, __C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_nmacc_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfnmaddps256(__A, __B, __C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_nmacc_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfnmaddpd256(__A, __B, __C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_nmsub_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfnmsubps256(__A, __B, __C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_nmsub_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfnmsubpd256(__A, __B, __C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maddsub_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmaddsubps256(__A, __B, __C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maddsub_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmaddsubpd256(__A, __B, __C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_msubadd_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmsubaddps256(__A, __B, __C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_msubadd_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmsubaddpd256(__A, __B, __C);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __FMA4INTRIN_H */
diff --git a/24.0.3/clang-include/fmaintrin.h b/24.0.3/clang-include/fmaintrin.h
new file mode 100644
index 0000000..114a143
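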
--- /dev/null
+++ b/24.0.3/clang-include/fmaintrin.h
@@ -0,0 +1,228 @@
+/*===---- fmaintrin.h - FMA intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <fmaintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __FMAINTRIN_H
+#define __FMAINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("fma")))
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddps(__A, __B, __C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddpd(__A, __B, __C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddss(__A, __B, __C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddsd(__A, __B, __C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmsubps(__A, __B, __C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmsubpd(__A, __B, __C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmsubss(__A, __B, __C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmsubsd(__A, __B, __C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfnmaddps(__A, __B, __C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfnmaddpd(__A, __B, __C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfnmaddss(__A, __B, __C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfnmaddsd(__A, __B, __C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfnmsubps(__A, __B, __C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfnmsubpd(__A, __B, __C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfnmsubss(__A, __B, __C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfnmsubsd(__A, __B, __C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddsubps(__A, __B, __C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddsubpd(__A, __B, __C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmsubaddps(__A, __B, __C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmsubaddpd(__A, __B, __C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmaddps256(__A, __B, __C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmaddpd256(__A, __B, __C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmsubps256(__A, __B, __C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmsubpd256(__A, __B, __C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfnmaddps256(__A, __B, __C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfnmaddpd256(__A, __B, __C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfnmsubps256(__A, __B, __C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfnmsubpd256(__A, __B, __C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmaddsubps256(__A, __B, __C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmaddsubpd256(__A, __B, __C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmsubaddps256(__A, __B, __C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmsubaddpd256(__A, __B, __C);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __FMAINTRIN_H */
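
Each of the FMA intrinsics above lowers to a single fused multiply-add instruction. A minimal usage sketch, not part of the patched header and assuming a host compiler invoked with -mfma (variable names are examples only):

    /* Illustrative sketch only; not part of fmaintrin.h above. */
    #include <immintrin.h>
    #include <stdio.h>

    int main(void) {
      __m256 a = _mm256_set1_ps(2.0f);
      __m256 b = _mm256_set1_ps(3.0f);
      __m256 c = _mm256_set1_ps(1.0f);
      __m256 r = _mm256_fmadd_ps(a, b, c);   /* each lane: 2*3 + 1 = 7 */
      float out[8];
      _mm256_storeu_ps(out, r);
      printf("%f\n", out[0]);                /* prints 7.000000 */
      return 0;
    }
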
diff --git a/24.0.3/clang-include/fxsrintrin.h b/24.0.3/clang-include/fxsrintrin.h
new file mode 100644
index 0000000..ac6026a
--- /dev/null
+++ b/24.0.3/clang-include/fxsrintrin.h
@@ -0,0 +1,55 @@
+/*===---- fxsrintrin.h - FXSR intrinsic ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <fxsrintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __FXSRINTRIN_H
+#define __FXSRINTRIN_H
+
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__,  __target__("fxsr")))
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_fxsave(void *__p) {
+  return __builtin_ia32_fxsave(__p);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_fxsave64(void *__p) {
+  return __builtin_ia32_fxsave64(__p);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_fxrstor(void *__p) {
+  return __builtin_ia32_fxrstor(__p);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_fxrstor64(void *__p) {
+  return __builtin_ia32_fxrstor64(__p);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
diff --git a/24.0.3/clang-include/htmintrin.h b/24.0.3/clang-include/htmintrin.h
new file mode 100644
index 0000000..0088c7c
--- /dev/null
+++ b/24.0.3/clang-include/htmintrin.h
@@ -0,0 +1,226 @@
+/*===---- htmintrin.h - Standard header for PowerPC HTM ---------------===*\
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+\*===----------------------------------------------------------------------===*/
+
+#ifndef __HTMINTRIN_H
+#define __HTMINTRIN_H
+
+#ifndef __HTM__
+#error "HTM instruction set not enabled"
+#endif
+
+#ifdef __powerpc__
+
+#include <stdint.h>
+
+typedef uint64_t texasr_t;
+typedef uint32_t texasru_t;
+typedef uint32_t texasrl_t;
+typedef uintptr_t tfiar_t;
+typedef uintptr_t tfhar_t;
+
+#define _HTM_STATE(CR0) ((CR0 >> 1) & 0x3)
+#define _HTM_NONTRANSACTIONAL 0x0
+#define _HTM_SUSPENDED        0x1
+#define _HTM_TRANSACTIONAL    0x2
+
+#define _TEXASR_EXTRACT_BITS(TEXASR,BITNUM,SIZE) \
+  (((TEXASR) >> (63-(BITNUM))) & ((1<<(SIZE))-1))
+#define _TEXASRU_EXTRACT_BITS(TEXASR,BITNUM,SIZE) \
+  (((TEXASR) >> (31-(BITNUM))) & ((1<<(SIZE))-1))
+
+#define _TEXASR_FAILURE_CODE(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 7, 8)
+#define _TEXASRU_FAILURE_CODE(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 7, 8)
+
+#define _TEXASR_FAILURE_PERSISTENT(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 7, 1)
+#define _TEXASRU_FAILURE_PERSISTENT(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 7, 1)
+
+#define _TEXASR_DISALLOWED(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 8, 1)
+#define _TEXASRU_DISALLOWED(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 8, 1)
+
+#define _TEXASR_NESTING_OVERFLOW(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 9, 1)
+#define _TEXASRU_NESTING_OVERFLOW(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 9, 1)
+
+#define _TEXASR_FOOTPRINT_OVERFLOW(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 10, 1)
+#define _TEXASRU_FOOTPRINT_OVERFLOW(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 10, 1)
+
+#define _TEXASR_SELF_INDUCED_CONFLICT(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 11, 1)
+#define _TEXASRU_SELF_INDUCED_CONFLICT(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 11, 1)
+
+#define _TEXASR_NON_TRANSACTIONAL_CONFLICT(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 12, 1)
+#define _TEXASRU_NON_TRANSACTIONAL_CONFLICT(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 12, 1)
+
+#define _TEXASR_TRANSACTION_CONFLICT(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 13, 1)
+#define _TEXASRU_TRANSACTION_CONFLICT(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 13, 1)
+
+#define _TEXASR_TRANSLATION_INVALIDATION_CONFLICT(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 14, 1)
+#define _TEXASRU_TRANSLATION_INVALIDATION_CONFLICT(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 14, 1)
+
+#define _TEXASR_IMPLEMENTAION_SPECIFIC(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 15, 1)
+#define _TEXASRU_IMPLEMENTAION_SPECIFIC(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 15, 1)
+
+#define _TEXASR_INSTRUCTION_FETCH_CONFLICT(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 16, 1)
+#define _TEXASRU_INSTRUCTION_FETCH_CONFLICT(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 16, 1)
+
+#define _TEXASR_ABORT(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 31, 1)
+#define _TEXASRU_ABORT(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 31, 1)
+
+
+#define _TEXASR_SUSPENDED(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 32, 1)
+
+#define _TEXASR_PRIVILEGE(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 35, 2)
+
+#define _TEXASR_FAILURE_SUMMARY(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 36, 1)
+
+#define _TEXASR_TFIAR_EXACT(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 37, 1)
+
+#define _TEXASR_ROT(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 38, 1)
+
+#define _TEXASR_TRANSACTION_LEVEL(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 63, 12)
+
+#endif /* __powerpc__ */
+
+#ifdef __s390__
+
+/* Condition codes generated by tbegin  */
+#define _HTM_TBEGIN_STARTED       0
+#define _HTM_TBEGIN_INDETERMINATE 1
+#define _HTM_TBEGIN_TRANSIENT     2
+#define _HTM_TBEGIN_PERSISTENT    3
+
+/* The abort codes below this threshold are reserved for machine use.  */
+#define _HTM_FIRST_USER_ABORT_CODE 256
+
+/* The transaction diagnostic block as it is defined in the Principles
+   of Operation, chapter 5-91.  */
+
+struct __htm_tdb {
+  unsigned char format;                /*   0 */
+  unsigned char flags;
+  unsigned char reserved1[4];
+  unsigned short nesting_depth;
+  unsigned long long abort_code;       /*   8 */
+  unsigned long long conflict_token;   /*  16 */
+  unsigned long long atia;             /*  24 */
+  unsigned char eaid;                  /*  32 */
+  unsigned char dxc;
+  unsigned char reserved2[2];
+  unsigned int program_int_id;
+  unsigned long long exception_id;     /*  40 */
+  unsigned long long bea;              /*  48 */
+  unsigned char reserved3[72];         /*  56 */
+  unsigned long long gprs[16];         /* 128 */
+} __attribute__((__packed__, __aligned__ (8)));
+
+
+/* Helper intrinsics to retry tbegin in case of transient failure.  */
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+__builtin_tbegin_retry_null (int retry)
+{
+  int cc, i = 0;
+
+  while ((cc = __builtin_tbegin(0)) == _HTM_TBEGIN_TRANSIENT
+         && i++ < retry)
+    __builtin_tx_assist(i);
+
+  return cc;
+}
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+__builtin_tbegin_retry_tdb (void *tdb, int retry)
+{
+  int cc, i = 0;
+
+  while ((cc = __builtin_tbegin(tdb)) == _HTM_TBEGIN_TRANSIENT
+         && i++ < retry)
+    __builtin_tx_assist(i);
+
+  return cc;
+}
+
+#define __builtin_tbegin_retry(tdb, retry) \
+  (__builtin_constant_p(tdb == 0) && tdb == 0 ? \
+   __builtin_tbegin_retry_null(retry) : \
+   __builtin_tbegin_retry_tdb(tdb, retry))
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+__builtin_tbegin_retry_nofloat_null (int retry)
+{
+  int cc, i = 0;
+
+  while ((cc = __builtin_tbegin_nofloat(0)) == _HTM_TBEGIN_TRANSIENT
+         && i++ < retry)
+    __builtin_tx_assist(i);
+
+  return cc;
+}
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+__builtin_tbegin_retry_nofloat_tdb (void *tdb, int retry)
+{
+  int cc, i = 0;
+
+  while ((cc = __builtin_tbegin_nofloat(tdb)) == _HTM_TBEGIN_TRANSIENT
+         && i++ < retry)
+    __builtin_tx_assist(i);
+
+  return cc;
+}
+
+#define __builtin_tbegin_retry_nofloat(tdb, retry) \
+  (__builtin_constant_p(tdb == 0) && tdb == 0 ? \
+   __builtin_tbegin_retry_nofloat_null(retry) : \
+   __builtin_tbegin_retry_nofloat_tdb(tdb, retry))
+
+#endif /* __s390__ */
+
+#endif /* __HTMINTRIN_H */
diff --git a/24.0.3/clang-include/htmxlintrin.h b/24.0.3/clang-include/htmxlintrin.h
new file mode 100644
index 0000000..c7571ec
--- /dev/null
+++ b/24.0.3/clang-include/htmxlintrin.h
@@ -0,0 +1,363 @@
+/*===---- htmxlintrin.h - XL compiler HTM execution intrinsics ------------===*\
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+\*===----------------------------------------------------------------------===*/
+
+#ifndef __HTMXLINTRIN_H
+#define __HTMXLINTRIN_H
+
+#ifndef __HTM__
+#error "HTM instruction set not enabled"
+#endif
+
+#include <htmintrin.h>
+
+#ifdef __powerpc__
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define _TEXASR_PTR(TM_BUF) \
+  ((texasr_t *)((TM_BUF)+0))
+#define _TEXASRU_PTR(TM_BUF) \
+  ((texasru_t *)((TM_BUF)+0))
+#define _TEXASRL_PTR(TM_BUF) \
+  ((texasrl_t *)((TM_BUF)+4))
+#define _TFIAR_PTR(TM_BUF) \
+  ((tfiar_t *)((TM_BUF)+8))
+
+typedef char TM_buff_type[16];
+
+/* This macro can be used to determine whether a transaction was successfully
+   started from the __TM_begin() and __TM_simple_begin() intrinsic functions
+   below.  */
+#define _HTM_TBEGIN_STARTED     1
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_simple_begin (void)
+{
+  if (__builtin_expect (__builtin_tbegin (0), 1))
+    return _HTM_TBEGIN_STARTED;
+  return 0;
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_begin (void* const TM_buff)
+{
+  *_TEXASRL_PTR (TM_buff) = 0;
+  if (__builtin_expect (__builtin_tbegin (0), 1))
+    return _HTM_TBEGIN_STARTED;
+#ifdef __powerpc64__
+  *_TEXASR_PTR (TM_buff) = __builtin_get_texasr ();
+#else
+  *_TEXASRU_PTR (TM_buff) = __builtin_get_texasru ();
+  *_TEXASRL_PTR (TM_buff) = __builtin_get_texasr ();
+#endif
+  *_TFIAR_PTR (TM_buff) = __builtin_get_tfiar ();
+  return 0;
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_end (void)
+{
+  if (__builtin_expect (__builtin_tend (0), 1))
+    return 1;
+  return 0;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_abort (void)
+{
+  __builtin_tabort (0);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_named_abort (unsigned char const code)
+{
+  __builtin_tabort (code);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_resume (void)
+{
+  __builtin_tresume ();
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_suspend (void)
+{
+  __builtin_tsuspend ();
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_is_user_abort (void* const TM_buff)
+{
+  texasru_t texasru = *_TEXASRU_PTR (TM_buff);
+  return _TEXASRU_ABORT (texasru);
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_is_named_user_abort (void* const TM_buff, unsigned char *code)
+{
+  texasru_t texasru = *_TEXASRU_PTR (TM_buff);
+
+  *code = _TEXASRU_FAILURE_CODE (texasru);
+  return _TEXASRU_ABORT (texasru);
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_is_illegal (void* const TM_buff)
+{
+  texasru_t texasru = *_TEXASRU_PTR (TM_buff);
+  return _TEXASRU_DISALLOWED (texasru);
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_is_footprint_exceeded (void* const TM_buff)
+{
+  texasru_t texasru = *_TEXASRU_PTR (TM_buff);
+  return _TEXASRU_FOOTPRINT_OVERFLOW (texasru);
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_nesting_depth (void* const TM_buff)
+{
+  texasrl_t texasrl;
+
+  if (_HTM_STATE (__builtin_ttest ()) == _HTM_NONTRANSACTIONAL)
+    {
+      texasrl = *_TEXASRL_PTR (TM_buff);
+      if (!_TEXASR_FAILURE_SUMMARY (texasrl))
+        texasrl = 0;
+    }
+  else
+    texasrl = (texasrl_t) __builtin_get_texasr ();
+
+  return _TEXASR_TRANSACTION_LEVEL (texasrl);
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_is_nested_too_deep(void* const TM_buff)
+{
+  texasru_t texasru = *_TEXASRU_PTR (TM_buff);
+  return _TEXASRU_NESTING_OVERFLOW (texasru);
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_is_conflict(void* const TM_buff)
+{
+  texasru_t texasru = *_TEXASRU_PTR (TM_buff);
+  /* Return TEXASR bits 11 (Self-Induced Conflict) through
+     14 (Translation Invalidation Conflict).  */
+  return (_TEXASRU_EXTRACT_BITS (texasru, 14, 4)) ? 1 : 0;
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_is_failure_persistent(void* const TM_buff)
+{
+  texasru_t texasru = *_TEXASRU_PTR (TM_buff);
+  return _TEXASRU_FAILURE_PERSISTENT (texasru);
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_failure_address(void* const TM_buff)
+{
+  return *_TFIAR_PTR (TM_buff);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_failure_code(void* const TM_buff)
+{
+  return *_TEXASR_PTR (TM_buff);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __powerpc__ */
+
+#ifdef __s390__
+
+#include <stdint.h>
+
+/* These intrinsics are being made available for compatibility with
+   the IBM XL compiler.  For documentation please see the "z/OS XL
+   C/C++ Programming Guide" publically available on the web.  */
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_simple_begin ()
+{
+  return __builtin_tbegin_nofloat (0);
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_begin (void* const tdb)
+{
+  return __builtin_tbegin_nofloat (tdb);
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_end ()
+{
+  return __builtin_tend ();
+}
+
+static __inline void __attribute__((__always_inline__))
+__TM_abort ()
+{
+  return __builtin_tabort (_HTM_FIRST_USER_ABORT_CODE);
+}
+
+static __inline void __attribute__((__always_inline__, __nodebug__))
+__TM_named_abort (unsigned char const code)
+{
+  return __builtin_tabort ((int)_HTM_FIRST_USER_ABORT_CODE + code);
+}
+
+static __inline void __attribute__((__always_inline__, __nodebug__))
+__TM_non_transactional_store (void* const addr, long long const value)
+{
+  __builtin_non_tx_store ((uint64_t*)addr, (uint64_t)value);
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_nesting_depth (void* const tdb_ptr)
+{
+  int depth = __builtin_tx_nesting_depth ();
+  struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr;
+
+  if (depth != 0)
+    return depth;
+
+  if (tdb->format != 1)
+    return 0;
+  return tdb->nesting_depth;
+}
+
+/* Transaction failure diagnostics */
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_is_user_abort (void* const tdb_ptr)
+{
+  struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr;
+
+  if (tdb->format != 1)
+    return 0;
+
+  return !!(tdb->abort_code >= _HTM_FIRST_USER_ABORT_CODE);
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_is_named_user_abort (void* const tdb_ptr, unsigned char* code)
+{
+  struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr;
+
+  if (tdb->format != 1)
+    return 0;
+
+  if (tdb->abort_code >= _HTM_FIRST_USER_ABORT_CODE)
+    {
+      *code = tdb->abort_code - _HTM_FIRST_USER_ABORT_CODE;
+      return 1;
+    }
+  return 0;
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_is_illegal (void* const tdb_ptr)
+{
+  struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr;
+
+  return (tdb->format == 1
+	  && (tdb->abort_code == 4 /* unfiltered program interruption */
+	      || tdb->abort_code == 11 /* restricted instruction */));
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_is_footprint_exceeded (void* const tdb_ptr)
+{
+  struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr;
+
+  return (tdb->format == 1
+	  && (tdb->abort_code == 7 /* fetch overflow */
+	      || tdb->abort_code == 8 /* store overflow */));
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_is_nested_too_deep (void* const tdb_ptr)
+{
+  struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr;
+
+  return tdb->format == 1 && tdb->abort_code == 13; /* depth exceeded */
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_is_conflict (void* const tdb_ptr)
+{
+  struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr;
+
+  return (tdb->format == 1
+	  && (tdb->abort_code == 9 /* fetch conflict */
+	      || tdb->abort_code == 10 /* store conflict */));
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_is_failure_persistent (long const result)
+{
+  return result == _HTM_TBEGIN_PERSISTENT;
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_failure_address (void* const tdb_ptr)
+{
+  struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr;
+  return tdb->atia;
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_failure_code (void* const tdb_ptr)
+{
+  struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr;
+
+  return tdb->abort_code;
+}
+
+#endif /* __s390__ */
+
+#endif /* __HTMXLINTRIN_H  */
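
The XL-compatible interface above is typically used as a begin/commit pair with a software fallback path. A hedged sketch, not part of the patched header, assuming the PowerPC variant of the interface built with -mhtm (the function and variable names here are examples only):

    /* Illustrative sketch only; not part of htmxlintrin.h above. */
    #include <htmxlintrin.h>

    long increment_transactionally(long *counter)
    {
      TM_buff_type tm_buff;

      if (__TM_begin(tm_buff) == _HTM_TBEGIN_STARTED) {
        ++*counter;          /* transactional body */
        __TM_end();
        return 1;            /* committed */
      }
      /* Fallback: the failure cause recorded in tm_buff can be inspected,
         e.g. with __TM_is_failure_persistent(tm_buff), before retrying or
         falling back to a lock. */
      return 0;
    }
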
diff --git a/24.0.3/clang-include/ia32intrin.h b/24.0.3/clang-include/ia32intrin.h
new file mode 100644
index 0000000..5adf3f1
--- /dev/null
+++ b/24.0.3/clang-include/ia32intrin.h
@@ -0,0 +1,101 @@
+/* ===-------- ia32intrin.h ---------------------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __X86INTRIN_H
+#error "Never use <ia32intrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __IA32INTRIN_H
+#define __IA32INTRIN_H
+
+#ifdef __x86_64__
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+__readeflags(void)
+{
+  unsigned long long __res = 0;
+  __asm__ __volatile__ ("pushf\n\t"
+                        "popq %0\n"
+                        :"=r"(__res)
+                        :
+                        :
+                       );
+  return __res;
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+__writeeflags(unsigned long long __f)
+{
+  __asm__ __volatile__ ("pushq %0\n\t"
+                        "popf\n"
+                        :
+                        :"r"(__f)
+                        :"flags"
+                       );
+}
+
+#else /* !__x86_64__ */
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+__readeflags(void)
+{
+  unsigned int __res = 0;
+  __asm__ __volatile__ ("pushf\n\t"
+                        "popl %0\n"
+                        :"=r"(__res)
+                        :
+                        :
+                       );
+  return __res;
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+__writeeflags(unsigned int __f)
+{
+  __asm__ __volatile__ ("pushl %0\n\t"
+                        "popf\n"
+                        :
+                        :"r"(__f)
+                        :"flags"
+                       );
+}
+#endif /* !__x86_64__ */
+
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+__rdpmc(int __A) {
+  return __builtin_ia32_rdpmc(__A);
+}
+
+/* __rdtsc */
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+__rdtsc(void) {
+  return __builtin_ia32_rdtsc();
+}
+
+/* __rdtscp */
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+__rdtscp(unsigned int *__A) {
+  return __builtin_ia32_rdtscp(__A);
+}
+
+#define _rdtsc() __rdtsc()
+
+#endif /* __IA32INTRIN_H */
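
The __rdtsc/__rdtscp wrappers above read the processor time-stamp counter. A rough timing sketch, not part of the patched header, where the cycle counts are only meaningful relative to each other:

    /* Illustrative sketch only; not part of ia32intrin.h above. */
    #include <x86intrin.h>
    #include <stdio.h>

    int main(void) {
      unsigned long long start = __rdtsc();
      volatile int sink = 0;
      for (int i = 0; i < 1000; ++i)
        sink += i;                           /* region being timed */
      unsigned long long end = __rdtsc();
      printf("elapsed cycles: %llu\n", end - start);
      return 0;
    }
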
diff --git a/24.0.3/clang-include/immintrin.h b/24.0.3/clang-include/immintrin.h
new file mode 100644
index 0000000..f3c6d19
--- /dev/null
+++ b/24.0.3/clang-include/immintrin.h
@@ -0,0 +1,172 @@
+/*===---- immintrin.h - Intel intrinsics -----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#define __IMMINTRIN_H
+
+#include <mmintrin.h>
+
+#include <xmmintrin.h>
+
+#include <emmintrin.h>
+
+#include <pmmintrin.h>
+
+#include <tmmintrin.h>
+
+#include <smmintrin.h>
+
+#include <wmmintrin.h>
+
+#include <avxintrin.h>
+
+#include <avx2intrin.h>
+
+/* The 256-bit versions of functions in f16cintrin.h.
+   Intel documents these as being in immintrin.h, and
+   they depend on typedefs from avxintrin.h. */
+
+#define _mm256_cvtps_ph(a, imm) __extension__ ({ \
+ (__m128i)__builtin_ia32_vcvtps2ph256((__v8sf)(__m256)(a), (imm)); })
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__, __target__("f16c")))
+_mm256_cvtph_ps(__m128i __a)
+{
+  return (__m256)__builtin_ia32_vcvtph2ps256((__v8hi)__a);
+}
+
+#include <bmiintrin.h>
+
+#include <bmi2intrin.h>
+
+#include <lzcntintrin.h>
+
+#include <fmaintrin.h>
+
+#include <avx512fintrin.h>
+
+#include <avx512vlintrin.h>
+
+#include <avx512bwintrin.h>
+
+#include <avx512cdintrin.h>
+
+#include <avx512dqintrin.h>
+
+#include <avx512vlbwintrin.h>
+
+#include <avx512vldqintrin.h>
+
+#include <avx512erintrin.h>
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
+_rdrand16_step(unsigned short *__p)
+{
+  return __builtin_ia32_rdrand16_step(__p);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
+_rdrand32_step(unsigned int *__p)
+{
+  return __builtin_ia32_rdrand32_step(__p);
+}
+
+#ifdef __x86_64__
+static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
+_rdrand64_step(unsigned long long *__p)
+{
+  return __builtin_ia32_rdrand64_step(__p);
+}
+#endif
+
+#ifdef __x86_64__
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
+_readfsbase_u32(void)
+{
+  return __builtin_ia32_rdfsbase32();
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
+_readfsbase_u64(void)
+{
+  return __builtin_ia32_rdfsbase64();
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
+_readgsbase_u32(void)
+{
+  return __builtin_ia32_rdgsbase32();
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
+_readgsbase_u64(void)
+{
+  return __builtin_ia32_rdgsbase64();
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
+_writefsbase_u32(unsigned int __V)
+{
+  return __builtin_ia32_wrfsbase32(__V);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
+_writefsbase_u64(unsigned long long __V)
+{
+  return __builtin_ia32_wrfsbase64(__V);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
+_writegsbase_u32(unsigned int __V)
+{
+  return __builtin_ia32_wrgsbase32(__V);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
+_writegsbase_u64(unsigned long long __V)
+{
+  return __builtin_ia32_wrgsbase64(__V);
+}
+#endif
+
+#include <rtmintrin.h>
+
+#include <xtestintrin.h>
+
+#include <shaintrin.h>
+
+#include <fxsrintrin.h>
+
+#include <xsaveintrin.h>
+
+#include <xsaveoptintrin.h>
+
+#include <xsavecintrin.h>
+
+#include <xsavesintrin.h>
+
+/* Some intrinsics inside adxintrin.h are available only on processors with ADX,
+ * whereas others are also available at all times. */
+#include <adxintrin.h>
+
+#endif /* __IMMINTRIN_H */
diff --git a/24.0.3/clang-include/inttypes.h b/24.0.3/clang-include/inttypes.h
new file mode 100644
index 0000000..3d59d14
--- /dev/null
+++ b/24.0.3/clang-include/inttypes.h
@@ -0,0 +1,102 @@
+/*===---- inttypes.h - Standard header for integer printf macros ----------===*\
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+\*===----------------------------------------------------------------------===*/
+
+#ifndef __CLANG_INTTYPES_H
+#define __CLANG_INTTYPES_H
+
+#include_next <inttypes.h>
+
+#if defined(_MSC_VER) && _MSC_VER < 1900
+/* MSVC headers define int32_t as int, but PRIx32 as "lx" instead of "x".
+ * This triggers format warnings, so fix it up here. */
+#undef PRId32
+#undef PRIdLEAST32
+#undef PRIdFAST32
+#undef PRIi32
+#undef PRIiLEAST32
+#undef PRIiFAST32
+#undef PRIo32
+#undef PRIoLEAST32
+#undef PRIoFAST32
+#undef PRIu32
+#undef PRIuLEAST32
+#undef PRIuFAST32
+#undef PRIx32
+#undef PRIxLEAST32
+#undef PRIxFAST32
+#undef PRIX32
+#undef PRIXLEAST32
+#undef PRIXFAST32
+
+#undef SCNd32
+#undef SCNdLEAST32
+#undef SCNdFAST32
+#undef SCNi32
+#undef SCNiLEAST32
+#undef SCNiFAST32
+#undef SCNo32
+#undef SCNoLEAST32
+#undef SCNoFAST32
+#undef SCNu32
+#undef SCNuLEAST32
+#undef SCNuFAST32
+#undef SCNx32
+#undef SCNxLEAST32
+#undef SCNxFAST32
+
+#define PRId32 "d"
+#define PRIdLEAST32 "d"
+#define PRIdFAST32 "d"
+#define PRIi32 "i"
+#define PRIiLEAST32 "i"
+#define PRIiFAST32 "i"
+#define PRIo32 "o"
+#define PRIoLEAST32 "o"
+#define PRIoFAST32 "o"
+#define PRIu32 "u"
+#define PRIuLEAST32 "u"
+#define PRIuFAST32 "u"
+#define PRIx32 "x"
+#define PRIxLEAST32 "x"
+#define PRIxFAST32 "x"
+#define PRIX32 "X"
+#define PRIXLEAST32 "X"
+#define PRIXFAST32 "X"
+
+#define SCNd32 "d"
+#define SCNdLEAST32 "d"
+#define SCNdFAST32 "d"
+#define SCNi32 "i"
+#define SCNiLEAST32 "i"
+#define SCNiFAST32 "i"
+#define SCNo32 "o"
+#define SCNoLEAST32 "o"
+#define SCNoFAST32 "o"
+#define SCNu32 "u"
+#define SCNuLEAST32 "u"
+#define SCNuFAST32 "u"
+#define SCNx32 "x"
+#define SCNxLEAST32 "x"
+#define SCNxFAST32 "x"
+#endif
+
+#endif /* __CLANG_INTTYPES_H */
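
The PRI*/SCN* fix-ups above exist so that format strings stay in sync with the fixed-width integer types. A small sketch of the intended usage, not part of the patched header:

    /* Illustrative sketch only; not part of inttypes.h above. */
    #include <inttypes.h>
    #include <stdio.h>

    int main(void) {
      int32_t x = 42;
      uint64_t y = 1ULL << 40;
      /* The macros expand to the correct length/conversion specifiers. */
      printf("x = %" PRId32 ", y = %" PRIu64 "\n", x, y);
      return 0;
    }
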
diff --git a/24.0.3/clang-include/iso646.h b/24.0.3/clang-include/iso646.h
new file mode 100644
index 0000000..dca13c5
--- /dev/null
+++ b/24.0.3/clang-include/iso646.h
@@ -0,0 +1,43 @@
+/*===---- iso646.h - Standard header for alternate spellings of operators---===
+ *
+ * Copyright (c) 2008 Eli Friedman
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __ISO646_H
+#define __ISO646_H
+
+#ifndef __cplusplus
+#define and    &&
+#define and_eq &=
+#define bitand &
+#define bitor  |
+#define compl  ~
+#define not    !
+#define not_eq !=
+#define or     ||
+#define or_eq  |=
+#define xor    ^
+#define xor_eq ^=
+#endif
+
+#endif /* __ISO646_H */
diff --git a/24.0.3/clang-include/limits.h b/24.0.3/clang-include/limits.h
new file mode 100644
index 0000000..f04187c
--- /dev/null
+++ b/24.0.3/clang-include/limits.h
@@ -0,0 +1,118 @@
+/*===---- limits.h - Standard header for integer sizes --------------------===*\
+ *
+ * Copyright (c) 2009 Chris Lattner
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+\*===----------------------------------------------------------------------===*/
+
+#ifndef __CLANG_LIMITS_H
+#define __CLANG_LIMITS_H
+
+/* The system's limits.h may, in turn, try to #include_next GCC's limits.h.
+   Avert this #include_next madness. */
+#if defined __GNUC__ && !defined _GCC_LIMITS_H_
+#define _GCC_LIMITS_H_
+#endif
+
+/* System headers include a number of constants from POSIX in <limits.h>.
+   Include it if we're hosted. */
+#if __STDC_HOSTED__ && __has_include_next(<limits.h>)
+#include_next <limits.h>
+#endif
+
+/* Many system headers try to "help us out" by defining these.  No really, we
+   know how big each datatype is. */
+#undef  SCHAR_MIN
+#undef  SCHAR_MAX
+#undef  UCHAR_MAX
+#undef  SHRT_MIN
+#undef  SHRT_MAX
+#undef  USHRT_MAX
+#undef  INT_MIN
+#undef  INT_MAX
+#undef  UINT_MAX
+#undef  LONG_MIN
+#undef  LONG_MAX
+#undef  ULONG_MAX
+
+#undef  CHAR_BIT
+#undef  CHAR_MIN
+#undef  CHAR_MAX
+
+/* C90/99 5.2.4.2.1 */
+#define SCHAR_MAX __SCHAR_MAX__
+#define SHRT_MAX  __SHRT_MAX__
+#define INT_MAX   __INT_MAX__
+#define LONG_MAX  __LONG_MAX__
+
+#define SCHAR_MIN (-__SCHAR_MAX__-1)
+#define SHRT_MIN  (-__SHRT_MAX__ -1)
+#define INT_MIN   (-__INT_MAX__  -1)
+#define LONG_MIN  (-__LONG_MAX__ -1L)
+
+#define UCHAR_MAX (__SCHAR_MAX__*2  +1)
+#define USHRT_MAX (__SHRT_MAX__ *2  +1)
+#define UINT_MAX  (__INT_MAX__  *2U +1U)
+#define ULONG_MAX (__LONG_MAX__ *2UL+1UL)
+
+#ifndef MB_LEN_MAX
+#define MB_LEN_MAX 1
+#endif
+
+#define CHAR_BIT  __CHAR_BIT__
+
+#ifdef __CHAR_UNSIGNED__  /* -funsigned-char */
+#define CHAR_MIN 0
+#define CHAR_MAX UCHAR_MAX
+#else
+#define CHAR_MIN SCHAR_MIN
+#define CHAR_MAX __SCHAR_MAX__
+#endif
+
+/* C99 5.2.4.2.1: Added long long.
+   C++11 18.3.3.2: same contents as the Standard C Library header <limits.h>.
+ */
+#if __STDC_VERSION__ >= 199901L || __cplusplus >= 201103L
+
+#undef  LLONG_MIN
+#undef  LLONG_MAX
+#undef  ULLONG_MAX
+
+#define LLONG_MAX  __LONG_LONG_MAX__
+#define LLONG_MIN  (-__LONG_LONG_MAX__-1LL)
+#define ULLONG_MAX (__LONG_LONG_MAX__*2ULL+1ULL)
+#endif
+
+/* LONG_LONG_MIN/LONG_LONG_MAX/ULONG_LONG_MAX are a GNU extension.  It's too bad
+   that we don't have something like #pragma poison that could be used to
+   deprecate a macro - the code should just use LLONG_MAX and friends.
+ */
+#if defined(__GNU_LIBRARY__) ? defined(__USE_GNU) : !defined(__STRICT_ANSI__)
+
+#undef   LONG_LONG_MIN
+#undef   LONG_LONG_MAX
+#undef   ULONG_LONG_MAX
+
+#define LONG_LONG_MAX  __LONG_LONG_MAX__
+#define LONG_LONG_MIN  (-__LONG_LONG_MAX__-1LL)
+#define ULONG_LONG_MAX (__LONG_LONG_MAX__*2ULL+1ULL)
+#endif
+
+#endif /* __CLANG_LIMITS_H */
diff --git a/24.0.3/clang-include/lzcntintrin.h b/24.0.3/clang-include/lzcntintrin.h
new file mode 100644
index 0000000..4c00e42
--- /dev/null
+++ b/24.0.3/clang-include/lzcntintrin.h
@@ -0,0 +1,68 @@
+/*===---- lzcntintrin.h - LZCNT intrinsics ---------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
+#error "Never use <lzcntintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __LZCNTINTRIN_H
+#define __LZCNTINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("lzcnt")))
+
+static __inline__ unsigned short __DEFAULT_FN_ATTRS
+__lzcnt16(unsigned short __X)
+{
+  return __X ? __builtin_clzs(__X) : 16;
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__lzcnt32(unsigned int __X)
+{
+  return __X ? __builtin_clz(__X) : 32;
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_lzcnt_u32(unsigned int __X)
+{
+  return __X ? __builtin_clz(__X) : 32;
+}
+
+#ifdef __x86_64__
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__lzcnt64(unsigned long long __X)
+{
+  return __X ? __builtin_clzll(__X) : 64;
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_lzcnt_u64(unsigned long long __X)
+{
+  return __X ? __builtin_clzll(__X) : 64;
+}
+#endif
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __LZCNTINTRIN_H */
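
Unlike a raw __builtin_clz, whose result is undefined for a zero input, the wrappers above return the operand width for zero. A short sketch, not part of the patched header, assuming a compiler invocation with -mlzcnt:

    /* Illustrative sketch only; not part of lzcntintrin.h above. */
    #include <x86intrin.h>
    #include <stdio.h>

    int main(void) {
      printf("%u\n", _lzcnt_u32(0));          /* 32: zero input is well defined */
      printf("%u\n", _lzcnt_u32(1u << 20));   /* 11 leading zero bits */
      return 0;
    }
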
diff --git a/24.0.3/clang-include/mm3dnow.h b/24.0.3/clang-include/mm3dnow.h
new file mode 100644
index 0000000..cb93faf
--- /dev/null
+++ b/24.0.3/clang-include/mm3dnow.h
@@ -0,0 +1,171 @@
+/*===---- mm3dnow.h - 3DNow! intrinsics ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef _MM3DNOW_H_INCLUDED
+#define _MM3DNOW_H_INCLUDED
+
+#include <mmintrin.h>
+#include <prfchwintrin.h>
+
+typedef float __v2sf __attribute__((__vector_size__(8)));
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("3dnow")))
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_m_femms() {
+  __builtin_ia32_femms();
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pavgusb(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pavgusb((__v8qi)__m1, (__v8qi)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pf2id(__m64 __m) {
+  return (__m64)__builtin_ia32_pf2id((__v2sf)__m);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfacc(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfacc((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfadd(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfadd((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfcmpeq(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfcmpeq((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfcmpge(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfcmpge((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfcmpgt(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfcmpgt((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfmax(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfmax((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfmin(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfmin((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfmul(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfmul((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfrcp(__m64 __m) {
+  return (__m64)__builtin_ia32_pfrcp((__v2sf)__m);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfrcpit1(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfrcpit1((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfrcpit2(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfrcpit2((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfrsqrt(__m64 __m) {
+  return (__m64)__builtin_ia32_pfrsqrt((__v2sf)__m);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfrsqrtit1(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfrsqit1((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfsub(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfsub((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfsubr(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfsubr((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pi2fd(__m64 __m) {
+  return (__m64)__builtin_ia32_pi2fd((__v2si)__m);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pmulhrw(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pmulhrw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+/* Handle the 3dnowa instructions here. */
+#undef __DEFAULT_FN_ATTRS
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("3dnowa")))
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pf2iw(__m64 __m) {
+  return (__m64)__builtin_ia32_pf2iw((__v2sf)__m);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfnacc(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfnacc((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfpnacc(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfpnacc((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pi2fw(__m64 __m) {
+  return (__m64)__builtin_ia32_pi2fw((__v2si)__m);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pswapdsf(__m64 __m) {
+  return (__m64)__builtin_ia32_pswapdsf((__v2sf)__m);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pswapdsi(__m64 __m) {
+  return (__m64)__builtin_ia32_pswapdsi((__v2si)__m);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
diff --git a/24.0.3/clang-include/mm_malloc.h b/24.0.3/clang-include/mm_malloc.h
new file mode 100644
index 0000000..305afd3
--- /dev/null
+++ b/24.0.3/clang-include/mm_malloc.h
@@ -0,0 +1,75 @@
+/*===---- mm_malloc.h - Allocating and Freeing Aligned Memory Blocks -------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __MM_MALLOC_H
+#define __MM_MALLOC_H
+
+#include <stdlib.h>
+
+#ifdef _WIN32
+#include <malloc.h>
+#else
+#ifndef __cplusplus
+extern int posix_memalign(void **__memptr, size_t __alignment, size_t __size);
+#else
+// Some systems (e.g. those with GNU libc) declare posix_memalign with an
+// exception specifier. Via an "egregious workaround" in
+// Sema::CheckEquivalentExceptionSpec, Clang accepts the following as a valid
+// redeclaration of glibc's declaration.
+extern "C" int posix_memalign(void **__memptr, size_t __alignment, size_t __size);
+#endif
+#endif
+
+#if !(defined(_WIN32) && defined(_mm_malloc))
+static __inline__ void *__attribute__((__always_inline__, __nodebug__,
+                                       __malloc__))
+_mm_malloc(size_t __size, size_t __align)
+{
+  if (__align == 1) {
+    return malloc(__size);
+  }
+
+  if (!(__align & (__align - 1)) && __align < sizeof(void *))
+    __align = sizeof(void *);
+
+  void *__mallocedMemory;
+#if defined(__MINGW32__)
+  __mallocedMemory = __mingw_aligned_malloc(__size, __align);
+#elif defined(_WIN32)
+  __mallocedMemory = _aligned_malloc(__size, __align);
+#else
+  if (posix_memalign(&__mallocedMemory, __align, __size))
+    return 0;
+#endif
+
+  return __mallocedMemory;
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_free(void *__p)
+{
+  free(__p);
+}
+#endif
+
+#endif /* __MM_MALLOC_H */
diff --git a/24.0.3/clang-include/mmintrin.h b/24.0.3/clang-include/mmintrin.h
new file mode 100644
index 0000000..162cb1a
--- /dev/null
+++ b/24.0.3/clang-include/mmintrin.h
@@ -0,0 +1,503 @@
+/*===---- mmintrin.h - MMX intrinsics --------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __MMINTRIN_H
+#define __MMINTRIN_H
+
+typedef long long __m64 __attribute__((__vector_size__(8)));
+
+typedef int __v2si __attribute__((__vector_size__(8)));
+typedef short __v4hi __attribute__((__vector_size__(8)));
+typedef char __v8qi __attribute__((__vector_size__(8)));
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("mmx")))
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_empty(void)
+{
+    __builtin_ia32_emms();
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cvtsi32_si64(int __i)
+{
+    return (__m64)__builtin_ia32_vec_init_v2si(__i, 0);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_cvtsi64_si32(__m64 __m)
+{
+    return __builtin_ia32_vec_ext_v2si((__v2si)__m, 0);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cvtsi64_m64(long long __i)
+{
+    return (__m64)__i;
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm_cvtm64_si64(__m64 __m)
+{
+    return (long long)__m;
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_packs_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_packsswb((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_packs_pi32(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_packssdw((__v2si)__m1, (__v2si)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_packs_pu16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_packuswb((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_unpackhi_pi8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_punpckhbw((__v8qi)__m1, (__v8qi)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_punpckhwd((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_unpackhi_pi32(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_punpckhdq((__v2si)__m1, (__v2si)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_punpcklbw((__v8qi)__m1, (__v8qi)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_punpcklwd((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_unpacklo_pi32(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_punpckldq((__v2si)__m1, (__v2si)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_add_pi8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_paddb((__v8qi)__m1, (__v8qi)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_add_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_paddw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_add_pi32(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_paddd((__v2si)__m1, (__v2si)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_adds_pi8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_paddsb((__v8qi)__m1, (__v8qi)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_adds_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_paddsw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_adds_pu8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_paddusb((__v8qi)__m1, (__v8qi)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_adds_pu16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_paddusw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_sub_pi8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_psubb((__v8qi)__m1, (__v8qi)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_sub_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_psubw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_sub_pi32(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_psubd((__v2si)__m1, (__v2si)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_subs_pi8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_psubsb((__v8qi)__m1, (__v8qi)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_subs_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_psubsw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_subs_pu8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_psubusb((__v8qi)__m1, (__v8qi)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_subs_pu16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_psubusw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_madd_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_pmaddwd((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_mulhi_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_pmulhw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_mullo_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_pmullw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_sll_pi16(__m64 __m, __m64 __count)
+{
+    return (__m64)__builtin_ia32_psllw((__v4hi)__m, __count);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_slli_pi16(__m64 __m, int __count)
+{
+    return (__m64)__builtin_ia32_psllwi((__v4hi)__m, __count);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_sll_pi32(__m64 __m, __m64 __count)
+{
+    return (__m64)__builtin_ia32_pslld((__v2si)__m, __count);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_slli_pi32(__m64 __m, int __count)
+{
+    return (__m64)__builtin_ia32_pslldi((__v2si)__m, __count);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_sll_si64(__m64 __m, __m64 __count)
+{
+    return (__m64)__builtin_ia32_psllq(__m, __count);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_slli_si64(__m64 __m, int __count)
+{
+    return (__m64)__builtin_ia32_psllqi(__m, __count);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_sra_pi16(__m64 __m, __m64 __count)
+{
+    return (__m64)__builtin_ia32_psraw((__v4hi)__m, __count);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_srai_pi16(__m64 __m, int __count)
+{
+    return (__m64)__builtin_ia32_psrawi((__v4hi)__m, __count);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_sra_pi32(__m64 __m, __m64 __count)
+{
+    return (__m64)__builtin_ia32_psrad((__v2si)__m, __count);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_srai_pi32(__m64 __m, int __count)
+{
+    return (__m64)__builtin_ia32_psradi((__v2si)__m, __count);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_srl_pi16(__m64 __m, __m64 __count)
+{
+    return (__m64)__builtin_ia32_psrlw((__v4hi)__m, __count);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_srli_pi16(__m64 __m, int __count)
+{
+    return (__m64)__builtin_ia32_psrlwi((__v4hi)__m, __count);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_srl_pi32(__m64 __m, __m64 __count)
+{
+    return (__m64)__builtin_ia32_psrld((__v2si)__m, __count);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_srli_pi32(__m64 __m, int __count)
+{
+    return (__m64)__builtin_ia32_psrldi((__v2si)__m, __count);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_srl_si64(__m64 __m, __m64 __count)
+{
+    return (__m64)__builtin_ia32_psrlq(__m, __count);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_srli_si64(__m64 __m, int __count)
+{
+    return (__m64)__builtin_ia32_psrlqi(__m, __count);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_and_si64(__m64 __m1, __m64 __m2)
+{
+    return __builtin_ia32_pand(__m1, __m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_andnot_si64(__m64 __m1, __m64 __m2)
+{
+    return __builtin_ia32_pandn(__m1, __m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_or_si64(__m64 __m1, __m64 __m2)
+{
+    return __builtin_ia32_por(__m1, __m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_xor_si64(__m64 __m1, __m64 __m2)
+{
+    return __builtin_ia32_pxor(__m1, __m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_pcmpeqb((__v8qi)__m1, (__v8qi)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_pcmpeqw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_pcmpeqd((__v2si)__m1, (__v2si)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_pcmpgtb((__v8qi)__m1, (__v8qi)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_pcmpgtw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_pcmpgtd((__v2si)__m1, (__v2si)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_setzero_si64(void)
+{
+    return (__m64){ 0LL };
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_set_pi32(int __i1, int __i0)
+{
+    return (__m64)__builtin_ia32_vec_init_v2si(__i0, __i1);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_set_pi16(short __s3, short __s2, short __s1, short __s0)
+{
+    return (__m64)__builtin_ia32_vec_init_v4hi(__s0, __s1, __s2, __s3);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2,
+            char __b1, char __b0)
+{
+    return (__m64)__builtin_ia32_vec_init_v8qi(__b0, __b1, __b2, __b3,
+                                               __b4, __b5, __b6, __b7);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_set1_pi32(int __i)
+{
+    return _mm_set_pi32(__i, __i);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_set1_pi16(short __w)
+{
+    return _mm_set_pi16(__w, __w, __w, __w);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_set1_pi8(char __b)
+{
+    return _mm_set_pi8(__b, __b, __b, __b, __b, __b, __b, __b);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_setr_pi32(int __i0, int __i1)
+{
+    return _mm_set_pi32(__i1, __i0);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_setr_pi16(short __w0, short __w1, short __w2, short __w3)
+{
+    return _mm_set_pi16(__w3, __w2, __w1, __w0);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5,
+             char __b6, char __b7)
+{
+    return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+/* Aliases for compatibility. */
+#define _m_empty _mm_empty
+#define _m_from_int _mm_cvtsi32_si64
+#define _m_from_int64 _mm_cvtsi64_m64
+#define _m_to_int _mm_cvtsi64_si32
+#define _m_to_int64 _mm_cvtm64_si64
+#define _m_packsswb _mm_packs_pi16
+#define _m_packssdw _mm_packs_pi32
+#define _m_packuswb _mm_packs_pu16
+#define _m_punpckhbw _mm_unpackhi_pi8
+#define _m_punpckhwd _mm_unpackhi_pi16
+#define _m_punpckhdq _mm_unpackhi_pi32
+#define _m_punpcklbw _mm_unpacklo_pi8
+#define _m_punpcklwd _mm_unpacklo_pi16
+#define _m_punpckldq _mm_unpacklo_pi32
+#define _m_paddb _mm_add_pi8
+#define _m_paddw _mm_add_pi16
+#define _m_paddd _mm_add_pi32
+#define _m_paddsb _mm_adds_pi8
+#define _m_paddsw _mm_adds_pi16
+#define _m_paddusb _mm_adds_pu8
+#define _m_paddusw _mm_adds_pu16
+#define _m_psubb _mm_sub_pi8
+#define _m_psubw _mm_sub_pi16
+#define _m_psubd _mm_sub_pi32
+#define _m_psubsb _mm_subs_pi8
+#define _m_psubsw _mm_subs_pi16
+#define _m_psubusb _mm_subs_pu8
+#define _m_psubusw _mm_subs_pu16
+#define _m_pmaddwd _mm_madd_pi16
+#define _m_pmulhw _mm_mulhi_pi16
+#define _m_pmullw _mm_mullo_pi16
+#define _m_psllw _mm_sll_pi16
+#define _m_psllwi _mm_slli_pi16
+#define _m_pslld _mm_sll_pi32
+#define _m_pslldi _mm_slli_pi32
+#define _m_psllq _mm_sll_si64
+#define _m_psllqi _mm_slli_si64
+#define _m_psraw _mm_sra_pi16
+#define _m_psrawi _mm_srai_pi16
+#define _m_psrad _mm_sra_pi32
+#define _m_psradi _mm_srai_pi32
+#define _m_psrlw _mm_srl_pi16
+#define _m_psrlwi _mm_srli_pi16
+#define _m_psrld _mm_srl_pi32
+#define _m_psrldi _mm_srli_pi32
+#define _m_psrlq _mm_srl_si64
+#define _m_psrlqi _mm_srli_si64
+#define _m_pand _mm_and_si64
+#define _m_pandn _mm_andnot_si64
+#define _m_por _mm_or_si64
+#define _m_pxor _mm_xor_si64
+#define _m_pcmpeqb _mm_cmpeq_pi8
+#define _m_pcmpeqw _mm_cmpeq_pi16
+#define _m_pcmpeqd _mm_cmpeq_pi32
+#define _m_pcmpgtb _mm_cmpgt_pi8
+#define _m_pcmpgtw _mm_cmpgt_pi16
+#define _m_pcmpgtd _mm_cmpgt_pi32
+
+#endif /* __MMINTRIN_H */
+
diff --git a/24.0.3/clang-include/module.modulemap b/24.0.3/clang-include/module.modulemap
new file mode 100644
index 0000000..b147e89
--- /dev/null
+++ b/24.0.3/clang-include/module.modulemap
@@ -0,0 +1,171 @@
+module _Builtin_intrinsics [system] [extern_c] {
+  explicit module altivec {
+    requires altivec
+    header "altivec.h"
+  }
+
+  explicit module arm {
+    requires arm
+
+    explicit module acle {
+      header "arm_acle.h"
+      export *
+    }
+
+    explicit module neon {
+      requires neon
+      header "arm_neon.h"
+      export *
+    }
+  }
+
+  explicit module intel {
+    requires x86
+    export *
+
+    header "immintrin.h"
+    header "x86intrin.h"
+
+    explicit module mm_malloc {
+      header "mm_malloc.h"
+      export * // note: for <stdlib.h> dependency
+    }
+
+    explicit module cpuid {
+      header "cpuid.h"
+    }
+
+    explicit module mmx {
+      header "mmintrin.h"
+    }
+
+    explicit module f16c {
+      header "f16cintrin.h"
+    }
+
+    explicit module sse {
+      export mmx
+      export sse2 // note: for hackish <emmintrin.h> dependency
+      header "xmmintrin.h"
+    }
+
+    explicit module sse2 {
+      export sse
+      header "emmintrin.h"
+    }
+
+    explicit module sse3 {
+      export sse2
+      header "pmmintrin.h"
+    }
+
+    explicit module ssse3 {
+      export sse3
+      header "tmmintrin.h"
+    }
+
+    explicit module sse4_1 {
+      export ssse3
+      header "smmintrin.h"
+    }
+
+    explicit module sse4_2 {
+      export sse4_1
+      header "nmmintrin.h"
+    }
+
+    explicit module sse4a {
+      export sse3
+      header "ammintrin.h"
+    }
+
+    explicit module avx {
+      export sse4_2
+      header "avxintrin.h"
+    }
+
+    explicit module avx2 {
+      export avx
+      header "avx2intrin.h"
+    }
+
+    explicit module avx512f {
+      export avx2
+      header "avx512fintrin.h"
+    }
+
+    explicit module avx512er {
+      header "avx512erintrin.h"
+    }
+
+    explicit module bmi {
+      header "bmiintrin.h"
+    }
+
+    explicit module bmi2 {
+      header "bmi2intrin.h"
+    }
+
+    explicit module fma {
+      header "fmaintrin.h"
+    }
+
+    explicit module fma4 {
+      export sse3
+      header "fma4intrin.h"
+    }
+
+    explicit module lzcnt {
+      header "lzcntintrin.h"
+    }
+
+    explicit module popcnt {
+      header "popcntintrin.h"
+    }
+
+    explicit module mm3dnow {
+      header "mm3dnow.h"
+    }
+
+    explicit module xop {
+      export fma4
+      header "xopintrin.h"
+    }
+
+    explicit module aes_pclmul {
+      header "wmmintrin.h"
+      export aes
+      export pclmul
+    }
+
+    explicit module aes {
+      header "__wmmintrin_aes.h"
+    }
+
+    explicit module pclmul {
+      header "__wmmintrin_pclmul.h"
+    }
+  }
+
+  explicit module systemz {
+    requires systemz
+    export *
+
+    header "s390intrin.h"
+
+    explicit module htm {
+      requires htm
+      header "htmintrin.h"
+      header "htmxlintrin.h"
+    }
+
+    explicit module zvector {
+      requires zvector, vx
+      header "vecintrin.h"
+    }
+  }
+}
+
+module _Builtin_stddef_max_align_t [system] [extern_c] {
+  header "__stddef_max_align_t.h"
+}
diff --git a/24.0.3/clang-include/nmmintrin.h b/24.0.3/clang-include/nmmintrin.h
new file mode 100644
index 0000000..57fec15
--- /dev/null
+++ b/24.0.3/clang-include/nmmintrin.h
@@ -0,0 +1,30 @@
+/*===---- nmmintrin.h - SSE4 intrinsics ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef _NMMINTRIN_H
+#define _NMMINTRIN_H
+
+/* To match expectations of gcc we put the sse4.2 definitions into smmintrin.h,
+   just include it now then.  */
+#include <smmintrin.h>
+#endif /* _NMMINTRIN_H */
diff --git a/24.0.3/clang-include/pmmintrin.h b/24.0.3/clang-include/pmmintrin.h
new file mode 100644
index 0000000..0ff9409
--- /dev/null
+++ b/24.0.3/clang-include/pmmintrin.h
@@ -0,0 +1,116 @@
+/*===---- pmmintrin.h - SSE3 intrinsics ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __PMMINTRIN_H
+#define __PMMINTRIN_H
+
+#include <emmintrin.h>
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse3")))
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_lddqu_si128(__m128i const *__p)
+{
+  return (__m128i)__builtin_ia32_lddqu((char const *)__p);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_addsub_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_addsubps(__a, __b);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_hadd_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_haddps(__a, __b);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_hsub_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_hsubps(__a, __b);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_movehdup_ps(__m128 __a)
+{
+  return __builtin_shufflevector(__a, __a, 1, 1, 3, 3);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_moveldup_ps(__m128 __a)
+{
+  return __builtin_shufflevector(__a, __a, 0, 0, 2, 2);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_addsub_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_addsubpd(__a, __b);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_hadd_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_haddpd(__a, __b);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_hsub_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_hsubpd(__a, __b);
+}
+
+#define        _mm_loaddup_pd(dp)        _mm_load1_pd(dp)
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_movedup_pd(__m128d __a)
+{
+  return __builtin_shufflevector(__a, __a, 0, 0);
+}
+
+#define _MM_DENORMALS_ZERO_ON   (0x0040)
+#define _MM_DENORMALS_ZERO_OFF  (0x0000)
+
+#define _MM_DENORMALS_ZERO_MASK (0x0040)
+
+#define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)
+#define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_monitor(void const *__p, unsigned __extensions, unsigned __hints)
+{
+  __builtin_ia32_monitor((void *)__p, __extensions, __hints);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mwait(unsigned __extensions, unsigned __hints)
+{
+  __builtin_ia32_mwait(__extensions, __hints);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __PMMINTRIN_H */
diff --git a/24.0.3/clang-include/popcntintrin.h b/24.0.3/clang-include/popcntintrin.h
new file mode 100644
index 0000000..6fcda65
--- /dev/null
+++ b/24.0.3/clang-include/popcntintrin.h
@@ -0,0 +1,58 @@
+/*===---- popcntintrin.h - POPCNT intrinsics -------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef _POPCNTINTRIN_H
+#define _POPCNTINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("popcnt")))
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_popcnt_u32(unsigned int __A)
+{
+  return __builtin_popcount(__A);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_popcnt32(int __A)
+{
+  return __builtin_popcount(__A);
+}
+
+#ifdef __x86_64__
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm_popcnt_u64(unsigned long long __A)
+{
+  return __builtin_popcountll(__A);
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS
+_popcnt64(long long __A)
+{
+  return __builtin_popcountll(__A);
+}
+#endif /* __x86_64__ */
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* _POPCNTINTRIN_H */
diff --git a/24.0.3/clang-include/prfchwintrin.h b/24.0.3/clang-include/prfchwintrin.h
new file mode 100644
index 0000000..ba02857
--- /dev/null
+++ b/24.0.3/clang-include/prfchwintrin.h
@@ -0,0 +1,45 @@
+/*===---- prfchwintrin.h - PREFETCHW intrinsic -----------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined(__X86INTRIN_H) && !defined(_MM3DNOW_H_INCLUDED)
+#error "Never use <prfchwintrin.h> directly; include <x86intrin.h> or <mm3dnow.h> instead."
+#endif
+
+#ifndef __PRFCHWINTRIN_H
+#define __PRFCHWINTRIN_H
+
+#if defined(__PRFCHW__) || defined(__3dNOW__)
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_m_prefetch(void *__P)
+{
+  __builtin_prefetch (__P, 0, 3 /* _MM_HINT_T0 */);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_m_prefetchw(void *__P)
+{
+  __builtin_prefetch (__P, 1, 3 /* _MM_HINT_T0 */);
+}
+#endif
+
+#endif /* __PRFCHWINTRIN_H */
diff --git a/24.0.3/clang-include/rdseedintrin.h b/24.0.3/clang-include/rdseedintrin.h
new file mode 100644
index 0000000..421f4ea
--- /dev/null
+++ b/24.0.3/clang-include/rdseedintrin.h
@@ -0,0 +1,56 @@
+/*===---- rdseedintrin.h - RDSEED intrinsics -------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __X86INTRIN_H
+#error "Never use <rdseedintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __RDSEEDINTRIN_H
+#define __RDSEEDINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("rdseed")))
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_rdseed16_step(unsigned short *__p)
+{
+  return __builtin_ia32_rdseed16_step(__p);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_rdseed32_step(unsigned int *__p)
+{
+  return __builtin_ia32_rdseed32_step(__p);
+}
+
+#ifdef __x86_64__
+static __inline__ int __DEFAULT_FN_ATTRS
+_rdseed64_step(unsigned long long *__p)
+{
+  return __builtin_ia32_rdseed64_step(__p);
+}
+#endif
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __RDSEEDINTRIN_H */
diff --git a/24.0.3/clang-include/rtmintrin.h b/24.0.3/clang-include/rtmintrin.h
new file mode 100644
index 0000000..e6a58d7
--- /dev/null
+++ b/24.0.3/clang-include/rtmintrin.h
@@ -0,0 +1,59 @@
+/*===---- rtmintrin.h - RTM intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <rtmintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __RTMINTRIN_H
+#define __RTMINTRIN_H
+
+#define _XBEGIN_STARTED   (~0u)
+#define _XABORT_EXPLICIT  (1 << 0)
+#define _XABORT_RETRY     (1 << 1)
+#define _XABORT_CONFLICT  (1 << 2)
+#define _XABORT_CAPACITY  (1 << 3)
+#define _XABORT_DEBUG     (1 << 4)
+#define _XABORT_NESTED    (1 << 5)
+#define _XABORT_CODE(x)   (((x) >> 24) & 0xFF)
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("rtm")))
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_xbegin(void)
+{
+  return __builtin_ia32_xbegin();
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_xend(void)
+{
+  __builtin_ia32_xend();
+}
+
+#define _xabort(imm) __builtin_ia32_xabort((imm))
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __RTMINTRIN_H */
diff --git a/24.0.3/clang-include/s390intrin.h b/24.0.3/clang-include/s390intrin.h
new file mode 100644
index 0000000..d51274c
--- /dev/null
+++ b/24.0.3/clang-include/s390intrin.h
@@ -0,0 +1,39 @@
+/*===---- s390intrin.h - SystemZ intrinsics --------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __S390INTRIN_H
+#define __S390INTRIN_H
+
+#ifndef __s390__
+#error "<s390intrin.h> is for s390 only"
+#endif
+
+#ifdef __HTM__
+#include <htmintrin.h>
+#endif
+
+#ifdef __VEC__
+#include <vecintrin.h>
+#endif
+
+#endif /* __S390INTRIN_H*/
diff --git a/24.0.3/clang-include/shaintrin.h b/24.0.3/clang-include/shaintrin.h
new file mode 100644
index 0000000..9b5d218
--- /dev/null
+++ b/24.0.3/clang-include/shaintrin.h
@@ -0,0 +1,75 @@
+/*===---- shaintrin.h - SHA intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <shaintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __SHAINTRIN_H
+#define __SHAINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sha")))
+
+#define _mm_sha1rnds4_epu32(V1, V2, M) __extension__ ({ \
+  __builtin_ia32_sha1rnds4((__v4si)(__m128i)(V1), (__v4si)(__m128i)(V2), (M)); })
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sha1nexte_epu32(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_sha1nexte((__v4si)__X, (__v4si)__Y);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sha1msg1_epu32(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_sha1msg1((__v4si)__X, (__v4si)__Y);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sha1msg2_epu32(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_sha1msg2((__v4si)__X, (__v4si)__Y);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sha256rnds2_epu32(__m128i __X, __m128i __Y, __m128i __Z)
+{
+  return (__m128i)__builtin_ia32_sha256rnds2((__v4si)__X, (__v4si)__Y, (__v4si)__Z);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sha256msg1_epu32(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_sha256msg1((__v4si)__X, (__v4si)__Y);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sha256msg2_epu32(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_sha256msg2((__v4si)__X, (__v4si)__Y);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __SHAINTRIN_H */
diff --git a/24.0.3/clang-include/smmintrin.h b/24.0.3/clang-include/smmintrin.h
new file mode 100644
index 0000000..69ad07f
--- /dev/null
+++ b/24.0.3/clang-include/smmintrin.h
@@ -0,0 +1,508 @@
+/*===---- smmintrin.h - SSE4 intrinsics ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef _SMMINTRIN_H
+#define _SMMINTRIN_H
+
+#include <tmmintrin.h>
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.1")))
+
+/* SSE4 Rounding macros. */
+#define _MM_FROUND_TO_NEAREST_INT    0x00
+#define _MM_FROUND_TO_NEG_INF        0x01
+#define _MM_FROUND_TO_POS_INF        0x02
+#define _MM_FROUND_TO_ZERO           0x03
+#define _MM_FROUND_CUR_DIRECTION     0x04
+
+#define _MM_FROUND_RAISE_EXC         0x00
+#define _MM_FROUND_NO_EXC            0x08
+
+#define _MM_FROUND_NINT      (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT)
+#define _MM_FROUND_FLOOR     (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF)
+#define _MM_FROUND_CEIL      (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF)
+#define _MM_FROUND_TRUNC     (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO)
+#define _MM_FROUND_RINT      (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION)
+#define _MM_FROUND_NEARBYINT (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_ceil_ps(X)       _mm_round_ps((X), _MM_FROUND_CEIL)
+#define _mm_ceil_pd(X)       _mm_round_pd((X), _MM_FROUND_CEIL)
+#define _mm_ceil_ss(X, Y)    _mm_round_ss((X), (Y), _MM_FROUND_CEIL)
+#define _mm_ceil_sd(X, Y)    _mm_round_sd((X), (Y), _MM_FROUND_CEIL)
+
+#define _mm_floor_ps(X)      _mm_round_ps((X), _MM_FROUND_FLOOR)
+#define _mm_floor_pd(X)      _mm_round_pd((X), _MM_FROUND_FLOOR)
+#define _mm_floor_ss(X, Y)   _mm_round_ss((X), (Y), _MM_FROUND_FLOOR)
+#define _mm_floor_sd(X, Y)   _mm_round_sd((X), (Y), _MM_FROUND_FLOOR)
+
+#define _mm_round_ps(X, M) __extension__ ({ \
+  (__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M)); })
+
+#define _mm_round_ss(X, Y, M) __extension__ ({ \
+  (__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), \
+                                 (__v4sf)(__m128)(Y), (M)); })
+
+#define _mm_round_pd(X, M) __extension__ ({ \
+  (__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M)); })
+
+#define _mm_round_sd(X, Y, M) __extension__ ({ \
+  (__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), \
+                                  (__v2df)(__m128d)(Y), (M)); })
+
+/* SSE4 Packed Blending Intrinsics.  */
+#define _mm_blend_pd(V1, V2, M) __extension__ ({ \
+  (__m128d)__builtin_shufflevector((__v2df)(__m128d)(V1), \
+                                   (__v2df)(__m128d)(V2), \
+                                   (((M) & 0x01) ? 2 : 0), \
+                                   (((M) & 0x02) ? 3 : 1)); })
+
+#define _mm_blend_ps(V1, V2, M) __extension__ ({ \
+  (__m128)__builtin_shufflevector((__v4sf)(__m128)(V1), (__v4sf)(__m128)(V2), \
+                                  (((M) & 0x01) ? 4 : 0), \
+                                  (((M) & 0x02) ? 5 : 1), \
+                                  (((M) & 0x04) ? 6 : 2), \
+                                  (((M) & 0x08) ? 7 : 3)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_blendv_pd (__m128d __V1, __m128d __V2, __m128d __M)
+{
+  return (__m128d) __builtin_ia32_blendvpd ((__v2df)__V1, (__v2df)__V2,
+                                            (__v2df)__M);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_blendv_ps (__m128 __V1, __m128 __V2, __m128 __M)
+{
+  return (__m128) __builtin_ia32_blendvps ((__v4sf)__V1, (__v4sf)__V2,
+                                           (__v4sf)__M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M)
+{
+  return (__m128i) __builtin_ia32_pblendvb128 ((__v16qi)__V1, (__v16qi)__V2,
+                                               (__v16qi)__M);
+}
+
+#define _mm_blend_epi16(V1, V2, M) __extension__ ({ \
+  (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(V1), \
+                                   (__v8hi)(__m128i)(V2), \
+                                   (((M) & 0x01) ?  8 : 0), \
+                                   (((M) & 0x02) ?  9 : 1), \
+                                   (((M) & 0x04) ? 10 : 2), \
+                                   (((M) & 0x08) ? 11 : 3), \
+                                   (((M) & 0x10) ? 12 : 4), \
+                                   (((M) & 0x20) ? 13 : 5), \
+                                   (((M) & 0x40) ? 14 : 6), \
+                                   (((M) & 0x80) ? 15 : 7)); })
+
+/* SSE4 Dword Multiply Instructions.  */
+static __inline__  __m128i __DEFAULT_FN_ATTRS
+_mm_mullo_epi32 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) ((__v4si)__V1 * (__v4si)__V2);
+}
+
+static __inline__  __m128i __DEFAULT_FN_ATTRS
+_mm_mul_epi32 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_pmuldq128 ((__v4si)__V1, (__v4si)__V2);
+}
+
+/* SSE4 Floating Point Dot Product Instructions.  */
+#define _mm_dp_ps(X, Y, M) __extension__ ({ \
+  (__m128) __builtin_ia32_dpps((__v4sf)(__m128)(X), \
+                               (__v4sf)(__m128)(Y), (M)); })
+
+#define _mm_dp_pd(X, Y, M) __extension__ ({\
+  (__m128d) __builtin_ia32_dppd((__v2df)(__m128d)(X), \
+                                (__v2df)(__m128d)(Y), (M)); })
+
+/* SSE4 Streaming Load Hint Instruction.  */
+static __inline__  __m128i __DEFAULT_FN_ATTRS
+_mm_stream_load_si128 (__m128i const *__V)
+{
+  return (__m128i) __builtin_ia32_movntdqa ((const __v2di *) __V);
+}
+
+/* SSE4 Packed Integer Min/Max Instructions.  */
+static __inline__  __m128i __DEFAULT_FN_ATTRS
+_mm_min_epi8 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_pminsb128 ((__v16qi) __V1, (__v16qi) __V2);
+}
+
+static __inline__  __m128i __DEFAULT_FN_ATTRS
+_mm_max_epi8 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_pmaxsb128 ((__v16qi) __V1, (__v16qi) __V2);
+}
+
+static __inline__  __m128i __DEFAULT_FN_ATTRS
+_mm_min_epu16 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_pminuw128 ((__v8hi) __V1, (__v8hi) __V2);
+}
+
+static __inline__  __m128i __DEFAULT_FN_ATTRS
+_mm_max_epu16 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_pmaxuw128 ((__v8hi) __V1, (__v8hi) __V2);
+}
+
+static __inline__  __m128i __DEFAULT_FN_ATTRS
+_mm_min_epi32 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_pminsd128 ((__v4si) __V1, (__v4si) __V2);
+}
+
+static __inline__  __m128i __DEFAULT_FN_ATTRS
+_mm_max_epi32 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_pmaxsd128 ((__v4si) __V1, (__v4si) __V2);
+}
+
+static __inline__  __m128i __DEFAULT_FN_ATTRS
+_mm_min_epu32 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_pminud128((__v4si) __V1, (__v4si) __V2);
+}
+
+static __inline__  __m128i __DEFAULT_FN_ATTRS
+_mm_max_epu32 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_pmaxud128((__v4si) __V1, (__v4si) __V2);
+}
+
+/* SSE4 Insertion and Extraction from XMM Register Instructions.  */
+#define _mm_insert_ps(X, Y, N) __builtin_ia32_insertps128((X), (Y), (N))
+#define _mm_extract_ps(X, N) (__extension__                      \
+                              ({ union { int __i; float __f; } __t;  \
+                                 __v4sf __a = (__v4sf)(__m128)(X);       \
+                                 __t.__f = __a[(N) & 3];                 \
+                                 __t.__i;}))
+
+/* Miscellaneous insert and extract macros.  */
+/* Extract a single-precision float from X at index N into D.  */
+#define _MM_EXTRACT_FLOAT(D, X, N) (__extension__ ({ __v4sf __a = (__v4sf)(X); \
+                                                    (D) = __a[N]; }))
+
+/* Or together 2 sets of indexes (X and Y) with the zeroing bits (Z) to create
+   an index suitable for _mm_insert_ps.  */
+#define _MM_MK_INSERTPS_NDX(X, Y, Z) (((X) << 6) | ((Y) << 4) | (Z))
+
+/* Extract a float from X at index N into the first index of the return.  */
+#define _MM_PICK_OUT_PS(X, N) _mm_insert_ps (_mm_setzero_ps(), (X),   \
+                                             _MM_MK_INSERTPS_NDX((N), 0, 0x0e))
+
+/* Insert int into packed integer array at index.  */
+#define _mm_insert_epi8(X, I, N) (__extension__                           \
+                                  ({ __v16qi __a = (__v16qi)(__m128i)(X); \
+                                     __a[(N) & 15] = (I);                 \
+                                     __a;}))
+#define _mm_insert_epi32(X, I, N) (__extension__                         \
+                                   ({ __v4si __a = (__v4si)(__m128i)(X); \
+                                      __a[(N) & 3] = (I);                \
+                                      __a;}))
+#ifdef __x86_64__
+#define _mm_insert_epi64(X, I, N) (__extension__                         \
+                                   ({ __v2di __a = (__v2di)(__m128i)(X); \
+                                      __a[(N) & 1] = (I);                \
+                                      __a;}))
+#endif /* __x86_64__ */
+
+/* Extract int from packed integer array at index.  This returns the element
+ * as a zero extended value, so it is unsigned.
+ */
+#define _mm_extract_epi8(X, N) (__extension__                           \
+                                ({ __v16qi __a = (__v16qi)(__m128i)(X); \
+                                   (int)(unsigned char) __a[(N) & 15];}))
+#define _mm_extract_epi32(X, N) (__extension__                         \
+                                 ({ __v4si __a = (__v4si)(__m128i)(X); \
+                                    (int)__a[(N) & 3];}))
+#ifdef __x86_64__
+#define _mm_extract_epi64(X, N) (__extension__                         \
+                                 ({ __v2di __a = (__v2di)(__m128i)(X); \
+                                    (long long)__a[(N) & 1];}))
+#endif /* __x86_64 */
+
+/* SSE4 128-bit Packed Integer Comparisons.  */
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_testz_si128(__m128i __M, __m128i __V)
+{
+  return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_testc_si128(__m128i __M, __m128i __V)
+{
+  return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_testnzc_si128(__m128i __M, __m128i __V)
+{
+  return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V);
+}
+
+#define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_cmpeq_epi32((V), (V)))
+#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V))
+#define _mm_test_all_zeros(M, V) _mm_testz_si128 ((M), (V))
+
+/* SSE4 64-bit Packed Integer Comparisons.  */
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cmpeq_epi64(__m128i __V1, __m128i __V2)
+{
+  return (__m128i)((__v2di)__V1 == (__v2di)__V2);
+}
+
+/* SSE4 Packed Integer Sign-Extension.  */
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepi8_epi16(__m128i __V)
+{
+  /* This function always performs a signed extension, but __v16qi is a char
+     which may be signed or unsigned, so use __v16qs. */
+  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepi8_epi32(__m128i __V)
+{
+  /* This function always performs a signed extension, but __v16qi is a char
+     which may be signed or unsigned, so use __v16qs. */
+  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4si);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepi8_epi64(__m128i __V)
+{
+  /* This function always performs a signed extension, but __v16qi is a char
+     which may be signed or unsigned, so use __v16qs. */
+  typedef signed char __v16qs __attribute__((__vector_size__(16)));
+  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1), __v2di);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepi16_epi32(__m128i __V)
+{
+  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4si);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepi16_epi64(__m128i __V)
+{
+  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1), __v2di);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepi32_epi64(__m128i __V)
+{
+  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v4si)__V, (__v4si)__V, 0, 1), __v2di);
+}
+
+/* SSE4 Packed Integer Zero-Extension.  */
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepu8_epi16(__m128i __V)
+{
+  return (__m128i) __builtin_ia32_pmovzxbw128((__v16qi) __V);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepu8_epi32(__m128i __V)
+{
+  return (__m128i) __builtin_ia32_pmovzxbd128((__v16qi)__V);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepu8_epi64(__m128i __V)
+{
+  return (__m128i) __builtin_ia32_pmovzxbq128((__v16qi)__V);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepu16_epi32(__m128i __V)
+{
+  return (__m128i) __builtin_ia32_pmovzxwd128((__v8hi)__V);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepu16_epi64(__m128i __V)
+{
+  return (__m128i) __builtin_ia32_pmovzxwq128((__v8hi)__V);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepu32_epi64(__m128i __V)
+{
+  return (__m128i) __builtin_ia32_pmovzxdq128((__v4si)__V);
+}
+
+/* SSE4 Pack with Unsigned Saturation.  */
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_packus_epi32(__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_packusdw128((__v4si)__V1, (__v4si)__V2);
+}
+
+/* SSE4 Multiple Packed Sums of Absolute Difference.  */
+#define _mm_mpsadbw_epu8(X, Y, M) __extension__ ({ \
+  (__m128i) __builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \
+                                      (__v16qi)(__m128i)(Y), (M)); })
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_minpos_epu16(__m128i __V)
+{
+  return (__m128i) __builtin_ia32_phminposuw128((__v8hi)__V);
+}
+
+/* Handle the sse4.2 definitions here. */
+
+/* These definitions are normally in nmmintrin.h, but gcc puts them in here
+   so we'll do the same.  */
+
+#undef __DEFAULT_FN_ATTRS
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.2")))
+
+/* These specify the type of data that we're comparing.  */
+#define _SIDD_UBYTE_OPS                 0x00
+#define _SIDD_UWORD_OPS                 0x01
+#define _SIDD_SBYTE_OPS                 0x02
+#define _SIDD_SWORD_OPS                 0x03
+
+/* These specify the type of comparison operation.  */
+#define _SIDD_CMP_EQUAL_ANY             0x00
+#define _SIDD_CMP_RANGES                0x04
+#define _SIDD_CMP_EQUAL_EACH            0x08
+#define _SIDD_CMP_EQUAL_ORDERED         0x0c
+
+/* These macros specify the polarity of the operation.  */
+#define _SIDD_POSITIVE_POLARITY         0x00
+#define _SIDD_NEGATIVE_POLARITY         0x10
+#define _SIDD_MASKED_POSITIVE_POLARITY  0x20
+#define _SIDD_MASKED_NEGATIVE_POLARITY  0x30
+
+/* These macros are used in _mm_cmpXstri() to specify the return.  */
+#define _SIDD_LEAST_SIGNIFICANT         0x00
+#define _SIDD_MOST_SIGNIFICANT          0x40
+
+/* These macros are used in _mm_cmpXstrm() to specify the return.  */
+#define _SIDD_BIT_MASK                  0x00
+#define _SIDD_UNIT_MASK                 0x40
+
+/* SSE4.2 Packed Comparison Intrinsics.  */
+#define _mm_cmpistrm(A, B, M) \
+  (__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A), \
+                                       (__v16qi)(__m128i)(B), (int)(M))
+#define _mm_cmpistri(A, B, M) \
+  (int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A), \
+                                   (__v16qi)(__m128i)(B), (int)(M))
+
+#define _mm_cmpestrm(A, LA, B, LB, M) \
+  (__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(A), (int)(LA), \
+                                       (__v16qi)(__m128i)(B), (int)(LB), \
+                                       (int)(M))
+#define _mm_cmpestri(A, LA, B, LB, M) \
+  (int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(A), (int)(LA), \
+                                   (__v16qi)(__m128i)(B), (int)(LB), \
+                                   (int)(M))
+
+/* SSE4.2 Packed Comparison Intrinsics and EFlag Reading.  */
+#define _mm_cmpistra(A, B, M) \
+  (int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A), \
+                                    (__v16qi)(__m128i)(B), (int)(M))
+#define _mm_cmpistrc(A, B, M) \
+  (int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A), \
+                                    (__v16qi)(__m128i)(B), (int)(M))
+#define _mm_cmpistro(A, B, M) \
+  (int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A), \
+                                    (__v16qi)(__m128i)(B), (int)(M))
+#define _mm_cmpistrs(A, B, M) \
+  (int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A), \
+                                    (__v16qi)(__m128i)(B), (int)(M))
+#define _mm_cmpistrz(A, B, M) \
+  (int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A), \
+                                    (__v16qi)(__m128i)(B), (int)(M))
+
+#define _mm_cmpestra(A, LA, B, LB, M) \
+  (int)__builtin_ia32_pcmpestria128((__v16qi)(__m128i)(A), (int)(LA), \
+                                    (__v16qi)(__m128i)(B), (int)(LB), \
+                                    (int)(M))
+#define _mm_cmpestrc(A, LA, B, LB, M) \
+  (int)__builtin_ia32_pcmpestric128((__v16qi)(__m128i)(A), (int)(LA), \
+                                    (__v16qi)(__m128i)(B), (int)(LB), \
+                                    (int)(M))
+#define _mm_cmpestro(A, LA, B, LB, M) \
+  (int)__builtin_ia32_pcmpestrio128((__v16qi)(__m128i)(A), (int)(LA), \
+                                    (__v16qi)(__m128i)(B), (int)(LB), \
+                                    (int)(M))
+#define _mm_cmpestrs(A, LA, B, LB, M) \
+  (int)__builtin_ia32_pcmpestris128((__v16qi)(__m128i)(A), (int)(LA), \
+                                    (__v16qi)(__m128i)(B), (int)(LB), \
+                                    (int)(M))
+#define _mm_cmpestrz(A, LA, B, LB, M) \
+  (int)__builtin_ia32_pcmpestriz128((__v16qi)(__m128i)(A), (int)(LA), \
+                                    (__v16qi)(__m128i)(B), (int)(LB), \
+                                    (int)(M))
+
+/* SSE4.2 Compare Packed Data -- Greater Than.  */
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cmpgt_epi64(__m128i __V1, __m128i __V2)
+{
+  return (__m128i)((__v2di)__V1 > (__v2di)__V2);
+}
+
+/* SSE4.2 Accumulate CRC32.  */
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_mm_crc32_u8(unsigned int __C, unsigned char __D)
+{
+  return __builtin_ia32_crc32qi(__C, __D);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_mm_crc32_u16(unsigned int __C, unsigned short __D)
+{
+  return __builtin_ia32_crc32hi(__C, __D);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_mm_crc32_u32(unsigned int __C, unsigned int __D)
+{
+  return __builtin_ia32_crc32si(__C, __D);
+}
+
+#ifdef __x86_64__
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_mm_crc32_u64(unsigned long long __C, unsigned long long __D)
+{
+  return __builtin_ia32_crc32di(__C, __D);
+}
+#endif /* __x86_64__ */
+
+#undef __DEFAULT_FN_ATTRS
+
+#ifdef __POPCNT__
+#include <popcntintrin.h>
+#endif
+
+#endif /* _SMMINTRIN_H */
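Illustrative usage of the SSE4.2 CRC32 wrappers declared above (not part of the prebuilt header); it assumes an SSE4.2-capable host and a build with -msse4.2:

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <nmmintrin.h>   /* SSE4.2 intrinsics; build with -msse4.2 */

/* Accumulate the CRC-32C (Castagnoli polynomial) of a byte buffer one byte at
 * a time through _mm_crc32_u8, using the conventional initial value and final
 * XOR. */
static uint32_t crc32c_bytes(const void *buf, size_t len)
{
    const unsigned char *p = buf;
    uint32_t crc = 0xFFFFFFFFu;
    for (size_t i = 0; i < len; ++i)
        crc = _mm_crc32_u8(crc, p[i]);
    return crc ^ 0xFFFFFFFFu;
}

int main(void)
{
    const char msg[] = "123456789";
    printf("crc32c(\"%s\") = 0x%08x\n", msg, crc32c_bytes(msg, strlen(msg)));
    return 0;
}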
diff --git a/24.0.3/clang-include/stdalign.h b/24.0.3/clang-include/stdalign.h
new file mode 100644
index 0000000..3738d12
--- /dev/null
+++ b/24.0.3/clang-include/stdalign.h
@@ -0,0 +1,35 @@
+/*===---- stdalign.h - Standard header for alignment ------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __STDALIGN_H
+#define __STDALIGN_H
+
+#ifndef __cplusplus
+#define alignas _Alignas
+#define alignof _Alignof
+#endif
+
+#define __alignas_is_defined 1
+#define __alignof_is_defined 1
+
+#endif /* __STDALIGN_H */
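A small C11 sketch (illustrative only) of the alignas/alignof spellings this header maps onto the _Alignas/_Alignof keywords:

#include <stdalign.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* alignas raises an object's alignment; alignof queries a type's alignment. */
    alignas(16) unsigned char buffer[64];
    printf("alignof(double) = %zu\n", alignof(double));
    printf("buffer %% 16 = %u\n", (unsigned)((uintptr_t)buffer % 16)); /* 0 */
    return 0;
}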
diff --git a/24.0.3/clang-include/stdarg.h b/24.0.3/clang-include/stdarg.h
new file mode 100644
index 0000000..a57e183
--- /dev/null
+++ b/24.0.3/clang-include/stdarg.h
@@ -0,0 +1,52 @@
+/*===---- stdarg.h - Variable argument handling ----------------------------===
+ *
+ * Copyright (c) 2008 Eli Friedman
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __STDARG_H
+#define __STDARG_H
+
+#ifndef _VA_LIST
+typedef __builtin_va_list va_list;
+#define _VA_LIST
+#endif
+#define va_start(ap, param) __builtin_va_start(ap, param)
+#define va_end(ap)          __builtin_va_end(ap)
+#define va_arg(ap, type)    __builtin_va_arg(ap, type)
+
+/* GCC always defines __va_copy, but does not define va_copy unless in c99 mode
+ * or -ansi is not specified, since it was not part of C90.
+ */
+#define __va_copy(d,s) __builtin_va_copy(d,s)
+
+#if __STDC_VERSION__ >= 199901L || __cplusplus >= 201103L || !defined(__STRICT_ANSI__)
+#define va_copy(dest, src)  __builtin_va_copy(dest, src)
+#endif
+
+/* Hack required to make standard headers work, at least on Ubuntu */
+#ifndef __GNUC_VA_LIST
+#define __GNUC_VA_LIST 1
+#endif
+typedef __builtin_va_list __gnuc_va_list;
+
+#endif /* __STDARG_H */
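An illustrative variadic-function sketch using the va_* macros defined above (nothing here is specific to the prebuilts):

#include <stdarg.h>
#include <stdio.h>

/* Sum a fixed count of int arguments pulled off the variadic list. */
static int sum_ints(int count, ...)
{
    va_list ap;
    va_start(ap, count);
    int total = 0;
    for (int i = 0; i < count; ++i)
        total += va_arg(ap, int);
    va_end(ap);
    return total;
}

int main(void)
{
    printf("%d\n", sum_ints(3, 1, 2, 3));  /* prints 6 */
    return 0;
}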
diff --git a/24.0.3/clang-include/stdatomic.h b/24.0.3/clang-include/stdatomic.h
new file mode 100644
index 0000000..e037987
--- /dev/null
+++ b/24.0.3/clang-include/stdatomic.h
@@ -0,0 +1,190 @@
+/*===---- stdatomic.h - Standard header for atomic types and operations -----===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CLANG_STDATOMIC_H
+#define __CLANG_STDATOMIC_H
+
+/* If we're hosted, fall back to the system's stdatomic.h. FreeBSD, for
+ * example, already has a Clang-compatible stdatomic.h header.
+ */
+#if __STDC_HOSTED__ && __has_include_next(<stdatomic.h>)
+# include_next <stdatomic.h>
+#else
+
+#include <stddef.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* 7.17.1 Introduction */
+
+#define ATOMIC_BOOL_LOCK_FREE       __GCC_ATOMIC_BOOL_LOCK_FREE
+#define ATOMIC_CHAR_LOCK_FREE       __GCC_ATOMIC_CHAR_LOCK_FREE
+#define ATOMIC_CHAR16_T_LOCK_FREE   __GCC_ATOMIC_CHAR16_T_LOCK_FREE
+#define ATOMIC_CHAR32_T_LOCK_FREE   __GCC_ATOMIC_CHAR32_T_LOCK_FREE
+#define ATOMIC_WCHAR_T_LOCK_FREE    __GCC_ATOMIC_WCHAR_T_LOCK_FREE
+#define ATOMIC_SHORT_LOCK_FREE      __GCC_ATOMIC_SHORT_LOCK_FREE
+#define ATOMIC_INT_LOCK_FREE        __GCC_ATOMIC_INT_LOCK_FREE
+#define ATOMIC_LONG_LOCK_FREE       __GCC_ATOMIC_LONG_LOCK_FREE
+#define ATOMIC_LLONG_LOCK_FREE      __GCC_ATOMIC_LLONG_LOCK_FREE
+#define ATOMIC_POINTER_LOCK_FREE    __GCC_ATOMIC_POINTER_LOCK_FREE
+
+/* 7.17.2 Initialization */
+
+#define ATOMIC_VAR_INIT(value) (value)
+#define atomic_init __c11_atomic_init
+
+/* 7.17.3 Order and consistency */
+
+typedef enum memory_order {
+  memory_order_relaxed = __ATOMIC_RELAXED,
+  memory_order_consume = __ATOMIC_CONSUME,
+  memory_order_acquire = __ATOMIC_ACQUIRE,
+  memory_order_release = __ATOMIC_RELEASE,
+  memory_order_acq_rel = __ATOMIC_ACQ_REL,
+  memory_order_seq_cst = __ATOMIC_SEQ_CST
+} memory_order;
+
+#define kill_dependency(y) (y)
+
+/* 7.17.4 Fences */
+
+/* These should be provided by the libc implementation. */
+void atomic_thread_fence(memory_order);
+void atomic_signal_fence(memory_order);
+
+#define atomic_thread_fence(order) __c11_atomic_thread_fence(order)
+#define atomic_signal_fence(order) __c11_atomic_signal_fence(order)
+
+/* 7.17.5 Lock-free property */
+
+#define atomic_is_lock_free(obj) __c11_atomic_is_lock_free(sizeof(*(obj)))
+
+/* 7.17.6 Atomic integer types */
+
+#ifdef __cplusplus
+typedef _Atomic(bool)               atomic_bool;
+#else
+typedef _Atomic(_Bool)              atomic_bool;
+#endif
+typedef _Atomic(char)               atomic_char;
+typedef _Atomic(signed char)        atomic_schar;
+typedef _Atomic(unsigned char)      atomic_uchar;
+typedef _Atomic(short)              atomic_short;
+typedef _Atomic(unsigned short)     atomic_ushort;
+typedef _Atomic(int)                atomic_int;
+typedef _Atomic(unsigned int)       atomic_uint;
+typedef _Atomic(long)               atomic_long;
+typedef _Atomic(unsigned long)      atomic_ulong;
+typedef _Atomic(long long)          atomic_llong;
+typedef _Atomic(unsigned long long) atomic_ullong;
+typedef _Atomic(uint_least16_t)     atomic_char16_t;
+typedef _Atomic(uint_least32_t)     atomic_char32_t;
+typedef _Atomic(wchar_t)            atomic_wchar_t;
+typedef _Atomic(int_least8_t)       atomic_int_least8_t;
+typedef _Atomic(uint_least8_t)      atomic_uint_least8_t;
+typedef _Atomic(int_least16_t)      atomic_int_least16_t;
+typedef _Atomic(uint_least16_t)     atomic_uint_least16_t;
+typedef _Atomic(int_least32_t)      atomic_int_least32_t;
+typedef _Atomic(uint_least32_t)     atomic_uint_least32_t;
+typedef _Atomic(int_least64_t)      atomic_int_least64_t;
+typedef _Atomic(uint_least64_t)     atomic_uint_least64_t;
+typedef _Atomic(int_fast8_t)        atomic_int_fast8_t;
+typedef _Atomic(uint_fast8_t)       atomic_uint_fast8_t;
+typedef _Atomic(int_fast16_t)       atomic_int_fast16_t;
+typedef _Atomic(uint_fast16_t)      atomic_uint_fast16_t;
+typedef _Atomic(int_fast32_t)       atomic_int_fast32_t;
+typedef _Atomic(uint_fast32_t)      atomic_uint_fast32_t;
+typedef _Atomic(int_fast64_t)       atomic_int_fast64_t;
+typedef _Atomic(uint_fast64_t)      atomic_uint_fast64_t;
+typedef _Atomic(intptr_t)           atomic_intptr_t;
+typedef _Atomic(uintptr_t)          atomic_uintptr_t;
+typedef _Atomic(size_t)             atomic_size_t;
+typedef _Atomic(ptrdiff_t)          atomic_ptrdiff_t;
+typedef _Atomic(intmax_t)           atomic_intmax_t;
+typedef _Atomic(uintmax_t)          atomic_uintmax_t;
+
+/* 7.17.7 Operations on atomic types */
+
+#define atomic_store(object, desired) __c11_atomic_store(object, desired, __ATOMIC_SEQ_CST)
+#define atomic_store_explicit __c11_atomic_store
+
+#define atomic_load(object) __c11_atomic_load(object, __ATOMIC_SEQ_CST)
+#define atomic_load_explicit __c11_atomic_load
+
+#define atomic_exchange(object, desired) __c11_atomic_exchange(object, desired, __ATOMIC_SEQ_CST)
+#define atomic_exchange_explicit __c11_atomic_exchange
+
+#define atomic_compare_exchange_strong(object, expected, desired) __c11_atomic_compare_exchange_strong(object, expected, desired, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
+#define atomic_compare_exchange_strong_explicit __c11_atomic_compare_exchange_strong
+
+#define atomic_compare_exchange_weak(object, expected, desired) __c11_atomic_compare_exchange_weak(object, expected, desired, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
+#define atomic_compare_exchange_weak_explicit __c11_atomic_compare_exchange_weak
+
+#define atomic_fetch_add(object, operand) __c11_atomic_fetch_add(object, operand, __ATOMIC_SEQ_CST)
+#define atomic_fetch_add_explicit __c11_atomic_fetch_add
+
+#define atomic_fetch_sub(object, operand) __c11_atomic_fetch_sub(object, operand, __ATOMIC_SEQ_CST)
+#define atomic_fetch_sub_explicit __c11_atomic_fetch_sub
+
+#define atomic_fetch_or(object, operand) __c11_atomic_fetch_or(object, operand, __ATOMIC_SEQ_CST)
+#define atomic_fetch_or_explicit __c11_atomic_fetch_or
+
+#define atomic_fetch_xor(object, operand) __c11_atomic_fetch_xor(object, operand, __ATOMIC_SEQ_CST)
+#define atomic_fetch_xor_explicit __c11_atomic_fetch_xor
+
+#define atomic_fetch_and(object, operand) __c11_atomic_fetch_and(object, operand, __ATOMIC_SEQ_CST)
+#define atomic_fetch_and_explicit __c11_atomic_fetch_and
+
+/* 7.17.8 Atomic flag type and operations */
+
+typedef struct atomic_flag { atomic_bool _Value; } atomic_flag;
+
+#define ATOMIC_FLAG_INIT { 0 }
+
+/* These should be provided by the libc implementation. */
+#ifdef __cplusplus
+bool atomic_flag_test_and_set(volatile atomic_flag *);
+bool atomic_flag_test_and_set_explicit(volatile atomic_flag *, memory_order);
+#else
+_Bool atomic_flag_test_and_set(volatile atomic_flag *);
+_Bool atomic_flag_test_and_set_explicit(volatile atomic_flag *, memory_order);
+#endif
+void atomic_flag_clear(volatile atomic_flag *);
+void atomic_flag_clear_explicit(volatile atomic_flag *, memory_order);
+
+#define atomic_flag_test_and_set(object) __c11_atomic_exchange(&(object)->_Value, 1, __ATOMIC_SEQ_CST)
+#define atomic_flag_test_and_set_explicit(object, order) __c11_atomic_exchange(&(object)->_Value, 1, order)
+
+#define atomic_flag_clear(object) __c11_atomic_store(&(object)->_Value, 0, __ATOMIC_SEQ_CST)
+#define atomic_flag_clear_explicit(object, order) __c11_atomic_store(&(object)->_Value, 0, order)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __STDC_HOSTED__ */
+#endif /* __CLANG_STDATOMIC_H */
+
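A hedged sketch of the C11 atomics this header exposes: an atomic counter incremented from two POSIX threads (the worker function is purely illustrative; build with -pthread):

#include <stdatomic.h>
#include <pthread.h>
#include <stdio.h>

static atomic_int counter = ATOMIC_VAR_INIT(0);

/* Each worker performs relaxed fetch-and-add increments. */
static void *worker(void *arg)
{
    (void)arg;
    for (int i = 0; i < 100000; ++i)
        atomic_fetch_add_explicit(&counter, 1, memory_order_relaxed);
    return NULL;
}

int main(void)
{
    pthread_t t1, t2;
    pthread_create(&t1, NULL, worker, NULL);
    pthread_create(&t2, NULL, worker, NULL);
    pthread_join(t1, NULL);
    pthread_join(t2, NULL);
    /* A sequentially consistent load observes both workers' increments. */
    printf("counter = %d\n", atomic_load(&counter));  /* 200000 */
    return 0;
}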
diff --git a/24.0.3/clang-include/stdbool.h b/24.0.3/clang-include/stdbool.h
new file mode 100644
index 0000000..0467893
--- /dev/null
+++ b/24.0.3/clang-include/stdbool.h
@@ -0,0 +1,44 @@
+/*===---- stdbool.h - Standard header for booleans -------------------------===
+ *
+ * Copyright (c) 2008 Eli Friedman
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __STDBOOL_H
+#define __STDBOOL_H
+
+/* Don't define bool, true, and false in C++, except as a GNU extension. */
+#ifndef __cplusplus
+#define bool _Bool
+#define true 1
+#define false 0
+#elif defined(__GNUC__) && !defined(__STRICT_ANSI__)
+/* Define _Bool, bool, false, true as a GNU extension. */
+#define _Bool bool
+#define bool  bool
+#define false false
+#define true  true
+#endif
+
+#define __bool_true_false_are_defined 1
+
+#endif /* __STDBOOL_H */
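For completeness, a trivial C sketch of the macros above (bool/true/false expand to _Bool/1/0 in C):

#include <stdbool.h>
#include <stdio.h>

static bool is_even(int x) { return x % 2 == 0; }

int main(void)
{
    bool b = is_even(4);
    printf("%d %d\n", b, __bool_true_false_are_defined);  /* 1 1 */
    return 0;
}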
diff --git a/24.0.3/clang-include/stddef.h b/24.0.3/clang-include/stddef.h
new file mode 100644
index 0000000..7354996
--- /dev/null
+++ b/24.0.3/clang-include/stddef.h
@@ -0,0 +1,137 @@
+/*===---- stddef.h - Basic type definitions --------------------------------===
+ *
+ * Copyright (c) 2008 Eli Friedman
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined(__STDDEF_H) || defined(__need_ptrdiff_t) ||                       \
+    defined(__need_size_t) || defined(__need_wchar_t) ||                       \
+    defined(__need_NULL) || defined(__need_wint_t)
+
+#if !defined(__need_ptrdiff_t) && !defined(__need_size_t) &&                   \
+    !defined(__need_wchar_t) && !defined(__need_NULL) &&                       \
+    !defined(__need_wint_t)
+/* Always define miscellaneous pieces when modules are available. */
+#if !__has_feature(modules)
+#define __STDDEF_H
+#endif
+#define __need_ptrdiff_t
+#define __need_size_t
+#define __need_wchar_t
+#define __need_NULL
+#define __need_STDDEF_H_misc
+/* __need_wint_t is intentionally not defined here. */
+#endif
+
+#if defined(__need_ptrdiff_t)
+#if !defined(_PTRDIFF_T) || __has_feature(modules)
+/* Always define ptrdiff_t when modules are available. */
+#if !__has_feature(modules)
+#define _PTRDIFF_T
+#endif
+typedef __PTRDIFF_TYPE__ ptrdiff_t;
+#endif
+#undef __need_ptrdiff_t
+#endif /* defined(__need_ptrdiff_t) */
+
+#if defined(__need_size_t)
+#if !defined(_SIZE_T) || __has_feature(modules)
+/* Always define size_t when modules are available. */
+#if !__has_feature(modules)
+#define _SIZE_T
+#endif
+typedef __SIZE_TYPE__ size_t;
+#endif
+#undef __need_size_t
+#endif /*defined(__need_size_t) */
+
+#if defined(__need_STDDEF_H_misc)
+/* ISO9899:2011 7.20 (C11 Annex K): Define rsize_t if __STDC_WANT_LIB_EXT1__ is
+ * enabled. */
+#if (defined(__STDC_WANT_LIB_EXT1__) && __STDC_WANT_LIB_EXT1__ >= 1 && \
+     !defined(_RSIZE_T)) || __has_feature(modules)
+/* Always define rsize_t when modules are available. */
+#if !__has_feature(modules)
+#define _RSIZE_T
+#endif
+typedef __SIZE_TYPE__ rsize_t;
+#endif
+#endif /* defined(__need_STDDEF_H_misc) */
+
+#if defined(__need_wchar_t)
+#ifndef __cplusplus
+/* Always define wchar_t when modules are available. */
+#if !defined(_WCHAR_T) || __has_feature(modules)
+#if !__has_feature(modules)
+#define _WCHAR_T
+#if defined(_MSC_EXTENSIONS)
+#define _WCHAR_T_DEFINED
+#endif
+#endif
+typedef __WCHAR_TYPE__ wchar_t;
+#endif
+#endif
+#undef __need_wchar_t
+#endif /* defined(__need_wchar_t) */
+
+#if defined(__need_NULL)
+#undef NULL
+#ifdef __cplusplus
+#  if !defined(__MINGW32__) && !defined(_MSC_VER)
+#    define NULL __null
+#  else
+#    define NULL 0
+#  endif
+#else
+#  define NULL ((void*)0)
+#endif
+#ifdef __cplusplus
+#if defined(_MSC_EXTENSIONS) && defined(_NATIVE_NULLPTR_SUPPORTED)
+namespace std { typedef decltype(nullptr) nullptr_t; }
+using ::std::nullptr_t;
+#endif
+#endif
+#undef __need_NULL
+#endif /* defined(__need_NULL) */
+
+#if defined(__need_STDDEF_H_misc)
+#if __STDC_VERSION__ >= 201112L || __cplusplus >= 201103L
+#include "__stddef_max_align_t.h"
+#endif
+#define offsetof(t, d) __builtin_offsetof(t, d)
+#undef __need_STDDEF_H_misc
+#endif  /* defined(__need_STDDEF_H_misc) */
+
+/* Some C libraries expect to see a wint_t here. Others (notably MinGW) will use
+__WINT_TYPE__ directly; accommodate both by requiring __need_wint_t */
+#if defined(__need_wint_t)
+/* Always define wint_t when modules are available. */
+#if !defined(_WINT_T) || __has_feature(modules)
+#if !__has_feature(modules)
+#define _WINT_T
+#endif
+typedef __WINT_TYPE__ wint_t;
+#endif
+#undef __need_wint_t
+#endif /* __need_wint_t */
+
+#endif
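An illustrative sketch of the stddef.h pieces defined above (offsetof, size_t, NULL); the struct is a made-up example:

#include <stddef.h>
#include <stdio.h>

struct packet {
    char   tag;
    int    value;
    double payload;
};

int main(void)
{
    /* offsetof expands to __builtin_offsetof as defined above. */
    size_t off = offsetof(struct packet, payload);
    printf("payload offset = %zu, NULL = %p\n", off, NULL);
    return 0;
}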
diff --git a/24.0.3/clang-include/stdint.h b/24.0.3/clang-include/stdint.h
new file mode 100644
index 0000000..3f2fcbc
--- /dev/null
+++ b/24.0.3/clang-include/stdint.h
@@ -0,0 +1,707 @@
+/*===---- stdint.h - Standard header for sized integer types --------------===*\
+ *
+ * Copyright (c) 2009 Chris Lattner
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+\*===----------------------------------------------------------------------===*/
+
+#ifndef __CLANG_STDINT_H
+#define __CLANG_STDINT_H
+
+/* If we're hosted, fall back to the system's stdint.h, which might have
+ * additional definitions.
+ */
+#if __STDC_HOSTED__ && __has_include_next(<stdint.h>)
+
+// C99 7.18.3 Limits of other integer types
+//
+//  Footnote 219, 220: C++ implementations should define these macros only when
+//  __STDC_LIMIT_MACROS is defined before <stdint.h> is included.
+//
+//  Footnote 222: C++ implementations should define these macros only when
+//  __STDC_CONSTANT_MACROS is defined before <stdint.h> is included.
+//
+// C++11 [cstdint.syn]p2:
+//
+//  The macros defined by <cstdint> are provided unconditionally. In particular,
+//  the symbols __STDC_LIMIT_MACROS and __STDC_CONSTANT_MACROS (mentioned in
+//  footnotes 219, 220, and 222 in the C standard) play no role in C++.
+//
+// C11 removed the problematic footnotes.
+//
+// Work around this inconsistency by always defining those macros in C++ mode,
+// so that a C library implementation which follows the C99 standard can be
+// used in C++.
+# ifdef __cplusplus
+#  if !defined(__STDC_LIMIT_MACROS)
+#   define __STDC_LIMIT_MACROS
+#   define __STDC_LIMIT_MACROS_DEFINED_BY_CLANG
+#  endif
+#  if !defined(__STDC_CONSTANT_MACROS)
+#   define __STDC_CONSTANT_MACROS
+#   define __STDC_CONSTANT_MACROS_DEFINED_BY_CLANG
+#  endif
+# endif
+
+# include_next <stdint.h>
+
+# ifdef __STDC_LIMIT_MACROS_DEFINED_BY_CLANG
+#  undef __STDC_LIMIT_MACROS
+#  undef __STDC_LIMIT_MACROS_DEFINED_BY_CLANG
+# endif
+# ifdef __STDC_CONSTANT_MACROS_DEFINED_BY_CLANG
+#  undef __STDC_CONSTANT_MACROS
+#  undef __STDC_CONSTANT_MACROS_DEFINED_BY_CLANG
+# endif
+
+#else
+
+/* C99 7.18.1.1 Exact-width integer types.
+ * C99 7.18.1.2 Minimum-width integer types.
+ * C99 7.18.1.3 Fastest minimum-width integer types.
+ *
+ * The standard requires that exact-width type be defined for 8-, 16-, 32-, and
+ * 64-bit types if they are implemented. Other exact width types are optional.
+ * This implementation defines an exact-width type for every integer width
+ * that is represented in the standard integer types.
+ *
+ * The standard also requires minimum-width types be defined for 8-, 16-, 32-,
+ * and 64-bit widths regardless of whether there are corresponding exact-width
+ * types.
+ *
+ * To accommodate targets that are missing types that are exactly 8, 16, 32, or
+ * 64 bits wide, this implementation takes an approach of cascading
+ * redefinitions, redefining __int_leastN_t to successively smaller exact-width
+ * types. It is therefore important that the types are defined in order of
+ * descending widths.
+ *
+ * We currently assume that the minimum-width types and the fastest
+ * minimum-width types are the same. This is allowed by the standard, but is
+ * suboptimal.
+ *
+ * In violation of the standard, some targets do not implement a type that is
+ * wide enough to represent all of the required widths (8-, 16-, 32-, 64-bit).
+ * To accommodate these targets, a required minimum-width type is only
+ * defined if there exists an exact-width type of equal or greater width.
+ */
+
+#ifdef __INT64_TYPE__
+# ifndef __int8_t_defined /* glibc sys/types.h also defines int64_t*/
+typedef __INT64_TYPE__ int64_t;
+# endif /* __int8_t_defined */
+typedef __UINT64_TYPE__ uint64_t;
+# define __int_least64_t int64_t
+# define __uint_least64_t uint64_t
+# define __int_least32_t int64_t
+# define __uint_least32_t uint64_t
+# define __int_least16_t int64_t
+# define __uint_least16_t uint64_t
+# define __int_least8_t int64_t
+# define __uint_least8_t uint64_t
+#endif /* __INT64_TYPE__ */
+
+#ifdef __int_least64_t
+typedef __int_least64_t int_least64_t;
+typedef __uint_least64_t uint_least64_t;
+typedef __int_least64_t int_fast64_t;
+typedef __uint_least64_t uint_fast64_t;
+#endif /* __int_least64_t */
+
+#ifdef __INT56_TYPE__
+typedef __INT56_TYPE__ int56_t;
+typedef __UINT56_TYPE__ uint56_t;
+typedef int56_t int_least56_t;
+typedef uint56_t uint_least56_t;
+typedef int56_t int_fast56_t;
+typedef uint56_t uint_fast56_t;
+# define __int_least32_t int56_t
+# define __uint_least32_t uint56_t
+# define __int_least16_t int56_t
+# define __uint_least16_t uint56_t
+# define __int_least8_t int56_t
+# define __uint_least8_t uint56_t
+#endif /* __INT56_TYPE__ */
+
+
+#ifdef __INT48_TYPE__
+typedef __INT48_TYPE__ int48_t;
+typedef __UINT48_TYPE__ uint48_t;
+typedef int48_t int_least48_t;
+typedef uint48_t uint_least48_t;
+typedef int48_t int_fast48_t;
+typedef uint48_t uint_fast48_t;
+# define __int_least32_t int48_t
+# define __uint_least32_t uint48_t
+# define __int_least16_t int48_t
+# define __uint_least16_t uint48_t
+# define __int_least8_t int48_t
+# define __uint_least8_t uint48_t
+#endif /* __INT48_TYPE__ */
+
+
+#ifdef __INT40_TYPE__
+typedef __INT40_TYPE__ int40_t;
+typedef __UINT40_TYPE__ uint40_t;
+typedef int40_t int_least40_t;
+typedef uint40_t uint_least40_t;
+typedef int40_t int_fast40_t;
+typedef uint40_t uint_fast40_t;
+# define __int_least32_t int40_t
+# define __uint_least32_t uint40_t
+# define __int_least16_t int40_t
+# define __uint_least16_t uint40_t
+# define __int_least8_t int40_t
+# define __uint_least8_t uint40_t
+#endif /* __INT40_TYPE__ */
+
+
+#ifdef __INT32_TYPE__
+
+# ifndef __int8_t_defined /* glibc sys/types.h also defines int32_t*/
+typedef __INT32_TYPE__ int32_t;
+# endif /* __int8_t_defined */
+
+# ifndef __uint32_t_defined  /* more glibc compatibility */
+# define __uint32_t_defined
+typedef __UINT32_TYPE__ uint32_t;
+# endif /* __uint32_t_defined */
+
+# define __int_least32_t int32_t
+# define __uint_least32_t uint32_t
+# define __int_least16_t int32_t
+# define __uint_least16_t uint32_t
+# define __int_least8_t int32_t
+# define __uint_least8_t uint32_t
+#endif /* __INT32_TYPE__ */
+
+#ifdef __int_least32_t
+typedef __int_least32_t int_least32_t;
+typedef __uint_least32_t uint_least32_t;
+typedef __int_least32_t int_fast32_t;
+typedef __uint_least32_t uint_fast32_t;
+#endif /* __int_least32_t */
+
+#ifdef __INT24_TYPE__
+typedef __INT24_TYPE__ int24_t;
+typedef __UINT24_TYPE__ uint24_t;
+typedef int24_t int_least24_t;
+typedef uint24_t uint_least24_t;
+typedef int24_t int_fast24_t;
+typedef uint24_t uint_fast24_t;
+# define __int_least16_t int24_t
+# define __uint_least16_t uint24_t
+# define __int_least8_t int24_t
+# define __uint_least8_t uint24_t
+#endif /* __INT24_TYPE__ */
+
+#ifdef __INT16_TYPE__
+#ifndef __int8_t_defined /* glibc sys/types.h also defines int16_t*/
+typedef __INT16_TYPE__ int16_t;
+#endif /* __int8_t_defined */
+typedef __UINT16_TYPE__ uint16_t;
+# define __int_least16_t int16_t
+# define __uint_least16_t uint16_t
+# define __int_least8_t int16_t
+# define __uint_least8_t uint16_t
+#endif /* __INT16_TYPE__ */
+
+#ifdef __int_least16_t
+typedef __int_least16_t int_least16_t;
+typedef __uint_least16_t uint_least16_t;
+typedef __int_least16_t int_fast16_t;
+typedef __uint_least16_t uint_fast16_t;
+#endif /* __int_least16_t */
+
+
+#ifdef __INT8_TYPE__
+#ifndef __int8_t_defined  /* glibc sys/types.h also defines int8_t*/
+typedef __INT8_TYPE__ int8_t;
+#endif /* __int8_t_defined */
+typedef __UINT8_TYPE__ uint8_t;
+# define __int_least8_t int8_t
+# define __uint_least8_t uint8_t
+#endif /* __INT8_TYPE__ */
+
+#ifdef __int_least8_t
+typedef __int_least8_t int_least8_t;
+typedef __uint_least8_t uint_least8_t;
+typedef __int_least8_t int_fast8_t;
+typedef __uint_least8_t uint_fast8_t;
+#endif /* __int_least8_t */
+
+/* prevent glibc sys/types.h from defining conflicting types */
+#ifndef __int8_t_defined
+# define __int8_t_defined
+#endif /* __int8_t_defined */
+
+/* C99 7.18.1.4 Integer types capable of holding object pointers.
+ */
+#define __stdint_join3(a,b,c) a ## b ## c
+
+#define  __intn_t(n) __stdint_join3( int, n, _t)
+#define __uintn_t(n) __stdint_join3(uint, n, _t)
+
+#ifndef _INTPTR_T
+#ifndef __intptr_t_defined
+typedef  __intn_t(__INTPTR_WIDTH__)  intptr_t;
+#define __intptr_t_defined
+#define _INTPTR_T
+#endif
+#endif
+
+#ifndef _UINTPTR_T
+typedef __uintn_t(__INTPTR_WIDTH__) uintptr_t;
+#define _UINTPTR_T
+#endif
+
+/* C99 7.18.1.5 Greatest-width integer types.
+ */
+typedef __INTMAX_TYPE__  intmax_t;
+typedef __UINTMAX_TYPE__ uintmax_t;
+
+/* C99 7.18.4 Macros for minimum-width integer constants.
+ *
+ * The standard requires that integer constant macros be defined for all the
+ * minimum-width types defined above. As 8-, 16-, 32-, and 64-bit minimum-width
+ * types are required, the corresponding integer constant macros are defined
+ * here. This implementation also defines minimum-width types for every other
+ * integer width that the target implements, so corresponding macros are
+ * defined below, too.
+ *
+ * These macros are defined using the same successive-shrinking approach as
+ * the type definitions above. It is likewise important that macros are defined
+ * in order of descending width.
+ *
+ * Note that C++ should not check __STDC_CONSTANT_MACROS here, contrary to the
+ * claims of the C standard (see C++ 18.3.1p2, [cstdint.syn]).
+ */
+
+#define __int_c_join(a, b) a ## b
+#define __int_c(v, suffix) __int_c_join(v, suffix)
+#define __uint_c(v, suffix) __int_c_join(v##U, suffix)
+
+
+#ifdef __INT64_TYPE__
+# ifdef __INT64_C_SUFFIX__
+#  define __int64_c_suffix __INT64_C_SUFFIX__
+#  define __int32_c_suffix __INT64_C_SUFFIX__
+#  define __int16_c_suffix __INT64_C_SUFFIX__
+#  define  __int8_c_suffix __INT64_C_SUFFIX__
+# else
+#  undef __int64_c_suffix
+#  undef __int32_c_suffix
+#  undef __int16_c_suffix
+#  undef  __int8_c_suffix
+# endif /* __INT64_C_SUFFIX__ */
+#endif /* __INT64_TYPE__ */
+
+#ifdef __int_least64_t
+# ifdef __int64_c_suffix
+#  define INT64_C(v) __int_c(v, __int64_c_suffix)
+#  define UINT64_C(v) __uint_c(v, __int64_c_suffix)
+# else
+#  define INT64_C(v) v
+#  define UINT64_C(v) v ## U
+# endif /* __int64_c_suffix */
+#endif /* __int_least64_t */
+
+
+#ifdef __INT56_TYPE__
+# ifdef __INT56_C_SUFFIX__
+#  define INT56_C(v) __int_c(v, __INT56_C_SUFFIX__)
+#  define UINT56_C(v) __uint_c(v, __INT56_C_SUFFIX__)
+#  define __int32_c_suffix __INT56_C_SUFFIX__
+#  define __int16_c_suffix __INT56_C_SUFFIX__
+#  define __int8_c_suffix  __INT56_C_SUFFIX__
+# else
+#  define INT56_C(v) v
+#  define UINT56_C(v) v ## U
+#  undef __int32_c_suffix
+#  undef __int16_c_suffix
+#  undef  __int8_c_suffix
+# endif /* __INT56_C_SUFFIX__ */
+#endif /* __INT56_TYPE__ */
+
+
+#ifdef __INT48_TYPE__
+# ifdef __INT48_C_SUFFIX__
+#  define INT48_C(v) __int_c(v, __INT48_C_SUFFIX__)
+#  define UINT48_C(v) __uint_c(v, __INT48_C_SUFFIX__)
+#  define __int32_c_suffix __INT48_C_SUFFIX__
+#  define __int16_c_suffix __INT48_C_SUFFIX__
+#  define __int8_c_suffix  __INT48_C_SUFFIX__
+# else
+#  define INT48_C(v) v
+#  define UINT48_C(v) v ## U
+#  undef __int32_c_suffix
+#  undef __int16_c_suffix
+#  undef  __int8_c_suffix
+# endif /* __INT48_C_SUFFIX__ */
+#endif /* __INT48_TYPE__ */
+
+
+#ifdef __INT40_TYPE__
+# ifdef __INT40_C_SUFFIX__
+#  define INT40_C(v) __int_c(v, __INT40_C_SUFFIX__)
+#  define UINT40_C(v) __uint_c(v, __INT40_C_SUFFIX__)
+#  define __int32_c_suffix __INT40_C_SUFFIX__
+#  define __int16_c_suffix __INT40_C_SUFFIX__
+#  define __int8_c_suffix  __INT40_C_SUFFIX__
+# else
+#  define INT40_C(v) v
+#  define UINT40_C(v) v ## U
+#  undef __int32_c_suffix
+#  undef __int16_c_suffix
+#  undef  __int8_c_suffix
+# endif /* __INT40_C_SUFFIX__ */
+#endif /* __INT40_TYPE__ */
+
+
+#ifdef __INT32_TYPE__
+# ifdef __INT32_C_SUFFIX__
+#  define __int32_c_suffix __INT32_C_SUFFIX__
+#  define __int16_c_suffix __INT32_C_SUFFIX__
+#  define __int8_c_suffix  __INT32_C_SUFFIX__
+#else
+#  undef __int32_c_suffix
+#  undef __int16_c_suffix
+#  undef  __int8_c_suffix
+# endif /* __INT32_C_SUFFIX__ */
+#endif /* __INT32_TYPE__ */
+
+#ifdef __int_least32_t
+# ifdef __int32_c_suffix
+#  define INT32_C(v) __int_c(v, __int32_c_suffix)
+#  define UINT32_C(v) __uint_c(v, __int32_c_suffix)
+# else
+#  define INT32_C(v) v
+#  define UINT32_C(v) v ## U
+# endif /* __int32_c_suffix */
+#endif /* __int_least32_t */
+
+
+#ifdef __INT24_TYPE__
+# ifdef __INT24_C_SUFFIX__
+#  define INT24_C(v) __int_c(v, __INT24_C_SUFFIX__)
+#  define UINT24_C(v) __uint_c(v, __INT24_C_SUFFIX__)
+#  define __int16_c_suffix __INT24_C_SUFFIX__
+#  define __int8_c_suffix  __INT24_C_SUFFIX__
+# else
+#  define INT24_C(v) v
+#  define UINT24_C(v) v ## U
+#  undef __int16_c_suffix
+#  undef  __int8_c_suffix
+# endif /* __INT24_C_SUFFIX__ */
+#endif /* __INT24_TYPE__ */
+
+
+#ifdef __INT16_TYPE__
+# ifdef __INT16_C_SUFFIX__
+#  define __int16_c_suffix __INT16_C_SUFFIX__
+#  define __int8_c_suffix  __INT16_C_SUFFIX__
+#else
+#  undef __int16_c_suffix
+#  undef  __int8_c_suffix
+# endif /* __INT16_C_SUFFIX__ */
+#endif /* __INT16_TYPE__ */
+
+#ifdef __int_least16_t
+# ifdef __int16_c_suffix
+#  define INT16_C(v) __int_c(v, __int16_c_suffix)
+#  define UINT16_C(v) __uint_c(v, __int16_c_suffix)
+# else
+#  define INT16_C(v) v
+#  define UINT16_C(v) v ## U
+# endif /* __int16_c_suffix */
+#endif /* __int_least16_t */
+
+
+#ifdef __INT8_TYPE__
+# ifdef __INT8_C_SUFFIX__
+#  define __int8_c_suffix __INT8_C_SUFFIX__
+#else
+#  undef  __int8_c_suffix
+# endif /* __INT8_C_SUFFIX__ */
+#endif /* __INT8_TYPE__ */
+
+#ifdef __int_least8_t
+# ifdef __int8_c_suffix
+#  define INT8_C(v) __int_c(v, __int8_c_suffix)
+#  define UINT8_C(v) __uint_c(v, __int8_c_suffix)
+# else
+#  define INT8_C(v) v
+#  define UINT8_C(v) v ## U
+# endif /* __int8_c_suffix */
+#endif /* __int_least8_t */
+
+
+/* C99 7.18.2.1 Limits of exact-width integer types.
+ * C99 7.18.2.2 Limits of minimum-width integer types.
+ * C99 7.18.2.3 Limits of fastest minimum-width integer types.
+ *
+ * The presence of limit macros is completely optional in C99.  This
+ * implementation defines limits for all of the types (exact- and
+ * minimum-width) that it defines above, using the limits of the minimum-width
+ * type for any types that do not have exact-width representations.
+ *
+ * As in the type definitions, this section takes an approach of
+ * successive-shrinking to determine which limits to use for the standard (8,
+ * 16, 32, 64) bit widths when they don't have exact representations. It is
+ * therefore important that the definitions be kept in order of descending
+ * widths.
+ *
+ * Note that C++ should not check __STDC_LIMIT_MACROS here, contrary to the
+ * claims of the C standard (see C++ 18.3.1p2, [cstdint.syn]).
+ */
+
+#ifdef __INT64_TYPE__
+# define INT64_MAX           INT64_C( 9223372036854775807)
+# define INT64_MIN         (-INT64_C( 9223372036854775807)-1)
+# define UINT64_MAX         UINT64_C(18446744073709551615)
+# define __INT_LEAST64_MIN   INT64_MIN
+# define __INT_LEAST64_MAX   INT64_MAX
+# define __UINT_LEAST64_MAX UINT64_MAX
+# define __INT_LEAST32_MIN   INT64_MIN
+# define __INT_LEAST32_MAX   INT64_MAX
+# define __UINT_LEAST32_MAX UINT64_MAX
+# define __INT_LEAST16_MIN   INT64_MIN
+# define __INT_LEAST16_MAX   INT64_MAX
+# define __UINT_LEAST16_MAX UINT64_MAX
+# define __INT_LEAST8_MIN    INT64_MIN
+# define __INT_LEAST8_MAX    INT64_MAX
+# define __UINT_LEAST8_MAX  UINT64_MAX
+#endif /* __INT64_TYPE__ */
+
+#ifdef __INT_LEAST64_MIN
+# define INT_LEAST64_MIN   __INT_LEAST64_MIN
+# define INT_LEAST64_MAX   __INT_LEAST64_MAX
+# define UINT_LEAST64_MAX __UINT_LEAST64_MAX
+# define INT_FAST64_MIN    __INT_LEAST64_MIN
+# define INT_FAST64_MAX    __INT_LEAST64_MAX
+# define UINT_FAST64_MAX  __UINT_LEAST64_MAX
+#endif /* __INT_LEAST64_MIN */
+
+
+#ifdef __INT56_TYPE__
+# define INT56_MAX           INT56_C(36028797018963967)
+# define INT56_MIN         (-INT56_C(36028797018963967)-1)
+# define UINT56_MAX         UINT56_C(72057594037927935)
+# define INT_LEAST56_MIN     INT56_MIN
+# define INT_LEAST56_MAX     INT56_MAX
+# define UINT_LEAST56_MAX   UINT56_MAX
+# define INT_FAST56_MIN      INT56_MIN
+# define INT_FAST56_MAX      INT56_MAX
+# define UINT_FAST56_MAX    UINT56_MAX
+# define __INT_LEAST32_MIN   INT56_MIN
+# define __INT_LEAST32_MAX   INT56_MAX
+# define __UINT_LEAST32_MAX UINT56_MAX
+# define __INT_LEAST16_MIN   INT56_MIN
+# define __INT_LEAST16_MAX   INT56_MAX
+# define __UINT_LEAST16_MAX UINT56_MAX
+# define __INT_LEAST8_MIN    INT56_MIN
+# define __INT_LEAST8_MAX    INT56_MAX
+# define __UINT_LEAST8_MAX  UINT56_MAX
+#endif /* __INT56_TYPE__ */
+
+
+#ifdef __INT48_TYPE__
+# define INT48_MAX           INT48_C(140737488355327)
+# define INT48_MIN         (-INT48_C(140737488355327)-1)
+# define UINT48_MAX         UINT48_C(281474976710655)
+# define INT_LEAST48_MIN     INT48_MIN
+# define INT_LEAST48_MAX     INT48_MAX
+# define UINT_LEAST48_MAX   UINT48_MAX
+# define INT_FAST48_MIN      INT48_MIN
+# define INT_FAST48_MAX      INT48_MAX
+# define UINT_FAST48_MAX    UINT48_MAX
+# define __INT_LEAST32_MIN   INT48_MIN
+# define __INT_LEAST32_MAX   INT48_MAX
+# define __UINT_LEAST32_MAX UINT48_MAX
+# define __INT_LEAST16_MIN   INT48_MIN
+# define __INT_LEAST16_MAX   INT48_MAX
+# define __UINT_LEAST16_MAX UINT48_MAX
+# define __INT_LEAST8_MIN    INT48_MIN
+# define __INT_LEAST8_MAX    INT48_MAX
+# define __UINT_LEAST8_MAX  UINT48_MAX
+#endif /* __INT48_TYPE__ */
+
+
+#ifdef __INT40_TYPE__
+# define INT40_MAX           INT40_C(549755813887)
+# define INT40_MIN         (-INT40_C(549755813887)-1)
+# define UINT40_MAX         UINT40_C(1099511627775)
+# define INT_LEAST40_MIN     INT40_MIN
+# define INT_LEAST40_MAX     INT40_MAX
+# define UINT_LEAST40_MAX   UINT40_MAX
+# define INT_FAST40_MIN      INT40_MIN
+# define INT_FAST40_MAX      INT40_MAX
+# define UINT_FAST40_MAX    UINT40_MAX
+# define __INT_LEAST32_MIN   INT40_MIN
+# define __INT_LEAST32_MAX   INT40_MAX
+# define __UINT_LEAST32_MAX UINT40_MAX
+# define __INT_LEAST16_MIN   INT40_MIN
+# define __INT_LEAST16_MAX   INT40_MAX
+# define __UINT_LEAST16_MAX UINT40_MAX
+# define __INT_LEAST8_MIN    INT40_MIN
+# define __INT_LEAST8_MAX    INT40_MAX
+# define __UINT_LEAST8_MAX  UINT40_MAX
+#endif /* __INT40_TYPE__ */
+
+
+#ifdef __INT32_TYPE__
+# define INT32_MAX           INT32_C(2147483647)
+# define INT32_MIN         (-INT32_C(2147483647)-1)
+# define UINT32_MAX         UINT32_C(4294967295)
+# define __INT_LEAST32_MIN   INT32_MIN
+# define __INT_LEAST32_MAX   INT32_MAX
+# define __UINT_LEAST32_MAX UINT32_MAX
+# define __INT_LEAST16_MIN   INT32_MIN
+# define __INT_LEAST16_MAX   INT32_MAX
+# define __UINT_LEAST16_MAX UINT32_MAX
+# define __INT_LEAST8_MIN    INT32_MIN
+# define __INT_LEAST8_MAX    INT32_MAX
+# define __UINT_LEAST8_MAX  UINT32_MAX
+#endif /* __INT32_TYPE__ */
+
+#ifdef __INT_LEAST32_MIN
+# define INT_LEAST32_MIN   __INT_LEAST32_MIN
+# define INT_LEAST32_MAX   __INT_LEAST32_MAX
+# define UINT_LEAST32_MAX __UINT_LEAST32_MAX
+# define INT_FAST32_MIN    __INT_LEAST32_MIN
+# define INT_FAST32_MAX    __INT_LEAST32_MAX
+# define UINT_FAST32_MAX  __UINT_LEAST32_MAX
+#endif /* __INT_LEAST32_MIN */
+
+
+#ifdef __INT24_TYPE__
+# define INT24_MAX           INT24_C(8388607)
+# define INT24_MIN         (-INT24_C(8388607)-1)
+# define UINT24_MAX         UINT24_C(16777215)
+# define INT_LEAST24_MIN     INT24_MIN
+# define INT_LEAST24_MAX     INT24_MAX
+# define UINT_LEAST24_MAX   UINT24_MAX
+# define INT_FAST24_MIN      INT24_MIN
+# define INT_FAST24_MAX      INT24_MAX
+# define UINT_FAST24_MAX    UINT24_MAX
+# define __INT_LEAST16_MIN   INT24_MIN
+# define __INT_LEAST16_MAX   INT24_MAX
+# define __UINT_LEAST16_MAX UINT24_MAX
+# define __INT_LEAST8_MIN    INT24_MIN
+# define __INT_LEAST8_MAX    INT24_MAX
+# define __UINT_LEAST8_MAX  UINT24_MAX
+#endif /* __INT24_TYPE__ */
+
+
+#ifdef __INT16_TYPE__
+#define INT16_MAX            INT16_C(32767)
+#define INT16_MIN          (-INT16_C(32767)-1)
+#define UINT16_MAX          UINT16_C(65535)
+# define __INT_LEAST16_MIN   INT16_MIN
+# define __INT_LEAST16_MAX   INT16_MAX
+# define __UINT_LEAST16_MAX UINT16_MAX
+# define __INT_LEAST8_MIN    INT16_MIN
+# define __INT_LEAST8_MAX    INT16_MAX
+# define __UINT_LEAST8_MAX  UINT16_MAX
+#endif /* __INT16_TYPE__ */
+
+#ifdef __INT_LEAST16_MIN
+# define INT_LEAST16_MIN   __INT_LEAST16_MIN
+# define INT_LEAST16_MAX   __INT_LEAST16_MAX
+# define UINT_LEAST16_MAX __UINT_LEAST16_MAX
+# define INT_FAST16_MIN    __INT_LEAST16_MIN
+# define INT_FAST16_MAX    __INT_LEAST16_MAX
+# define UINT_FAST16_MAX  __UINT_LEAST16_MAX
+#endif /* __INT_LEAST16_MIN */
+
+
+#ifdef __INT8_TYPE__
+# define INT8_MAX            INT8_C(127)
+# define INT8_MIN          (-INT8_C(127)-1)
+# define UINT8_MAX          UINT8_C(255)
+# define __INT_LEAST8_MIN    INT8_MIN
+# define __INT_LEAST8_MAX    INT8_MAX
+# define __UINT_LEAST8_MAX  UINT8_MAX
+#endif /* __INT8_TYPE__ */
+
+#ifdef __INT_LEAST8_MIN
+# define INT_LEAST8_MIN   __INT_LEAST8_MIN
+# define INT_LEAST8_MAX   __INT_LEAST8_MAX
+# define UINT_LEAST8_MAX __UINT_LEAST8_MAX
+# define INT_FAST8_MIN    __INT_LEAST8_MIN
+# define INT_FAST8_MAX    __INT_LEAST8_MAX
+# define UINT_FAST8_MAX  __UINT_LEAST8_MAX
+#endif /* __INT_LEAST8_MIN */
+
+/* Some utility macros */
+#define  __INTN_MIN(n)  __stdint_join3( INT, n, _MIN)
+#define  __INTN_MAX(n)  __stdint_join3( INT, n, _MAX)
+#define __UINTN_MAX(n)  __stdint_join3(UINT, n, _MAX)
+#define  __INTN_C(n, v) __stdint_join3( INT, n, _C(v))
+#define __UINTN_C(n, v) __stdint_join3(UINT, n, _C(v))
+
+/* C99 7.18.2.4 Limits of integer types capable of holding object pointers. */
+/* C99 7.18.3 Limits of other integer types. */
+
+#define  INTPTR_MIN  __INTN_MIN(__INTPTR_WIDTH__)
+#define  INTPTR_MAX  __INTN_MAX(__INTPTR_WIDTH__)
+#define UINTPTR_MAX __UINTN_MAX(__INTPTR_WIDTH__)
+#define PTRDIFF_MIN  __INTN_MIN(__PTRDIFF_WIDTH__)
+#define PTRDIFF_MAX  __INTN_MAX(__PTRDIFF_WIDTH__)
+#define    SIZE_MAX __UINTN_MAX(__SIZE_WIDTH__)
+
+/* ISO9899:2011 7.20 (C11 Annex K): Define RSIZE_MAX if __STDC_WANT_LIB_EXT1__
+ * is enabled. */
+#if defined(__STDC_WANT_LIB_EXT1__) && __STDC_WANT_LIB_EXT1__ >= 1
+#define   RSIZE_MAX            (SIZE_MAX >> 1)
+#endif
+
+/* C99 7.18.2.5 Limits of greatest-width integer types. */
+#define INTMAX_MIN   __INTN_MIN(__INTMAX_WIDTH__)
+#define INTMAX_MAX   __INTN_MAX(__INTMAX_WIDTH__)
+#define UINTMAX_MAX __UINTN_MAX(__INTMAX_WIDTH__)
+
+/* C99 7.18.3 Limits of other integer types. */
+#define SIG_ATOMIC_MIN __INTN_MIN(__SIG_ATOMIC_WIDTH__)
+#define SIG_ATOMIC_MAX __INTN_MAX(__SIG_ATOMIC_WIDTH__)
+#ifdef __WINT_UNSIGNED__
+# define WINT_MIN       __UINTN_C(__WINT_WIDTH__, 0)
+# define WINT_MAX       __UINTN_MAX(__WINT_WIDTH__)
+#else
+# define WINT_MIN       __INTN_MIN(__WINT_WIDTH__)
+# define WINT_MAX       __INTN_MAX(__WINT_WIDTH__)
+#endif
+
+#ifndef WCHAR_MAX
+# define WCHAR_MAX __WCHAR_MAX__
+#endif
+#ifndef WCHAR_MIN
+# if __WCHAR_MAX__ == __INTN_MAX(__WCHAR_WIDTH__)
+#  define WCHAR_MIN __INTN_MIN(__WCHAR_WIDTH__)
+# else
+#  define WCHAR_MIN __UINTN_C(__WCHAR_WIDTH__, 0)
+# endif
+#endif
+
+/* 7.18.4.2 Macros for greatest-width integer constants. */
+#define INTMAX_C(v)   __INTN_C(__INTMAX_WIDTH__, v)
+#define UINTMAX_C(v) __UINTN_C(__INTMAX_WIDTH__, v)
+
+#endif /* __STDC_HOSTED__ */
+#endif /* __CLANG_STDINT_H */
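A brief sketch of the fixed-width types and constant macros defined above (values chosen only for illustration):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* INT64_C attaches the correct suffix so the constant has (at least)
     * 64-bit type even on targets where long is 32 bits wide. */
    int64_t big = INT64_C(9000000000);
    uint8_t small = UINT8_MAX;  /* 255 */
    printf("big = %lld, UINT8_MAX = %u, INT32_MIN = %d\n",
           (long long)big, (unsigned)small, INT32_MIN);
    return 0;
}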
diff --git a/24.0.3/clang-include/stdnoreturn.h b/24.0.3/clang-include/stdnoreturn.h
new file mode 100644
index 0000000..a7a301d
--- /dev/null
+++ b/24.0.3/clang-include/stdnoreturn.h
@@ -0,0 +1,30 @@
+/*===---- stdnoreturn.h - Standard header for noreturn macro ---------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __STDNORETURN_H
+#define __STDNORETURN_H
+
+#define noreturn _Noreturn
+#define __noreturn_is_defined 1
+
+#endif /* __STDNORETURN_H */
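An illustrative C11 sketch of the noreturn macro defined above:

#include <stdnoreturn.h>
#include <stdio.h>
#include <stdlib.h>

/* 'noreturn' expands to _Noreturn, telling the compiler this function never
 * returns to its caller. */
static noreturn void fail(const char *msg)
{
    fprintf(stderr, "fatal: %s\n", msg);
    exit(EXIT_FAILURE);
}

int main(void)
{
    fail("example");
    /* not reached */
}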
diff --git a/24.0.3/clang-include/tbmintrin.h b/24.0.3/clang-include/tbmintrin.h
new file mode 100644
index 0000000..785961c
--- /dev/null
+++ b/24.0.3/clang-include/tbmintrin.h
@@ -0,0 +1,154 @@
+/*===---- tbmintrin.h - TBM intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __X86INTRIN_H
+#error "Never use <tbmintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __TBMINTRIN_H
+#define __TBMINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("tbm")))
+
+#define __bextri_u32(a, b) \
+  ((unsigned int)__builtin_ia32_bextri_u32((unsigned int)(a), \
+                                           (unsigned int)(b)))
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__blcfill_u32(unsigned int a)
+{
+  return a & (a + 1);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__blci_u32(unsigned int a)
+{
+  return a | ~(a + 1);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__blcic_u32(unsigned int a)
+{
+  return ~a & (a + 1);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__blcmsk_u32(unsigned int a)
+{
+  return a ^ (a + 1);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__blcs_u32(unsigned int a)
+{
+  return a | (a + 1);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__blsfill_u32(unsigned int a)
+{
+  return a | (a - 1);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__blsic_u32(unsigned int a)
+{
+  return ~a | (a - 1);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__t1mskc_u32(unsigned int a)
+{
+  return ~a | (a + 1);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__tzmsk_u32(unsigned int a)
+{
+  return ~a & (a - 1);
+}
+
+#ifdef __x86_64__
+#define __bextri_u64(a, b) \
+  ((unsigned long long)__builtin_ia32_bextri_u64((unsigned long long)(a), \
+                                                 (unsigned long long)(b)))
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__blcfill_u64(unsigned long long a)
+{
+  return a & (a + 1);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__blci_u64(unsigned long long a)
+{
+  return a | ~(a + 1);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__blcic_u64(unsigned long long a)
+{
+  return ~a & (a + 1);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__blcmsk_u64(unsigned long long a)
+{
+  return a ^ (a + 1);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__blcs_u64(unsigned long long a)
+{
+  return a | (a + 1);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__blsfill_u64(unsigned long long a)
+{
+  return a | (a - 1);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__blsic_u64(unsigned long long a)
+{
+  return ~a | (a - 1);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__t1mskc_u64(unsigned long long a)
+{
+  return ~a | (a + 1);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__tzmsk_u64(unsigned long long a)
+{
+  return ~a & (a - 1);
+}
+#endif
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __TBMINTRIN_H */
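A minimal usage sketch for the TBM bit-manipulation helpers above (illustrative only, not part of this prebuilt drop; assumes an x86 target built with -mtbm):

#include <x86intrin.h>
#include <stdio.h>

int main(void) {
  unsigned int v = 0xB0u;              /* 1011 0000 */
  printf("%#x\n", __blsfill_u32(v));   /* 0xbf: set every bit below the lowest set bit */
  printf("%#x\n", __tzmsk_u32(v));     /* 0xf:  mask covering the trailing zero bits */
  printf("%#x\n", __blcic_u32(v));     /* 0x1:  isolate the lowest clear bit */
  return 0;
}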
diff --git a/24.0.3/clang-include/tgmath.h b/24.0.3/clang-include/tgmath.h
new file mode 100644
index 0000000..318e118
--- /dev/null
+++ b/24.0.3/clang-include/tgmath.h
@@ -0,0 +1,1374 @@
+/*===---- tgmath.h - Standard header for type generic math ----------------===*\
+ *
+ * Copyright (c) 2009 Howard Hinnant
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+\*===----------------------------------------------------------------------===*/
+
+#ifndef __TGMATH_H
+#define __TGMATH_H
+
+/* C99 7.22 Type-generic math <tgmath.h>. */
+#include <math.h>
+
+/* C++ handles type genericity with overloading in math.h. */
+#ifndef __cplusplus
+#include <complex.h>
+
+#define _TG_ATTRSp __attribute__((__overloadable__))
+#define _TG_ATTRS __attribute__((__overloadable__, __always_inline__))
+
+// promotion
+
+typedef void _Argument_type_is_not_arithmetic;
+static _Argument_type_is_not_arithmetic __tg_promote(...)
+  __attribute__((__unavailable__,__overloadable__));
+static double               _TG_ATTRSp __tg_promote(int);
+static double               _TG_ATTRSp __tg_promote(unsigned int);
+static double               _TG_ATTRSp __tg_promote(long);
+static double               _TG_ATTRSp __tg_promote(unsigned long);
+static double               _TG_ATTRSp __tg_promote(long long);
+static double               _TG_ATTRSp __tg_promote(unsigned long long);
+static float                _TG_ATTRSp __tg_promote(float);
+static double               _TG_ATTRSp __tg_promote(double);
+static long double          _TG_ATTRSp __tg_promote(long double);
+static float _Complex       _TG_ATTRSp __tg_promote(float _Complex);
+static double _Complex      _TG_ATTRSp __tg_promote(double _Complex);
+static long double _Complex _TG_ATTRSp __tg_promote(long double _Complex);
+
+#define __tg_promote1(__x)           (__typeof__(__tg_promote(__x)))
+#define __tg_promote2(__x, __y)      (__typeof__(__tg_promote(__x) + \
+                                                 __tg_promote(__y)))
+#define __tg_promote3(__x, __y, __z) (__typeof__(__tg_promote(__x) + \
+                                                 __tg_promote(__y) + \
+                                                 __tg_promote(__z)))
+
+// acos
+
+static float
+    _TG_ATTRS
+    __tg_acos(float __x) {return acosf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_acos(double __x) {return acos(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_acos(long double __x) {return acosl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_acos(float _Complex __x) {return cacosf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_acos(double _Complex __x) {return cacos(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_acos(long double _Complex __x) {return cacosl(__x);}
+
+#undef acos
+#define acos(__x) __tg_acos(__tg_promote1((__x))(__x))
+
+// asin
+
+static float
+    _TG_ATTRS
+    __tg_asin(float __x) {return asinf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_asin(double __x) {return asin(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_asin(long double __x) {return asinl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_asin(float _Complex __x) {return casinf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_asin(double _Complex __x) {return casin(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_asin(long double _Complex __x) {return casinl(__x);}
+
+#undef asin
+#define asin(__x) __tg_asin(__tg_promote1((__x))(__x))
+
+// atan
+
+static float
+    _TG_ATTRS
+    __tg_atan(float __x) {return atanf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_atan(double __x) {return atan(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_atan(long double __x) {return atanl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_atan(float _Complex __x) {return catanf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_atan(double _Complex __x) {return catan(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_atan(long double _Complex __x) {return catanl(__x);}
+
+#undef atan
+#define atan(__x) __tg_atan(__tg_promote1((__x))(__x))
+
+// acosh
+
+static float
+    _TG_ATTRS
+    __tg_acosh(float __x) {return acoshf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_acosh(double __x) {return acosh(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_acosh(long double __x) {return acoshl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_acosh(float _Complex __x) {return cacoshf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_acosh(double _Complex __x) {return cacosh(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_acosh(long double _Complex __x) {return cacoshl(__x);}
+
+#undef acosh
+#define acosh(__x) __tg_acosh(__tg_promote1((__x))(__x))
+
+// asinh
+
+static float
+    _TG_ATTRS
+    __tg_asinh(float __x) {return asinhf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_asinh(double __x) {return asinh(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_asinh(long double __x) {return asinhl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_asinh(float _Complex __x) {return casinhf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_asinh(double _Complex __x) {return casinh(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_asinh(long double _Complex __x) {return casinhl(__x);}
+
+#undef asinh
+#define asinh(__x) __tg_asinh(__tg_promote1((__x))(__x))
+
+// atanh
+
+static float
+    _TG_ATTRS
+    __tg_atanh(float __x) {return atanhf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_atanh(double __x) {return atanh(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_atanh(long double __x) {return atanhl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_atanh(float _Complex __x) {return catanhf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_atanh(double _Complex __x) {return catanh(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_atanh(long double _Complex __x) {return catanhl(__x);}
+
+#undef atanh
+#define atanh(__x) __tg_atanh(__tg_promote1((__x))(__x))
+
+// cos
+
+static float
+    _TG_ATTRS
+    __tg_cos(float __x) {return cosf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_cos(double __x) {return cos(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_cos(long double __x) {return cosl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_cos(float _Complex __x) {return ccosf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_cos(double _Complex __x) {return ccos(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_cos(long double _Complex __x) {return ccosl(__x);}
+
+#undef cos
+#define cos(__x) __tg_cos(__tg_promote1((__x))(__x))
+
+// sin
+
+static float
+    _TG_ATTRS
+    __tg_sin(float __x) {return sinf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_sin(double __x) {return sin(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_sin(long double __x) {return sinl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_sin(float _Complex __x) {return csinf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_sin(double _Complex __x) {return csin(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_sin(long double _Complex __x) {return csinl(__x);}
+
+#undef sin
+#define sin(__x) __tg_sin(__tg_promote1((__x))(__x))
+
+// tan
+
+static float
+    _TG_ATTRS
+    __tg_tan(float __x) {return tanf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_tan(double __x) {return tan(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_tan(long double __x) {return tanl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_tan(float _Complex __x) {return ctanf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_tan(double _Complex __x) {return ctan(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_tan(long double _Complex __x) {return ctanl(__x);}
+
+#undef tan
+#define tan(__x) __tg_tan(__tg_promote1((__x))(__x))
+
+// cosh
+
+static float
+    _TG_ATTRS
+    __tg_cosh(float __x) {return coshf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_cosh(double __x) {return cosh(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_cosh(long double __x) {return coshl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_cosh(float _Complex __x) {return ccoshf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_cosh(double _Complex __x) {return ccosh(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_cosh(long double _Complex __x) {return ccoshl(__x);}
+
+#undef cosh
+#define cosh(__x) __tg_cosh(__tg_promote1((__x))(__x))
+
+// sinh
+
+static float
+    _TG_ATTRS
+    __tg_sinh(float __x) {return sinhf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_sinh(double __x) {return sinh(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_sinh(long double __x) {return sinhl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_sinh(float _Complex __x) {return csinhf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_sinh(double _Complex __x) {return csinh(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_sinh(long double _Complex __x) {return csinhl(__x);}
+
+#undef sinh
+#define sinh(__x) __tg_sinh(__tg_promote1((__x))(__x))
+
+// tanh
+
+static float
+    _TG_ATTRS
+    __tg_tanh(float __x) {return tanhf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_tanh(double __x) {return tanh(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_tanh(long double __x) {return tanhl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_tanh(float _Complex __x) {return ctanhf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_tanh(double _Complex __x) {return ctanh(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_tanh(long double _Complex __x) {return ctanhl(__x);}
+
+#undef tanh
+#define tanh(__x) __tg_tanh(__tg_promote1((__x))(__x))
+
+// exp
+
+static float
+    _TG_ATTRS
+    __tg_exp(float __x) {return expf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_exp(double __x) {return exp(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_exp(long double __x) {return expl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_exp(float _Complex __x) {return cexpf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_exp(double _Complex __x) {return cexp(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_exp(long double _Complex __x) {return cexpl(__x);}
+
+#undef exp
+#define exp(__x) __tg_exp(__tg_promote1((__x))(__x))
+
+// log
+
+static float
+    _TG_ATTRS
+    __tg_log(float __x) {return logf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_log(double __x) {return log(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_log(long double __x) {return logl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_log(float _Complex __x) {return clogf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_log(double _Complex __x) {return clog(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_log(long double _Complex __x) {return clogl(__x);}
+
+#undef log
+#define log(__x) __tg_log(__tg_promote1((__x))(__x))
+
+// pow
+
+static float
+    _TG_ATTRS
+    __tg_pow(float __x, float __y) {return powf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_pow(double __x, double __y) {return pow(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_pow(long double __x, long double __y) {return powl(__x, __y);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_pow(float _Complex __x, float _Complex __y) {return cpowf(__x, __y);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_pow(double _Complex __x, double _Complex __y) {return cpow(__x, __y);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_pow(long double _Complex __x, long double _Complex __y)
+    {return cpowl(__x, __y);}
+
+#undef pow
+#define pow(__x, __y) __tg_pow(__tg_promote2((__x), (__y))(__x), \
+                               __tg_promote2((__x), (__y))(__y))
+
+// sqrt
+
+static float
+    _TG_ATTRS
+    __tg_sqrt(float __x) {return sqrtf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_sqrt(double __x) {return sqrt(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_sqrt(long double __x) {return sqrtl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_sqrt(float _Complex __x) {return csqrtf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_sqrt(double _Complex __x) {return csqrt(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_sqrt(long double _Complex __x) {return csqrtl(__x);}
+
+#undef sqrt
+#define sqrt(__x) __tg_sqrt(__tg_promote1((__x))(__x))
+
+// fabs
+
+static float
+    _TG_ATTRS
+    __tg_fabs(float __x) {return fabsf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_fabs(double __x) {return fabs(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_fabs(long double __x) {return fabsl(__x);}
+
+static float
+    _TG_ATTRS
+    __tg_fabs(float _Complex __x) {return cabsf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_fabs(double _Complex __x) {return cabs(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_fabs(long double _Complex __x) {return cabsl(__x);}
+
+#undef fabs
+#define fabs(__x) __tg_fabs(__tg_promote1((__x))(__x))
+
+// atan2
+
+static float
+    _TG_ATTRS
+    __tg_atan2(float __x, float __y) {return atan2f(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_atan2(double __x, double __y) {return atan2(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_atan2(long double __x, long double __y) {return atan2l(__x, __y);}
+
+#undef atan2
+#define atan2(__x, __y) __tg_atan2(__tg_promote2((__x), (__y))(__x), \
+                                   __tg_promote2((__x), (__y))(__y))
+
+// cbrt
+
+static float
+    _TG_ATTRS
+    __tg_cbrt(float __x) {return cbrtf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_cbrt(double __x) {return cbrt(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_cbrt(long double __x) {return cbrtl(__x);}
+
+#undef cbrt
+#define cbrt(__x) __tg_cbrt(__tg_promote1((__x))(__x))
+
+// ceil
+
+static float
+    _TG_ATTRS
+    __tg_ceil(float __x) {return ceilf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_ceil(double __x) {return ceil(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_ceil(long double __x) {return ceill(__x);}
+
+#undef ceil
+#define ceil(__x) __tg_ceil(__tg_promote1((__x))(__x))
+
+// copysign
+
+static float
+    _TG_ATTRS
+    __tg_copysign(float __x, float __y) {return copysignf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_copysign(double __x, double __y) {return copysign(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_copysign(long double __x, long double __y) {return copysignl(__x, __y);}
+
+#undef copysign
+#define copysign(__x, __y) __tg_copysign(__tg_promote2((__x), (__y))(__x), \
+                                         __tg_promote2((__x), (__y))(__y))
+
+// erf
+
+static float
+    _TG_ATTRS
+    __tg_erf(float __x) {return erff(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_erf(double __x) {return erf(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_erf(long double __x) {return erfl(__x);}
+
+#undef erf
+#define erf(__x) __tg_erf(__tg_promote1((__x))(__x))
+
+// erfc
+
+static float
+    _TG_ATTRS
+    __tg_erfc(float __x) {return erfcf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_erfc(double __x) {return erfc(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_erfc(long double __x) {return erfcl(__x);}
+
+#undef erfc
+#define erfc(__x) __tg_erfc(__tg_promote1((__x))(__x))
+
+// exp2
+
+static float
+    _TG_ATTRS
+    __tg_exp2(float __x) {return exp2f(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_exp2(double __x) {return exp2(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_exp2(long double __x) {return exp2l(__x);}
+
+#undef exp2
+#define exp2(__x) __tg_exp2(__tg_promote1((__x))(__x))
+
+// expm1
+
+static float
+    _TG_ATTRS
+    __tg_expm1(float __x) {return expm1f(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_expm1(double __x) {return expm1(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_expm1(long double __x) {return expm1l(__x);}
+
+#undef expm1
+#define expm1(__x) __tg_expm1(__tg_promote1((__x))(__x))
+
+// fdim
+
+static float
+    _TG_ATTRS
+    __tg_fdim(float __x, float __y) {return fdimf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_fdim(double __x, double __y) {return fdim(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_fdim(long double __x, long double __y) {return fdiml(__x, __y);}
+
+#undef fdim
+#define fdim(__x, __y) __tg_fdim(__tg_promote2((__x), (__y))(__x), \
+                                 __tg_promote2((__x), (__y))(__y))
+
+// floor
+
+static float
+    _TG_ATTRS
+    __tg_floor(float __x) {return floorf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_floor(double __x) {return floor(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_floor(long double __x) {return floorl(__x);}
+
+#undef floor
+#define floor(__x) __tg_floor(__tg_promote1((__x))(__x))
+
+// fma
+
+static float
+    _TG_ATTRS
+    __tg_fma(float __x, float __y, float __z)
+    {return fmaf(__x, __y, __z);}
+
+static double
+    _TG_ATTRS
+    __tg_fma(double __x, double __y, double __z)
+    {return fma(__x, __y, __z);}
+
+static long double
+    _TG_ATTRS
+    __tg_fma(long double __x, long double __y, long double __z)
+    {return fmal(__x, __y, __z);}
+
+#undef fma
+#define fma(__x, __y, __z)                                \
+        __tg_fma(__tg_promote3((__x), (__y), (__z))(__x), \
+                 __tg_promote3((__x), (__y), (__z))(__y), \
+                 __tg_promote3((__x), (__y), (__z))(__z))
+
+// fmax
+
+static float
+    _TG_ATTRS
+    __tg_fmax(float __x, float __y) {return fmaxf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_fmax(double __x, double __y) {return fmax(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_fmax(long double __x, long double __y) {return fmaxl(__x, __y);}
+
+#undef fmax
+#define fmax(__x, __y) __tg_fmax(__tg_promote2((__x), (__y))(__x), \
+                                 __tg_promote2((__x), (__y))(__y))
+
+// fmin
+
+static float
+    _TG_ATTRS
+    __tg_fmin(float __x, float __y) {return fminf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_fmin(double __x, double __y) {return fmin(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_fmin(long double __x, long double __y) {return fminl(__x, __y);}
+
+#undef fmin
+#define fmin(__x, __y) __tg_fmin(__tg_promote2((__x), (__y))(__x), \
+                                 __tg_promote2((__x), (__y))(__y))
+
+// fmod
+
+static float
+    _TG_ATTRS
+    __tg_fmod(float __x, float __y) {return fmodf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_fmod(double __x, double __y) {return fmod(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_fmod(long double __x, long double __y) {return fmodl(__x, __y);}
+
+#undef fmod
+#define fmod(__x, __y) __tg_fmod(__tg_promote2((__x), (__y))(__x), \
+                                 __tg_promote2((__x), (__y))(__y))
+
+// frexp
+
+static float
+    _TG_ATTRS
+    __tg_frexp(float __x, int* __y) {return frexpf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_frexp(double __x, int* __y) {return frexp(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_frexp(long double __x, int* __y) {return frexpl(__x, __y);}
+
+#undef frexp
+#define frexp(__x, __y) __tg_frexp(__tg_promote1((__x))(__x), __y)
+
+// hypot
+
+static float
+    _TG_ATTRS
+    __tg_hypot(float __x, float __y) {return hypotf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_hypot(double __x, double __y) {return hypot(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_hypot(long double __x, long double __y) {return hypotl(__x, __y);}
+
+#undef hypot
+#define hypot(__x, __y) __tg_hypot(__tg_promote2((__x), (__y))(__x), \
+                                   __tg_promote2((__x), (__y))(__y))
+
+// ilogb
+
+static int
+    _TG_ATTRS
+    __tg_ilogb(float __x) {return ilogbf(__x);}
+
+static int
+    _TG_ATTRS
+    __tg_ilogb(double __x) {return ilogb(__x);}
+
+static int
+    _TG_ATTRS
+    __tg_ilogb(long double __x) {return ilogbl(__x);}
+
+#undef ilogb
+#define ilogb(__x) __tg_ilogb(__tg_promote1((__x))(__x))
+
+// ldexp
+
+static float
+    _TG_ATTRS
+    __tg_ldexp(float __x, int __y) {return ldexpf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_ldexp(double __x, int __y) {return ldexp(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_ldexp(long double __x, int __y) {return ldexpl(__x, __y);}
+
+#undef ldexp
+#define ldexp(__x, __y) __tg_ldexp(__tg_promote1((__x))(__x), __y)
+
+// lgamma
+
+static float
+    _TG_ATTRS
+    __tg_lgamma(float __x) {return lgammaf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_lgamma(double __x) {return lgamma(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_lgamma(long double __x) {return lgammal(__x);}
+
+#undef lgamma
+#define lgamma(__x) __tg_lgamma(__tg_promote1((__x))(__x))
+
+// llrint
+
+static long long
+    _TG_ATTRS
+    __tg_llrint(float __x) {return llrintf(__x);}
+
+static long long
+    _TG_ATTRS
+    __tg_llrint(double __x) {return llrint(__x);}
+
+static long long
+    _TG_ATTRS
+    __tg_llrint(long double __x) {return llrintl(__x);}
+
+#undef llrint
+#define llrint(__x) __tg_llrint(__tg_promote1((__x))(__x))
+
+// llround
+
+static long long
+    _TG_ATTRS
+    __tg_llround(float __x) {return llroundf(__x);}
+
+static long long
+    _TG_ATTRS
+    __tg_llround(double __x) {return llround(__x);}
+
+static long long
+    _TG_ATTRS
+    __tg_llround(long double __x) {return llroundl(__x);}
+
+#undef llround
+#define llround(__x) __tg_llround(__tg_promote1((__x))(__x))
+
+// log10
+
+static float
+    _TG_ATTRS
+    __tg_log10(float __x) {return log10f(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_log10(double __x) {return log10(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_log10(long double __x) {return log10l(__x);}
+
+#undef log10
+#define log10(__x) __tg_log10(__tg_promote1((__x))(__x))
+
+// log1p
+
+static float
+    _TG_ATTRS
+    __tg_log1p(float __x) {return log1pf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_log1p(double __x) {return log1p(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_log1p(long double __x) {return log1pl(__x);}
+
+#undef log1p
+#define log1p(__x) __tg_log1p(__tg_promote1((__x))(__x))
+
+// log2
+
+static float
+    _TG_ATTRS
+    __tg_log2(float __x) {return log2f(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_log2(double __x) {return log2(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_log2(long double __x) {return log2l(__x);}
+
+#undef log2
+#define log2(__x) __tg_log2(__tg_promote1((__x))(__x))
+
+// logb
+
+static float
+    _TG_ATTRS
+    __tg_logb(float __x) {return logbf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_logb(double __x) {return logb(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_logb(long double __x) {return logbl(__x);}
+
+#undef logb
+#define logb(__x) __tg_logb(__tg_promote1((__x))(__x))
+
+// lrint
+
+static long
+    _TG_ATTRS
+    __tg_lrint(float __x) {return lrintf(__x);}
+
+static long
+    _TG_ATTRS
+    __tg_lrint(double __x) {return lrint(__x);}
+
+static long
+    _TG_ATTRS
+    __tg_lrint(long double __x) {return lrintl(__x);}
+
+#undef lrint
+#define lrint(__x) __tg_lrint(__tg_promote1((__x))(__x))
+
+// lround
+
+static long
+    _TG_ATTRS
+    __tg_lround(float __x) {return lroundf(__x);}
+
+static long
+    _TG_ATTRS
+    __tg_lround(double __x) {return lround(__x);}
+
+static long
+    _TG_ATTRS
+    __tg_lround(long double __x) {return lroundl(__x);}
+
+#undef lround
+#define lround(__x) __tg_lround(__tg_promote1((__x))(__x))
+
+// nearbyint
+
+static float
+    _TG_ATTRS
+    __tg_nearbyint(float __x) {return nearbyintf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_nearbyint(double __x) {return nearbyint(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_nearbyint(long double __x) {return nearbyintl(__x);}
+
+#undef nearbyint
+#define nearbyint(__x) __tg_nearbyint(__tg_promote1((__x))(__x))
+
+// nextafter
+
+static float
+    _TG_ATTRS
+    __tg_nextafter(float __x, float __y) {return nextafterf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_nextafter(double __x, double __y) {return nextafter(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_nextafter(long double __x, long double __y) {return nextafterl(__x, __y);}
+
+#undef nextafter
+#define nextafter(__x, __y) __tg_nextafter(__tg_promote2((__x), (__y))(__x), \
+                                           __tg_promote2((__x), (__y))(__y))
+
+// nexttoward
+
+static float
+    _TG_ATTRS
+    __tg_nexttoward(float __x, long double __y) {return nexttowardf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_nexttoward(double __x, long double __y) {return nexttoward(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_nexttoward(long double __x, long double __y) {return nexttowardl(__x, __y);}
+
+#undef nexttoward
+#define nexttoward(__x, __y) __tg_nexttoward(__tg_promote1((__x))(__x), (__y))
+
+// remainder
+
+static float
+    _TG_ATTRS
+    __tg_remainder(float __x, float __y) {return remainderf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_remainder(double __x, double __y) {return remainder(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_remainder(long double __x, long double __y) {return remainderl(__x, __y);}
+
+#undef remainder
+#define remainder(__x, __y) __tg_remainder(__tg_promote2((__x), (__y))(__x), \
+                                           __tg_promote2((__x), (__y))(__y))
+
+// remquo
+
+static float
+    _TG_ATTRS
+    __tg_remquo(float __x, float __y, int* __z)
+    {return remquof(__x, __y, __z);}
+
+static double
+    _TG_ATTRS
+    __tg_remquo(double __x, double __y, int* __z)
+    {return remquo(__x, __y, __z);}
+
+static long double
+    _TG_ATTRS
+    __tg_remquo(long double __x, long double __y, int* __z)
+    {return remquol(__x, __y, __z);}
+
+#undef remquo
+#define remquo(__x, __y, __z)                         \
+        __tg_remquo(__tg_promote2((__x), (__y))(__x), \
+                    __tg_promote2((__x), (__y))(__y), \
+                    (__z))
+
+// rint
+
+static float
+    _TG_ATTRS
+    __tg_rint(float __x) {return rintf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_rint(double __x) {return rint(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_rint(long double __x) {return rintl(__x);}
+
+#undef rint
+#define rint(__x) __tg_rint(__tg_promote1((__x))(__x))
+
+// round
+
+static float
+    _TG_ATTRS
+    __tg_round(float __x) {return roundf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_round(double __x) {return round(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_round(long double __x) {return roundl(__x);}
+
+#undef round
+#define round(__x) __tg_round(__tg_promote1((__x))(__x))
+
+// scalbn
+
+static float
+    _TG_ATTRS
+    __tg_scalbn(float __x, int __y) {return scalbnf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_scalbn(double __x, int __y) {return scalbn(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_scalbn(long double __x, int __y) {return scalbnl(__x, __y);}
+
+#undef scalbn
+#define scalbn(__x, __y) __tg_scalbn(__tg_promote1((__x))(__x), __y)
+
+// scalbln
+
+static float
+    _TG_ATTRS
+    __tg_scalbln(float __x, long __y) {return scalblnf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_scalbln(double __x, long __y) {return scalbln(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_scalbln(long double __x, long __y) {return scalblnl(__x, __y);}
+
+#undef scalbln
+#define scalbln(__x, __y) __tg_scalbln(__tg_promote1((__x))(__x), __y)
+
+// tgamma
+
+static float
+    _TG_ATTRS
+    __tg_tgamma(float __x) {return tgammaf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_tgamma(double __x) {return tgamma(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_tgamma(long double __x) {return tgammal(__x);}
+
+#undef tgamma
+#define tgamma(__x) __tg_tgamma(__tg_promote1((__x))(__x))
+
+// trunc
+
+static float
+    _TG_ATTRS
+    __tg_trunc(float __x) {return truncf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_trunc(double __x) {return trunc(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_trunc(long double __x) {return truncl(__x);}
+
+#undef trunc
+#define trunc(__x) __tg_trunc(__tg_promote1((__x))(__x))
+
+// carg
+
+static float
+    _TG_ATTRS
+    __tg_carg(float __x) {return atan2f(0.F, __x);}
+
+static double
+    _TG_ATTRS
+    __tg_carg(double __x) {return atan2(0., __x);}
+
+static long double
+    _TG_ATTRS
+    __tg_carg(long double __x) {return atan2l(0.L, __x);}
+
+static float
+    _TG_ATTRS
+    __tg_carg(float _Complex __x) {return cargf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_carg(double _Complex __x) {return carg(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_carg(long double _Complex __x) {return cargl(__x);}
+
+#undef carg
+#define carg(__x) __tg_carg(__tg_promote1((__x))(__x))
+
+// cimag
+
+static float
+    _TG_ATTRS
+    __tg_cimag(float __x) {return 0;}
+
+static double
+    _TG_ATTRS
+    __tg_cimag(double __x) {return 0;}
+
+static long double
+    _TG_ATTRS
+    __tg_cimag(long double __x) {return 0;}
+
+static float
+    _TG_ATTRS
+    __tg_cimag(float _Complex __x) {return cimagf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_cimag(double _Complex __x) {return cimag(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_cimag(long double _Complex __x) {return cimagl(__x);}
+
+#undef cimag
+#define cimag(__x) __tg_cimag(__tg_promote1((__x))(__x))
+
+// conj
+
+static float _Complex
+    _TG_ATTRS
+    __tg_conj(float __x) {return __x;}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_conj(double __x) {return __x;}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_conj(long double __x) {return __x;}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_conj(float _Complex __x) {return conjf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_conj(double _Complex __x) {return conj(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_conj(long double _Complex __x) {return conjl(__x);}
+
+#undef conj
+#define conj(__x) __tg_conj(__tg_promote1((__x))(__x))
+
+// cproj
+
+static float _Complex
+    _TG_ATTRS
+    __tg_cproj(float __x) {return cprojf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_cproj(double __x) {return cproj(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_cproj(long double __x) {return cprojl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_cproj(float _Complex __x) {return cprojf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_cproj(double _Complex __x) {return cproj(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_cproj(long double _Complex __x) {return cprojl(__x);}
+
+#undef cproj
+#define cproj(__x) __tg_cproj(__tg_promote1((__x))(__x))
+
+// creal
+
+static float
+    _TG_ATTRS
+    __tg_creal(float __x) {return __x;}
+
+static double
+    _TG_ATTRS
+    __tg_creal(double __x) {return __x;}
+
+static long double
+    _TG_ATTRS
+    __tg_creal(long double __x) {return __x;}
+
+static float
+    _TG_ATTRS
+    __tg_creal(float _Complex __x) {return crealf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_creal(double _Complex __x) {return creal(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_creal(long double _Complex __x) {return creall(__x);}
+
+#undef creal
+#define creal(__x) __tg_creal(__tg_promote1((__x))(__x))
+
+#undef _TG_ATTRSp
+#undef _TG_ATTRS
+
+#endif /* __cplusplus */
+#endif /* __TGMATH_H */
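A minimal sketch of how the overload/promotion machinery above lets one spelling of each math function dispatch on its argument type in C (illustrative only, not part of this prebuilt drop):

#include <tgmath.h>
#include <stdio.h>

int main(void) {
  float f = 0.5f;
  long double ld = 0.5L;
  double _Complex z = 0.5 + 1.0 * I;        /* <complex.h> comes in via <tgmath.h> */
  printf("%f\n", (double)sin(f));           /* resolves to sinf */
  printf("%Lf\n", sin(ld));                 /* resolves to sinl */
  printf("%f\n", creal(sin(z)));            /* resolves to csin */
  return 0;
}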
diff --git a/24.0.3/clang-include/tmmintrin.h b/24.0.3/clang-include/tmmintrin.h
new file mode 100644
index 0000000..0002890
--- /dev/null
+++ b/24.0.3/clang-include/tmmintrin.h
@@ -0,0 +1,221 @@
+/*===---- tmmintrin.h - SSSE3 intrinsics -----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __TMMINTRIN_H
+#define __TMMINTRIN_H
+
+#include <pmmintrin.h>
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("ssse3")))
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_abs_pi8(__m64 __a)
+{
+    return (__m64)__builtin_ia32_pabsb((__v8qi)__a);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_abs_epi8(__m128i __a)
+{
+    return (__m128i)__builtin_ia32_pabsb128((__v16qi)__a);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_abs_pi16(__m64 __a)
+{
+    return (__m64)__builtin_ia32_pabsw((__v4hi)__a);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_abs_epi16(__m128i __a)
+{
+    return (__m128i)__builtin_ia32_pabsw128((__v8hi)__a);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_abs_pi32(__m64 __a)
+{
+    return (__m64)__builtin_ia32_pabsd((__v2si)__a);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_abs_epi32(__m128i __a)
+{
+    return (__m128i)__builtin_ia32_pabsd128((__v4si)__a);
+}
+
+#define _mm_alignr_epi8(a, b, n) __extension__ ({ \
+  (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(a), \
+                                     (__v16qi)(__m128i)(b), (n)); })
+
+#define _mm_alignr_pi8(a, b, n) __extension__ ({ \
+  (__m64)__builtin_ia32_palignr((__v8qi)(__m64)(a), (__v8qi)(__m64)(b), (n)); })
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_hadd_epi16(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_phaddw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_hadd_epi32(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_phaddd128((__v4si)__a, (__v4si)__b);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_hadd_pi16(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_phaddw((__v4hi)__a, (__v4hi)__b);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_hadd_pi32(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_phaddd((__v2si)__a, (__v2si)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_hadds_epi16(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_phaddsw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_hadds_pi16(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_phaddsw((__v4hi)__a, (__v4hi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_hsub_epi16(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_phsubw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_hsub_epi32(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_phsubd128((__v4si)__a, (__v4si)__b);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_hsub_pi16(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_phsubw((__v4hi)__a, (__v4hi)__b);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_hsub_pi32(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_phsubd((__v2si)__a, (__v2si)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_hsubs_epi16(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_phsubsw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_hsubs_pi16(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_phsubsw((__v4hi)__a, (__v4hi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maddubs_epi16(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_maddubs_pi16(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_pmaddubsw((__v8qi)__a, (__v8qi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mulhrs_epi16(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_mulhrs_pi16(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_pmulhrsw((__v4hi)__a, (__v4hi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_shuffle_epi8(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_pshufb128((__v16qi)__a, (__v16qi)__b);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_shuffle_pi8(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_pshufb((__v8qi)__a, (__v8qi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sign_epi8(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_psignb128((__v16qi)__a, (__v16qi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sign_epi16(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_psignw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sign_epi32(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_psignd128((__v4si)__a, (__v4si)__b);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_sign_pi8(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_psignb((__v8qi)__a, (__v8qi)__b);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_sign_pi16(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_psignw((__v4hi)__a, (__v4hi)__b);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_sign_pi32(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_psignd((__v2si)__a, (__v2si)__b);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __TMMINTRIN_H */
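A small sketch using two of the SSSE3 helpers above (illustrative only, not part of this prebuilt drop; assumes an x86 target built with -mssse3):

#include <tmmintrin.h>
#include <stdio.h>

int main(void) {
  __m128i v = _mm_setr_epi8(-1, 2, -3, 4, -5, 6, -7, 8,
                            -9, 10, -11, 12, -13, 14, -15, 16);
  __m128i abs = _mm_abs_epi8(v);            /* per-byte absolute value */
  __m128i rev = _mm_shuffle_epi8(abs,       /* reverse the 16 bytes */
      _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
  unsigned char out[16];
  _mm_storeu_si128((__m128i *)out, rev);
  for (int i = 0; i < 16; ++i)
    printf("%d ", out[i]);
  printf("\n");
  return 0;
}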
diff --git a/24.0.3/clang-include/unwind.h b/24.0.3/clang-include/unwind.h
new file mode 100644
index 0000000..303d792
--- /dev/null
+++ b/24.0.3/clang-include/unwind.h
@@ -0,0 +1,282 @@
+/*===---- unwind.h - Stack unwinding ----------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/* See "Data Definitions for libgcc_s" in the Linux Standard Base.*/
+
+#ifndef __CLANG_UNWIND_H
+#define __CLANG_UNWIND_H
+
+#if defined(__APPLE__) && __has_include_next(<unwind.h>)
+/* Darwin (from 11.x on) provides an unwind.h. If that's available,
+ * use it. libunwind wraps some of its definitions in #ifdef _GNU_SOURCE,
+ * so define that around the include.*/
+# ifndef _GNU_SOURCE
+#  define _SHOULD_UNDEFINE_GNU_SOURCE
+#  define _GNU_SOURCE
+# endif
+// libunwind's unwind.h reflects the current visibility.  However, Mozilla
+// builds with -fvisibility=hidden and relies on gcc's unwind.h to reset the
+// visibility to default and export its contents.  gcc also allows users to
+// override its override by #defining HIDE_EXPORTS (but note, this only obeys
+// the user's -fvisibility setting; it doesn't hide any exports on its own).  We
+// imitate gcc's header here:
+# ifdef HIDE_EXPORTS
+#  include_next <unwind.h>
+# else
+#  pragma GCC visibility push(default)
+#  include_next <unwind.h>
+#  pragma GCC visibility pop
+# endif
+# ifdef _SHOULD_UNDEFINE_GNU_SOURCE
+#  undef _GNU_SOURCE
+#  undef _SHOULD_UNDEFINE_GNU_SOURCE
+# endif
+#else
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* It is a bit strange for a header to play with the visibility of the
+   symbols it declares, but this matches gcc's behavior and some programs
+   depend on it */
+#ifndef HIDE_EXPORTS
+#pragma GCC visibility push(default)
+#endif
+
+typedef uintptr_t _Unwind_Word;
+typedef intptr_t _Unwind_Sword;
+typedef uintptr_t _Unwind_Ptr;
+typedef uintptr_t _Unwind_Internal_Ptr;
+typedef uint64_t _Unwind_Exception_Class;
+
+typedef intptr_t _sleb128_t;
+typedef uintptr_t _uleb128_t;
+
+struct _Unwind_Context;
+struct _Unwind_Exception;
+typedef enum {
+  _URC_NO_REASON = 0,
+  _URC_FOREIGN_EXCEPTION_CAUGHT = 1,
+
+  _URC_FATAL_PHASE2_ERROR = 2,
+  _URC_FATAL_PHASE1_ERROR = 3,
+  _URC_NORMAL_STOP = 4,
+
+  _URC_END_OF_STACK = 5,
+  _URC_HANDLER_FOUND = 6,
+  _URC_INSTALL_CONTEXT = 7,
+  _URC_CONTINUE_UNWIND = 8
+} _Unwind_Reason_Code;
+
+typedef enum {
+  _UA_SEARCH_PHASE = 1,
+  _UA_CLEANUP_PHASE = 2,
+
+  _UA_HANDLER_FRAME = 4,
+  _UA_FORCE_UNWIND = 8,
+  _UA_END_OF_STACK = 16 /* gcc extension to C++ ABI */
+} _Unwind_Action;
+
+typedef void (*_Unwind_Exception_Cleanup_Fn)(_Unwind_Reason_Code,
+                                             struct _Unwind_Exception *);
+
+struct _Unwind_Exception {
+  _Unwind_Exception_Class exception_class;
+  _Unwind_Exception_Cleanup_Fn exception_cleanup;
+  _Unwind_Word private_1;
+  _Unwind_Word private_2;
+  /* The Itanium ABI requires that _Unwind_Exception objects are "double-word
+   * aligned".  GCC has interpreted this to mean "use the maximum useful
+   * alignment for the target"; so do we. */
+} __attribute__((__aligned__));
+
+typedef _Unwind_Reason_Code (*_Unwind_Stop_Fn)(int, _Unwind_Action,
+                                               _Unwind_Exception_Class,
+                                               struct _Unwind_Exception *,
+                                               struct _Unwind_Context *,
+                                               void *);
+
+typedef _Unwind_Reason_Code (*_Unwind_Personality_Fn)(
+    int, _Unwind_Action, _Unwind_Exception_Class, struct _Unwind_Exception *,
+    struct _Unwind_Context *);
+typedef _Unwind_Personality_Fn __personality_routine;
+
+typedef _Unwind_Reason_Code (*_Unwind_Trace_Fn)(struct _Unwind_Context *,
+                                                void *);
+
+#if defined(__arm__) && !defined(__APPLE__)
+
+typedef enum {
+  _UVRSC_CORE = 0,        /* integer register */
+  _UVRSC_VFP = 1,         /* vfp */
+  _UVRSC_WMMXD = 3,       /* Intel WMMX data register */
+  _UVRSC_WMMXC = 4        /* Intel WMMX control register */
+} _Unwind_VRS_RegClass;
+
+typedef enum {
+  _UVRSD_UINT32 = 0,
+  _UVRSD_VFPX = 1,
+  _UVRSD_UINT64 = 3,
+  _UVRSD_FLOAT = 4,
+  _UVRSD_DOUBLE = 5
+} _Unwind_VRS_DataRepresentation;
+
+typedef enum {
+  _UVRSR_OK = 0,
+  _UVRSR_NOT_IMPLEMENTED = 1,
+  _UVRSR_FAILED = 2
+} _Unwind_VRS_Result;
+
+_Unwind_VRS_Result _Unwind_VRS_Get(struct _Unwind_Context *__context,
+  _Unwind_VRS_RegClass __regclass,
+  uint32_t __regno,
+  _Unwind_VRS_DataRepresentation __representation,
+  void *__valuep);
+
+_Unwind_VRS_Result _Unwind_VRS_Set(struct _Unwind_Context *__context,
+  _Unwind_VRS_RegClass __regclass,
+  uint32_t __regno,
+  _Unwind_VRS_DataRepresentation __representation,
+  void *__valuep);
+
+static __inline__
+_Unwind_Word _Unwind_GetGR(struct _Unwind_Context *__context, int __index) {
+  _Unwind_Word __value;
+  _Unwind_VRS_Get(__context, _UVRSC_CORE, __index, _UVRSD_UINT32, &__value);
+  return __value;
+}
+
+static __inline__
+void _Unwind_SetGR(struct _Unwind_Context *__context, int __index,
+                   _Unwind_Word __value) {
+  _Unwind_VRS_Set(__context, _UVRSC_CORE, __index, _UVRSD_UINT32, &__value);
+}
+
+static __inline__
+_Unwind_Word _Unwind_GetIP(struct _Unwind_Context *__context) {
+  _Unwind_Word __ip = _Unwind_GetGR(__context, 15);
+  return __ip & ~(_Unwind_Word)(0x1); /* Remove thumb mode bit. */
+}
+
+static __inline__
+void _Unwind_SetIP(struct _Unwind_Context *__context, _Unwind_Word __value) {
+  _Unwind_Word __thumb_mode_bit = _Unwind_GetGR(__context, 15) & 0x1;
+  _Unwind_SetGR(__context, 15, __value | __thumb_mode_bit);
+}
+#else
+_Unwind_Word _Unwind_GetGR(struct _Unwind_Context *, int);
+void _Unwind_SetGR(struct _Unwind_Context *, int, _Unwind_Word);
+
+_Unwind_Word _Unwind_GetIP(struct _Unwind_Context *);
+void _Unwind_SetIP(struct _Unwind_Context *, _Unwind_Word);
+#endif
+
+
+_Unwind_Word _Unwind_GetIPInfo(struct _Unwind_Context *, int *);
+
+_Unwind_Word _Unwind_GetCFA(struct _Unwind_Context *);
+
+_Unwind_Word _Unwind_GetBSP(struct _Unwind_Context *);
+
+void *_Unwind_GetLanguageSpecificData(struct _Unwind_Context *);
+
+_Unwind_Ptr _Unwind_GetRegionStart(struct _Unwind_Context *);
+
+/* DWARF EH functions; currently not available on Darwin/ARM */
+#if !defined(__APPLE__) || !defined(__arm__)
+
+_Unwind_Reason_Code _Unwind_RaiseException(struct _Unwind_Exception *);
+_Unwind_Reason_Code _Unwind_ForcedUnwind(struct _Unwind_Exception *,
+                                         _Unwind_Stop_Fn, void *);
+void _Unwind_DeleteException(struct _Unwind_Exception *);
+void _Unwind_Resume(struct _Unwind_Exception *);
+_Unwind_Reason_Code _Unwind_Resume_or_Rethrow(struct _Unwind_Exception *);
+
+#endif
+
+_Unwind_Reason_Code _Unwind_Backtrace(_Unwind_Trace_Fn, void *);
+
+/* setjmp(3)/longjmp(3) stuff */
+typedef struct SjLj_Function_Context *_Unwind_FunctionContext_t;
+
+void _Unwind_SjLj_Register(_Unwind_FunctionContext_t);
+void _Unwind_SjLj_Unregister(_Unwind_FunctionContext_t);
+_Unwind_Reason_Code _Unwind_SjLj_RaiseException(struct _Unwind_Exception *);
+_Unwind_Reason_Code _Unwind_SjLj_ForcedUnwind(struct _Unwind_Exception *,
+                                              _Unwind_Stop_Fn, void *);
+void _Unwind_SjLj_Resume(struct _Unwind_Exception *);
+_Unwind_Reason_Code _Unwind_SjLj_Resume_or_Rethrow(struct _Unwind_Exception *);
+
+void *_Unwind_FindEnclosingFunction(void *);
+
+#ifdef __APPLE__
+
+_Unwind_Ptr _Unwind_GetDataRelBase(struct _Unwind_Context *)
+    __attribute__((__unavailable__));
+_Unwind_Ptr _Unwind_GetTextRelBase(struct _Unwind_Context *)
+    __attribute__((__unavailable__));
+
+/* Darwin-specific functions */
+void __register_frame(const void *);
+void __deregister_frame(const void *);
+
+struct dwarf_eh_bases {
+  uintptr_t tbase;
+  uintptr_t dbase;
+  uintptr_t func;
+};
+void *_Unwind_Find_FDE(const void *, struct dwarf_eh_bases *);
+
+void __register_frame_info_bases(const void *, void *, void *, void *)
+  __attribute__((__unavailable__));
+void __register_frame_info(const void *, void *) __attribute__((__unavailable__));
+void __register_frame_info_table_bases(const void *, void*, void *, void *)
+  __attribute__((__unavailable__));
+void __register_frame_info_table(const void *, void *)
+  __attribute__((__unavailable__));
+void __register_frame_table(const void *) __attribute__((__unavailable__));
+void __deregister_frame_info(const void *) __attribute__((__unavailable__));
+void __deregister_frame_info_bases(const void *)__attribute__((__unavailable__));
+
+#else
+
+_Unwind_Ptr _Unwind_GetDataRelBase(struct _Unwind_Context *);
+_Unwind_Ptr _Unwind_GetTextRelBase(struct _Unwind_Context *);
+
+#endif
+
+
+#ifndef HIDE_EXPORTS
+#pragma GCC visibility pop
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
+#endif /* __CLANG_UNWIND_H */
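A minimal backtrace sketch against the _Unwind_Backtrace / _Unwind_GetIP declarations above (illustrative only, not part of this prebuilt drop; link against the platform unwinder):

#include <unwind.h>
#include <stdio.h>

static _Unwind_Reason_Code print_frame(struct _Unwind_Context *ctx, void *arg) {
  int *depth = (int *)arg;
  printf("#%d pc=%p\n", (*depth)++, (void *)_Unwind_GetIP(ctx));
  return _URC_NO_REASON;                    /* keep walking */
}

int main(void) {
  int depth = 0;
  _Unwind_Backtrace(print_frame, &depth);   /* walks the current call stack */
  return 0;
}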
diff --git a/24.0.3/clang-include/vadefs.h b/24.0.3/clang-include/vadefs.h
new file mode 100644
index 0000000..7fe9a74
--- /dev/null
+++ b/24.0.3/clang-include/vadefs.h
@@ -0,0 +1,65 @@
+/* ===-------- vadefs.h ---------------------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/* Only include this if we are aiming for MSVC compatibility. */
+#ifndef _MSC_VER
+#include_next <vadefs.h>
+#else
+
+#ifndef __clang_vadefs_h
+#define __clang_vadefs_h
+
+#include_next <vadefs.h>
+
+/* Override macros from vadefs.h with definitions that work with Clang. */
+#ifdef _crt_va_start
+#undef _crt_va_start
+#define _crt_va_start(ap, param) __builtin_va_start(ap, param)
+#endif
+#ifdef _crt_va_end
+#undef _crt_va_end
+#define _crt_va_end(ap)          __builtin_va_end(ap)
+#endif
+#ifdef _crt_va_arg
+#undef _crt_va_arg
+#define _crt_va_arg(ap, type)    __builtin_va_arg(ap, type)
+#endif
+
+/* VS 2015 switched to double underscore names, which is an improvement, but now
+ * we have to intercept those names too.
+ */
+#ifdef __crt_va_start
+#undef __crt_va_start
+#define __crt_va_start(ap, param) __builtin_va_start(ap, param)
+#endif
+#ifdef __crt_va_end
+#undef __crt_va_end
+#define __crt_va_end(ap)          __builtin_va_end(ap)
+#endif
+#ifdef __crt_va_arg
+#undef __crt_va_arg
+#define __crt_va_arg(ap, type)    __builtin_va_arg(ap, type)
+#endif
+
+#endif
+#endif
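With the overrides above, the usual stdarg pattern works under clang in MSVC-compatibility mode because the _crt_va_*/__crt_va_* macros expand to the Clang builtins (illustrative sketch only, not part of this prebuilt drop):

#include <stdarg.h>
#include <stdio.h>

static int sum(int count, ...) {
  va_list ap;
  va_start(ap, count);           /* ends up in __builtin_va_start */
  int total = 0;
  for (int i = 0; i < count; ++i)
    total += va_arg(ap, int);
  va_end(ap);
  return total;
}

int main(void) { printf("%d\n", sum(3, 1, 2, 3)); return 0; }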
diff --git a/24.0.3/clang-include/varargs.h b/24.0.3/clang-include/varargs.h
new file mode 100644
index 0000000..b5477d0
--- /dev/null
+++ b/24.0.3/clang-include/varargs.h
@@ -0,0 +1,26 @@
+/*===---- varargs.h - Variable argument handling -------------------------------------===
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+* THE SOFTWARE.
+*
+*===-----------------------------------------------------------------------===
+*/
+#ifndef __VARARGS_H
+#define __VARARGS_H
+  #error "Please use <stdarg.h> instead of <varargs.h>"
+#endif
diff --git a/24.0.3/clang-include/vecintrin.h b/24.0.3/clang-include/vecintrin.h
new file mode 100644
index 0000000..ca7acb4
--- /dev/null
+++ b/24.0.3/clang-include/vecintrin.h
@@ -0,0 +1,8946 @@
+/*===---- vecintrin.h - Vector intrinsics ----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if defined(__s390x__) && defined(__VEC__)
+
+#define __ATTRS_ai __attribute__((__always_inline__))
+#define __ATTRS_o __attribute__((__overloadable__))
+#define __ATTRS_o_ai __attribute__((__overloadable__, __always_inline__))
+
+#define __constant(PARM) \
+  __attribute__((__enable_if__ ((PARM) == (PARM), \
+     "argument must be a constant integer")))
+#define __constant_range(PARM, LOW, HIGH) \
+  __attribute__((__enable_if__ ((PARM) >= (LOW) && (PARM) <= (HIGH), \
+     "argument must be a constant integer from " #LOW " to " #HIGH)))
+#define __constant_pow2_range(PARM, LOW, HIGH) \
+  __attribute__((__enable_if__ ((PARM) >= (LOW) && (PARM) <= (HIGH) && \
+                                ((PARM) & ((PARM) - 1)) == 0, \
+     "argument must be a constant power of 2 from " #LOW " to " #HIGH)))
+
+/*-- __lcbb -----------------------------------------------------------------*/
+
+extern __ATTRS_o unsigned int
+__lcbb(const void *__ptr, unsigned short __len)
+  __constant_pow2_range(__len, 64, 4096);
+
+#define __lcbb(X, Y) ((__typeof__((__lcbb)((X), (Y)))) \
+  __builtin_s390_lcbb((X), __builtin_constant_p((Y))? \
+                           ((Y) == 64 ? 0 : \
+                            (Y) == 128 ? 1 : \
+                            (Y) == 256 ? 2 : \
+                            (Y) == 512 ? 3 : \
+                            (Y) == 1024 ? 4 : \
+                            (Y) == 2048 ? 5 : \
+                            (Y) == 4096 ? 6 : 0) : 0))
+
+/*-- vec_extract ------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai signed char
+vec_extract(vector signed char __vec, int __index) {
+  return __vec[__index & 15];
+}
+
+static inline __ATTRS_o_ai unsigned char
+vec_extract(vector bool char __vec, int __index) {
+  return __vec[__index & 15];
+}
+
+static inline __ATTRS_o_ai unsigned char
+vec_extract(vector unsigned char __vec, int __index) {
+  return __vec[__index & 15];
+}
+
+static inline __ATTRS_o_ai signed short
+vec_extract(vector signed short __vec, int __index) {
+  return __vec[__index & 7];
+}
+
+static inline __ATTRS_o_ai unsigned short
+vec_extract(vector bool short __vec, int __index) {
+  return __vec[__index & 7];
+}
+
+static inline __ATTRS_o_ai unsigned short
+vec_extract(vector unsigned short __vec, int __index) {
+  return __vec[__index & 7];
+}
+
+static inline __ATTRS_o_ai signed int
+vec_extract(vector signed int __vec, int __index) {
+  return __vec[__index & 3];
+}
+
+static inline __ATTRS_o_ai unsigned int
+vec_extract(vector bool int __vec, int __index) {
+  return __vec[__index & 3];
+}
+
+static inline __ATTRS_o_ai unsigned int
+vec_extract(vector unsigned int __vec, int __index) {
+  return __vec[__index & 3];
+}
+
+static inline __ATTRS_o_ai signed long long
+vec_extract(vector signed long long __vec, int __index) {
+  return __vec[__index & 1];
+}
+
+static inline __ATTRS_o_ai unsigned long long
+vec_extract(vector bool long long __vec, int __index) {
+  return __vec[__index & 1];
+}
+
+static inline __ATTRS_o_ai unsigned long long
+vec_extract(vector unsigned long long __vec, int __index) {
+  return __vec[__index & 1];
+}
+
+static inline __ATTRS_o_ai double
+vec_extract(vector double __vec, int __index) {
+  return __vec[__index & 1];
+}
+
+/*-- vec_insert -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_insert(signed char __scalar, vector signed char __vec, int __index) {
+  __vec[__index & 15] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_insert(unsigned char __scalar, vector bool char __vec, int __index) {
+  vector unsigned char __newvec = (vector unsigned char)__vec;
+  __newvec[__index & 15] = (unsigned char)__scalar;
+  return __newvec;
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_insert(unsigned char __scalar, vector unsigned char __vec, int __index) {
+  __vec[__index & 15] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_insert(signed short __scalar, vector signed short __vec, int __index) {
+  __vec[__index & 7] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_insert(unsigned short __scalar, vector bool short __vec, int __index) {
+  vector unsigned short __newvec = (vector unsigned short)__vec;
+  __newvec[__index & 7] = (unsigned short)__scalar;
+  return __newvec;
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_insert(unsigned short __scalar, vector unsigned short __vec, int __index) {
+  __vec[__index & 7] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_insert(signed int __scalar, vector signed int __vec, int __index) {
+  __vec[__index & 3] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_insert(unsigned int __scalar, vector bool int __vec, int __index) {
+  vector unsigned int __newvec = (vector unsigned int)__vec;
+  __newvec[__index & 3] = __scalar;
+  return __newvec;
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_insert(unsigned int __scalar, vector unsigned int __vec, int __index) {
+  __vec[__index & 3] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_insert(signed long long __scalar, vector signed long long __vec,
+           int __index) {
+  __vec[__index & 1] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_insert(unsigned long long __scalar, vector bool long long __vec,
+           int __index) {
+  vector unsigned long long __newvec = (vector unsigned long long)__vec;
+  __newvec[__index & 1] = __scalar;
+  return __newvec;
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_insert(unsigned long long __scalar, vector unsigned long long __vec,
+           int __index) {
+  __vec[__index & 1] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector double
+vec_insert(double __scalar, vector double __vec, int __index) {
+  __vec[__index & 1] = __scalar;
+  return __vec;
+}
+
+/*-- vec_promote ------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_promote(signed char __scalar, int __index) {
+  const vector signed char __zero = (vector signed char)0;
+  vector signed char __vec = __builtin_shufflevector(__zero, __zero,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
+  __vec[__index & 15] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_promote(unsigned char __scalar, int __index) {
+  const vector unsigned char __zero = (vector unsigned char)0;
+  vector unsigned char __vec = __builtin_shufflevector(__zero, __zero,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
+  __vec[__index & 15] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_promote(signed short __scalar, int __index) {
+  const vector signed short __zero = (vector signed short)0;
+  vector signed short __vec = __builtin_shufflevector(__zero, __zero,
+                                -1, -1, -1, -1, -1, -1, -1, -1);
+  __vec[__index & 7] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_promote(unsigned short __scalar, int __index) {
+  const vector unsigned short __zero = (vector unsigned short)0;
+  vector unsigned short __vec = __builtin_shufflevector(__zero, __zero,
+                                  -1, -1, -1, -1, -1, -1, -1, -1);
+  __vec[__index & 7] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_promote(signed int __scalar, int __index) {
+  const vector signed int __zero = (vector signed int)0;
+  vector signed int __vec = __builtin_shufflevector(__zero, __zero,
+                                                    -1, -1, -1, -1);
+  __vec[__index & 3] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_promote(unsigned int __scalar, int __index) {
+  const vector unsigned int __zero = (vector unsigned int)0;
+  vector unsigned int __vec = __builtin_shufflevector(__zero, __zero,
+                                                      -1, -1, -1, -1);
+  __vec[__index & 3] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_promote(signed long long __scalar, int __index) {
+  const vector signed long long __zero = (vector signed long long)0;
+  vector signed long long __vec = __builtin_shufflevector(__zero, __zero,
+                                                          -1, -1);
+  __vec[__index & 1] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_promote(unsigned long long __scalar, int __index) {
+  const vector unsigned long long __zero = (vector unsigned long long)0;
+  vector unsigned long long __vec = __builtin_shufflevector(__zero, __zero,
+                                                            -1, -1);
+  __vec[__index & 1] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector double
+vec_promote(double __scalar, int __index) {
+  const vector double __zero = (vector double)0;
+  vector double __vec = __builtin_shufflevector(__zero, __zero, -1, -1);
+  __vec[__index & 1] = __scalar;
+  return __vec;
+}
+
+/*-- vec_insert_and_zero ----------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_insert_and_zero(const signed char *__ptr) {
+  vector signed char __vec = (vector signed char)0;
+  __vec[7] = *__ptr;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_insert_and_zero(const unsigned char *__ptr) {
+  vector unsigned char __vec = (vector unsigned char)0;
+  __vec[7] = *__ptr;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_insert_and_zero(const signed short *__ptr) {
+  vector signed short __vec = (vector signed short)0;
+  __vec[3] = *__ptr;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_insert_and_zero(const unsigned short *__ptr) {
+  vector unsigned short __vec = (vector unsigned short)0;
+  __vec[3] = *__ptr;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_insert_and_zero(const signed int *__ptr) {
+  vector signed int __vec = (vector signed int)0;
+  __vec[1] = *__ptr;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_insert_and_zero(const unsigned int *__ptr) {
+  vector unsigned int __vec = (vector unsigned int)0;
+  __vec[1] = *__ptr;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_insert_and_zero(const signed long long *__ptr) {
+  vector signed long long __vec = (vector signed long long)0;
+  __vec[0] = *__ptr;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_insert_and_zero(const unsigned long long *__ptr) {
+  vector unsigned long long __vec = (vector unsigned long long)0;
+  __vec[0] = *__ptr;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector double
+vec_insert_and_zero(const double *__ptr) {
+  vector double __vec = (vector double)0;
+  __vec[0] = *__ptr;
+  return __vec;
+}
+
+/*-- vec_perm ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_perm(vector signed char __a, vector signed char __b,
+         vector unsigned char __c) {
+  return (vector signed char)__builtin_s390_vperm(
+           (vector unsigned char)__a, (vector unsigned char)__b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_perm(vector unsigned char __a, vector unsigned char __b,
+         vector unsigned char __c) {
+  return (vector unsigned char)__builtin_s390_vperm(
+           (vector unsigned char)__a, (vector unsigned char)__b, __c);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_perm(vector bool char __a, vector bool char __b,
+         vector unsigned char __c) {
+  return (vector bool char)__builtin_s390_vperm(
+           (vector unsigned char)__a, (vector unsigned char)__b, __c);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_perm(vector signed short __a, vector signed short __b,
+         vector unsigned char __c) {
+  return (vector signed short)__builtin_s390_vperm(
+           (vector unsigned char)__a, (vector unsigned char)__b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_perm(vector unsigned short __a, vector unsigned short __b,
+         vector unsigned char __c) {
+  return (vector unsigned short)__builtin_s390_vperm(
+           (vector unsigned char)__a, (vector unsigned char)__b, __c);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_perm(vector bool short __a, vector bool short __b,
+         vector unsigned char __c) {
+  return (vector bool short)__builtin_s390_vperm(
+           (vector unsigned char)__a, (vector unsigned char)__b, __c);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_perm(vector signed int __a, vector signed int __b,
+         vector unsigned char __c) {
+  return (vector signed int)__builtin_s390_vperm(
+           (vector unsigned char)__a, (vector unsigned char)__b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_perm(vector unsigned int __a, vector unsigned int __b,
+         vector unsigned char __c) {
+  return (vector unsigned int)__builtin_s390_vperm(
+           (vector unsigned char)__a, (vector unsigned char)__b, __c);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_perm(vector bool int __a, vector bool int __b,
+         vector unsigned char __c) {
+  return (vector bool int)__builtin_s390_vperm(
+           (vector unsigned char)__a, (vector unsigned char)__b, __c);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_perm(vector signed long long __a, vector signed long long __b,
+         vector unsigned char __c) {
+  return (vector signed long long)__builtin_s390_vperm(
+           (vector unsigned char)__a, (vector unsigned char)__b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_perm(vector unsigned long long __a, vector unsigned long long __b,
+         vector unsigned char __c) {
+  return (vector unsigned long long)__builtin_s390_vperm(
+           (vector unsigned char)__a, (vector unsigned char)__b, __c);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_perm(vector bool long long __a, vector bool long long __b,
+         vector unsigned char __c) {
+  return (vector bool long long)__builtin_s390_vperm(
+           (vector unsigned char)__a, (vector unsigned char)__b, __c);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_perm(vector double __a, vector double __b,
+         vector unsigned char __c) {
+  return (vector double)__builtin_s390_vperm(
+           (vector unsigned char)__a, (vector unsigned char)__b, __c);
+}
+
+/*-- vec_permi --------------------------------------------------------------*/
+
+extern __ATTRS_o vector signed long long
+vec_permi(vector signed long long __a, vector signed long long __b, int __c)
+  __constant_range(__c, 0, 3);
+
+extern __ATTRS_o vector unsigned long long
+vec_permi(vector unsigned long long __a, vector unsigned long long __b, int __c)
+  __constant_range(__c, 0, 3);
+
+extern __ATTRS_o vector bool long long
+vec_permi(vector bool long long __a, vector bool long long __b, int __c)
+  __constant_range(__c, 0, 3);
+
+extern __ATTRS_o vector double
+vec_permi(vector double __a, vector double __b, int __c)
+  __constant_range(__c, 0, 3);
+
+#define vec_permi(X, Y, Z) ((__typeof__((vec_permi)((X), (Y), (Z)))) \
+  __builtin_s390_vpdi((vector unsigned long long)(X), \
+                      (vector unsigned long long)(Y), \
+                      (((Z) & 2) << 1) | ((Z) & 1)))
+
+/*-- vec_sel ----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_sel(vector signed char __a, vector signed char __b,
+        vector unsigned char __c) {
+  return ((vector signed char)__c & __b) | (~(vector signed char)__c & __a);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_sel(vector signed char __a, vector signed char __b, vector bool char __c) {
+  return ((vector signed char)__c & __b) | (~(vector signed char)__c & __a);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_sel(vector bool char __a, vector bool char __b, vector unsigned char __c) {
+  return ((vector bool char)__c & __b) | (~(vector bool char)__c & __a);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_sel(vector bool char __a, vector bool char __b, vector bool char __c) {
+  return (__c & __b) | (~__c & __a);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_sel(vector unsigned char __a, vector unsigned char __b,
+        vector unsigned char __c) {
+  return (__c & __b) | (~__c & __a);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_sel(vector unsigned char __a, vector unsigned char __b,
+        vector bool char __c) {
+  return ((vector unsigned char)__c & __b) | (~(vector unsigned char)__c & __a);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_sel(vector signed short __a, vector signed short __b,
+        vector unsigned short __c) {
+  return ((vector signed short)__c & __b) | (~(vector signed short)__c & __a);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_sel(vector signed short __a, vector signed short __b,
+        vector bool short __c) {
+  return ((vector signed short)__c & __b) | (~(vector signed short)__c & __a);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_sel(vector bool short __a, vector bool short __b,
+        vector unsigned short __c) {
+  return ((vector bool short)__c & __b) | (~(vector bool short)__c & __a);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_sel(vector bool short __a, vector bool short __b, vector bool short __c) {
+  return (__c & __b) | (~__c & __a);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_sel(vector unsigned short __a, vector unsigned short __b,
+        vector unsigned short __c) {
+  return (__c & __b) | (~__c & __a);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_sel(vector unsigned short __a, vector unsigned short __b,
+        vector bool short __c) {
+  return (((vector unsigned short)__c & __b) |
+          (~(vector unsigned short)__c & __a));
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_sel(vector signed int __a, vector signed int __b,
+        vector unsigned int __c) {
+  return ((vector signed int)__c & __b) | (~(vector signed int)__c & __a);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_sel(vector signed int __a, vector signed int __b, vector bool int __c) {
+  return ((vector signed int)__c & __b) | (~(vector signed int)__c & __a);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_sel(vector bool int __a, vector bool int __b, vector unsigned int __c) {
+  return ((vector bool int)__c & __b) | (~(vector bool int)__c & __a);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_sel(vector bool int __a, vector bool int __b, vector bool int __c) {
+  return (__c & __b) | (~__c & __a);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_sel(vector unsigned int __a, vector unsigned int __b,
+        vector unsigned int __c) {
+  return (__c & __b) | (~__c & __a);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_sel(vector unsigned int __a, vector unsigned int __b, vector bool int __c) {
+  return ((vector unsigned int)__c & __b) | (~(vector unsigned int)__c & __a);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_sel(vector signed long long __a, vector signed long long __b,
+        vector unsigned long long __c) {
+  return (((vector signed long long)__c & __b) |
+          (~(vector signed long long)__c & __a));
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_sel(vector signed long long __a, vector signed long long __b,
+        vector bool long long __c) {
+  return (((vector signed long long)__c & __b) |
+          (~(vector signed long long)__c & __a));
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_sel(vector bool long long __a, vector bool long long __b,
+        vector unsigned long long __c) {
+  return (((vector bool long long)__c & __b) |
+          (~(vector bool long long)__c & __a));
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_sel(vector bool long long __a, vector bool long long __b,
+        vector bool long long __c) {
+  return (__c & __b) | (~__c & __a);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_sel(vector unsigned long long __a, vector unsigned long long __b,
+        vector unsigned long long __c) {
+  return (__c & __b) | (~__c & __a);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_sel(vector unsigned long long __a, vector unsigned long long __b,
+        vector bool long long __c) {
+  return (((vector unsigned long long)__c & __b) |
+          (~(vector unsigned long long)__c & __a));
+}
+
+static inline __ATTRS_o_ai vector double
+vec_sel(vector double __a, vector double __b, vector unsigned long long __c) {
+  return (vector double)((__c & (vector unsigned long long)__b) |
+                         (~__c & (vector unsigned long long)__a));
+}
+
+static inline __ATTRS_o_ai vector double
+vec_sel(vector double __a, vector double __b, vector bool long long __c) {
+  vector unsigned long long __ac = (vector unsigned long long)__a;
+  vector unsigned long long __bc = (vector unsigned long long)__b;
+  vector unsigned long long __cc = (vector unsigned long long)__c;
+  return (vector double)((__cc & __bc) | (~__cc & __ac));
+}
+
+/*-- vec_gather_element -----------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed int
+vec_gather_element(vector signed int __vec, vector unsigned int __offset,
+                   const signed int *__ptr, int __index)
+  __constant_range(__index, 0, 3) {
+  __vec[__index] = *(const signed int *)(
+    (__INTPTR_TYPE__)__ptr + (__INTPTR_TYPE__)__offset[__index]);
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_gather_element(vector bool int __vec, vector unsigned int __offset,
+                   const unsigned int *__ptr, int __index)
+  __constant_range(__index, 0, 3) {
+  __vec[__index] = *(const unsigned int *)(
+    (__INTPTR_TYPE__)__ptr + (__INTPTR_TYPE__)__offset[__index]);
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_gather_element(vector unsigned int __vec, vector unsigned int __offset,
+                   const unsigned int *__ptr, int __index)
+  __constant_range(__index, 0, 3) {
+  __vec[__index] = *(const unsigned int *)(
+    (__INTPTR_TYPE__)__ptr + (__INTPTR_TYPE__)__offset[__index]);
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_gather_element(vector signed long long __vec,
+                   vector unsigned long long __offset,
+                   const signed long long *__ptr, int __index)
+  __constant_range(__index, 0, 1) {
+  __vec[__index] = *(const signed long long *)(
+    (__INTPTR_TYPE__)__ptr + (__INTPTR_TYPE__)__offset[__index]);
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_gather_element(vector bool long long __vec,
+                   vector unsigned long long __offset,
+                   const unsigned long long *__ptr, int __index)
+  __constant_range(__index, 0, 1) {
+  __vec[__index] = *(const unsigned long long *)(
+    (__INTPTR_TYPE__)__ptr + (__INTPTR_TYPE__)__offset[__index]);
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_gather_element(vector unsigned long long __vec,
+                   vector unsigned long long __offset,
+                   const unsigned long long *__ptr, int __index)
+  __constant_range(__index, 0, 1) {
+  __vec[__index] = *(const unsigned long long *)(
+    (__INTPTR_TYPE__)__ptr + (__INTPTR_TYPE__)__offset[__index]);
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector double
+vec_gather_element(vector double __vec, vector unsigned long long __offset,
+                   const double *__ptr, int __index)
+  __constant_range(__index, 0, 1) {
+  __vec[__index] = *(const double *)(
+    (__INTPTR_TYPE__)__ptr + (__INTPTR_TYPE__)__offset[__index]);
+  return __vec;
+}
+
+/*-- vec_scatter_element ----------------------------------------------------*/
+
+static inline __ATTRS_o_ai void
+vec_scatter_element(vector signed int __vec, vector unsigned int __offset,
+                    signed int *__ptr, int __index)
+  __constant_range(__index, 0, 3) {
+  *(signed int *)((__INTPTR_TYPE__)__ptr + __offset[__index]) =
+    __vec[__index];
+}
+
+static inline __ATTRS_o_ai void
+vec_scatter_element(vector bool int __vec, vector unsigned int __offset,
+                    unsigned int *__ptr, int __index)
+  __constant_range(__index, 0, 3) {
+  *(unsigned int *)((__INTPTR_TYPE__)__ptr + __offset[__index]) =
+    __vec[__index];
+}
+
+static inline __ATTRS_o_ai void
+vec_scatter_element(vector unsigned int __vec, vector unsigned int __offset,
+                    unsigned int *__ptr, int __index)
+  __constant_range(__index, 0, 3) {
+  *(unsigned int *)((__INTPTR_TYPE__)__ptr + __offset[__index]) =
+    __vec[__index];
+}
+
+static inline __ATTRS_o_ai void
+vec_scatter_element(vector signed long long __vec,
+                    vector unsigned long long __offset,
+                    signed long long *__ptr, int __index)
+  __constant_range(__index, 0, 1) {
+  *(signed long long *)((__INTPTR_TYPE__)__ptr + __offset[__index]) =
+    __vec[__index];
+}
+
+static inline __ATTRS_o_ai void
+vec_scatter_element(vector bool long long __vec,
+                    vector unsigned long long __offset,
+                    unsigned long long *__ptr, int __index)
+  __constant_range(__index, 0, 1) {
+  *(unsigned long long *)((__INTPTR_TYPE__)__ptr + __offset[__index]) =
+    __vec[__index];
+}
+
+static inline __ATTRS_o_ai void
+vec_scatter_element(vector unsigned long long __vec,
+                    vector unsigned long long __offset,
+                    unsigned long long *__ptr, int __index)
+  __constant_range(__index, 0, 1) {
+  *(unsigned long long *)((__INTPTR_TYPE__)__ptr + __offset[__index]) =
+    __vec[__index];
+}
+
+static inline __ATTRS_o_ai void
+vec_scatter_element(vector double __vec, vector unsigned long long __offset,
+                    double *__ptr, int __index)
+  __constant_range(__index, 0, 1) {
+  *(double *)((__INTPTR_TYPE__)__ptr + __offset[__index]) =
+    __vec[__index];
+}
+
+/*-- vec_xld2 ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_xld2(long __offset, const signed char *__ptr) {
+  return *(const vector signed char *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_xld2(long __offset, const unsigned char *__ptr) {
+  return *(const vector unsigned char *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_xld2(long __offset, const signed short *__ptr) {
+  return *(const vector signed short *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_xld2(long __offset, const unsigned short *__ptr) {
+  return *(const vector unsigned short *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_xld2(long __offset, const signed int *__ptr) {
+  return *(const vector signed int *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_xld2(long __offset, const unsigned int *__ptr) {
+  return *(const vector unsigned int *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_xld2(long __offset, const signed long long *__ptr) {
+  return *(const vector signed long long *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_xld2(long __offset, const unsigned long long *__ptr) {
+  return *(const vector unsigned long long *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_xld2(long __offset, const double *__ptr) {
+  return *(const vector double *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+/*-- vec_xlw4 ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_xlw4(long __offset, const signed char *__ptr) {
+  return *(const vector signed char *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_xlw4(long __offset, const unsigned char *__ptr) {
+  return *(const vector unsigned char *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_xlw4(long __offset, const signed short *__ptr) {
+  return *(const vector signed short *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_xlw4(long __offset, const unsigned short *__ptr) {
+  return *(const vector unsigned short *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_xlw4(long __offset, const signed int *__ptr) {
+  return *(const vector signed int *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_xlw4(long __offset, const unsigned int *__ptr) {
+  return *(const vector unsigned int *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+/*-- vec_xstd2 --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai void
+vec_xstd2(vector signed char __vec, long __offset, signed char *__ptr) {
+  *(vector signed char *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xstd2(vector unsigned char __vec, long __offset, unsigned char *__ptr) {
+  *(vector unsigned char *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xstd2(vector signed short __vec, long __offset, signed short *__ptr) {
+  *(vector signed short *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xstd2(vector unsigned short __vec, long __offset, unsigned short *__ptr) {
+  *(vector unsigned short *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xstd2(vector signed int __vec, long __offset, signed int *__ptr) {
+  *(vector signed int *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xstd2(vector unsigned int __vec, long __offset, unsigned int *__ptr) {
+  *(vector unsigned int *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xstd2(vector signed long long __vec, long __offset,
+          signed long long *__ptr) {
+  *(vector signed long long *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xstd2(vector unsigned long long __vec, long __offset,
+          unsigned long long *__ptr) {
+  *(vector unsigned long long *)((__INTPTR_TYPE__)__ptr + __offset) =
+    __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xstd2(vector double __vec, long __offset, double *__ptr) {
+  *(vector double *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+/*-- vec_xstw4 --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai void
+vec_xstw4(vector signed char __vec, long __offset, signed char *__ptr) {
+  *(vector signed char *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xstw4(vector unsigned char __vec, long __offset, unsigned char *__ptr) {
+  *(vector unsigned char *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xstw4(vector signed short __vec, long __offset, signed short *__ptr) {
+  *(vector signed short *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xstw4(vector unsigned short __vec, long __offset, unsigned short *__ptr) {
+  *(vector unsigned short *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xstw4(vector signed int __vec, long __offset, signed int *__ptr) {
+  *(vector signed int *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xstw4(vector unsigned int __vec, long __offset, unsigned int *__ptr) {
+  *(vector unsigned int *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+/*-- vec_load_bndry ---------------------------------------------------------*/
+
+extern __ATTRS_o vector signed char
+vec_load_bndry(const signed char *__ptr, unsigned short __len)
+  __constant_pow2_range(__len, 64, 4096);
+
+extern __ATTRS_o vector unsigned char
+vec_load_bndry(const unsigned char *__ptr, unsigned short __len)
+  __constant_pow2_range(__len, 64, 4096);
+
+extern __ATTRS_o vector signed short
+vec_load_bndry(const signed short *__ptr, unsigned short __len)
+  __constant_pow2_range(__len, 64, 4096);
+
+extern __ATTRS_o vector unsigned short
+vec_load_bndry(const unsigned short *__ptr, unsigned short __len)
+  __constant_pow2_range(__len, 64, 4096);
+
+extern __ATTRS_o vector signed int
+vec_load_bndry(const signed int *__ptr, unsigned short __len)
+  __constant_pow2_range(__len, 64, 4096);
+
+extern __ATTRS_o vector unsigned int
+vec_load_bndry(const unsigned int *__ptr, unsigned short __len)
+  __constant_pow2_range(__len, 64, 4096);
+
+extern __ATTRS_o vector signed long long
+vec_load_bndry(const signed long long *__ptr, unsigned short __len)
+  __constant_pow2_range(__len, 64, 4096);
+
+extern __ATTRS_o vector unsigned long long
+vec_load_bndry(const unsigned long long *__ptr, unsigned short __len)
+  __constant_pow2_range(__len, 64, 4096);
+
+extern __ATTRS_o vector double
+vec_load_bndry(const double *__ptr, unsigned short __len)
+  __constant_pow2_range(__len, 64, 4096);
+
+#define vec_load_bndry(X, Y) ((__typeof__((vec_load_bndry)((X), (Y)))) \
+  __builtin_s390_vlbb((X), ((Y) == 64 ? 0 : \
+                            (Y) == 128 ? 1 : \
+                            (Y) == 256 ? 2 : \
+                            (Y) == 512 ? 3 : \
+                            (Y) == 1024 ? 4 : \
+                            (Y) == 2048 ? 5 : \
+                            (Y) == 4096 ? 6 : -1)))
+
+/*-- vec_load_len -----------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_load_len(const signed char *__ptr, unsigned int __len) {
+  return (vector signed char)__builtin_s390_vll(__len, __ptr);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_load_len(const unsigned char *__ptr, unsigned int __len) {
+  return (vector unsigned char)__builtin_s390_vll(__len, __ptr);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_load_len(const signed short *__ptr, unsigned int __len) {
+  return (vector signed short)__builtin_s390_vll(__len, __ptr);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_load_len(const unsigned short *__ptr, unsigned int __len) {
+  return (vector unsigned short)__builtin_s390_vll(__len, __ptr);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_load_len(const signed int *__ptr, unsigned int __len) {
+  return (vector signed int)__builtin_s390_vll(__len, __ptr);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_load_len(const unsigned int *__ptr, unsigned int __len) {
+  return (vector unsigned int)__builtin_s390_vll(__len, __ptr);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_load_len(const signed long long *__ptr, unsigned int __len) {
+  return (vector signed long long)__builtin_s390_vll(__len, __ptr);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_load_len(const unsigned long long *__ptr, unsigned int __len) {
+  return (vector unsigned long long)__builtin_s390_vll(__len, __ptr);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_load_len(const double *__ptr, unsigned int __len) {
+  return (vector double)__builtin_s390_vll(__len, __ptr);
+}
+
+/*-- vec_store_len ----------------------------------------------------------*/
+
+static inline __ATTRS_o_ai void
+vec_store_len(vector signed char __vec, signed char *__ptr,
+              unsigned int __len) {
+  __builtin_s390_vstl((vector signed char)__vec, __len, __ptr);
+}
+
+static inline __ATTRS_o_ai void
+vec_store_len(vector unsigned char __vec, unsigned char *__ptr,
+              unsigned int __len) {
+  __builtin_s390_vstl((vector signed char)__vec, __len, __ptr);
+}
+
+static inline __ATTRS_o_ai void
+vec_store_len(vector signed short __vec, signed short *__ptr,
+              unsigned int __len) {
+  __builtin_s390_vstl((vector signed char)__vec, __len, __ptr);
+}
+
+static inline __ATTRS_o_ai void
+vec_store_len(vector unsigned short __vec, unsigned short *__ptr,
+              unsigned int __len) {
+  __builtin_s390_vstl((vector signed char)__vec, __len, __ptr);
+}
+
+static inline __ATTRS_o_ai void
+vec_store_len(vector signed int __vec, signed int *__ptr,
+              unsigned int __len) {
+  __builtin_s390_vstl((vector signed char)__vec, __len, __ptr);
+}
+
+static inline __ATTRS_o_ai void
+vec_store_len(vector unsigned int __vec, unsigned int *__ptr,
+              unsigned int __len) {
+  __builtin_s390_vstl((vector signed char)__vec, __len, __ptr);
+}
+
+static inline __ATTRS_o_ai void
+vec_store_len(vector signed long long __vec, signed long long *__ptr,
+              unsigned int __len) {
+  __builtin_s390_vstl((vector signed char)__vec, __len, __ptr);
+}
+
+static inline __ATTRS_o_ai void
+vec_store_len(vector unsigned long long __vec, unsigned long long *__ptr,
+              unsigned int __len) {
+  __builtin_s390_vstl((vector signed char)__vec, __len, __ptr);
+}
+
+static inline __ATTRS_o_ai void
+vec_store_len(vector double __vec, double *__ptr,
+              unsigned int __len) {
+  __builtin_s390_vstl((vector signed char)__vec, __len, __ptr);
+}
+
+/*-- vec_load_pair ----------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed long long
+vec_load_pair(signed long long __a, signed long long __b) {
+  return (vector signed long long)(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_load_pair(unsigned long long __a, unsigned long long __b) {
+  return (vector unsigned long long)(__a, __b);
+}
+
+/*-- vec_genmask ------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_genmask(unsigned short __mask)
+  __constant(__mask) {
+  return (vector unsigned char)(
+    __mask & 0x8000 ? 0xff : 0,
+    __mask & 0x4000 ? 0xff : 0,
+    __mask & 0x2000 ? 0xff : 0,
+    __mask & 0x1000 ? 0xff : 0,
+    __mask & 0x0800 ? 0xff : 0,
+    __mask & 0x0400 ? 0xff : 0,
+    __mask & 0x0200 ? 0xff : 0,
+    __mask & 0x0100 ? 0xff : 0,
+    __mask & 0x0080 ? 0xff : 0,
+    __mask & 0x0040 ? 0xff : 0,
+    __mask & 0x0020 ? 0xff : 0,
+    __mask & 0x0010 ? 0xff : 0,
+    __mask & 0x0008 ? 0xff : 0,
+    __mask & 0x0004 ? 0xff : 0,
+    __mask & 0x0002 ? 0xff : 0,
+    __mask & 0x0001 ? 0xff : 0);
+}
+
+/*-- vec_genmasks_* ---------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_genmasks_8(unsigned char __first, unsigned char __last)
+  __constant(__first) __constant(__last) {
+  unsigned char __bit1 = __first & 7;
+  unsigned char __bit2 = __last & 7;
+  unsigned char __mask1 = (unsigned char)(1U << (7 - __bit1) << 1) - 1;
+  unsigned char __mask2 = (unsigned char)(1U << (7 - __bit2)) - 1;
+  unsigned char __value = (__bit1 <= __bit2 ?
+                           __mask1 & ~__mask2 :
+                           __mask1 | ~__mask2);
+  return (vector unsigned char)__value;
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_genmasks_16(unsigned char __first, unsigned char __last)
+  __constant(__first) __constant(__last) {
+  unsigned char __bit1 = __first & 15;
+  unsigned char __bit2 = __last & 15;
+  unsigned short __mask1 = (unsigned short)(1U << (15 - __bit1) << 1) - 1;
+  unsigned short __mask2 = (unsigned short)(1U << (15 - __bit2)) - 1;
+  unsigned short __value = (__bit1 <= __bit2 ?
+                            __mask1 & ~__mask2 :
+                            __mask1 | ~__mask2);
+  return (vector unsigned short)__value;
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_genmasks_32(unsigned char __first, unsigned char __last)
+  __constant(__first) __constant(__last) {
+  unsigned char __bit1 = __first & 31;
+  unsigned char __bit2 = __last & 31;
+  unsigned int __mask1 = (1U << (31 - __bit1) << 1) - 1;
+  unsigned int __mask2 = (1U << (31 - __bit2)) - 1;
+  unsigned int __value = (__bit1 <= __bit2 ?
+                          __mask1 & ~__mask2 :
+                          __mask1 | ~__mask2);
+  return (vector unsigned int)__value;
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_genmasks_64(unsigned char __first, unsigned char __last)
+  __constant(__first) __constant(__last) {
+  unsigned char __bit1 = __first & 63;
+  unsigned char __bit2 = __last & 63;
+  unsigned long long __mask1 = (1ULL << (63 - __bit1) << 1) - 1;
+  unsigned long long __mask2 = (1ULL << (63 - __bit2)) - 1;
+  unsigned long long __value = (__bit1 <= __bit2 ?
+                                __mask1 & ~__mask2 :
+                                __mask1 | ~__mask2);
+  return (vector unsigned long long)__value;
+}
+
+/*-- vec_splat --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_splat(vector signed char __vec, int __index)
+  __constant_range(__index, 0, 15) {
+  return (vector signed char)__vec[__index];
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_splat(vector bool char __vec, int __index)
+  __constant_range(__index, 0, 15) {
+  return (vector bool char)(vector unsigned char)__vec[__index];
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_splat(vector unsigned char __vec, int __index)
+  __constant_range(__index, 0, 15) {
+  return (vector unsigned char)__vec[__index];
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_splat(vector signed short __vec, int __index)
+  __constant_range(__index, 0, 7) {
+  return (vector signed short)__vec[__index];
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_splat(vector bool short __vec, int __index)
+  __constant_range(__index, 0, 7) {
+  return (vector bool short)(vector unsigned short)__vec[__index];
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_splat(vector unsigned short __vec, int __index)
+  __constant_range(__index, 0, 7) {
+  return (vector unsigned short)__vec[__index];
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_splat(vector signed int __vec, int __index)
+  __constant_range(__index, 0, 3) {
+  return (vector signed int)__vec[__index];
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_splat(vector bool int __vec, int __index)
+  __constant_range(__index, 0, 3) {
+  return (vector bool int)(vector unsigned int)__vec[__index];
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_splat(vector unsigned int __vec, int __index)
+  __constant_range(__index, 0, 3) {
+  return (vector unsigned int)__vec[__index];
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_splat(vector signed long long __vec, int __index)
+  __constant_range(__index, 0, 1) {
+  return (vector signed long long)__vec[__index];
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_splat(vector bool long long __vec, int __index)
+  __constant_range(__index, 0, 1) {
+  return (vector bool long long)(vector unsigned long long)__vec[__index];
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_splat(vector unsigned long long __vec, int __index)
+  __constant_range(__index, 0, 1) {
+  return (vector unsigned long long)__vec[__index];
+}
+
+static inline __ATTRS_o_ai vector double
+vec_splat(vector double __vec, int __index)
+  __constant_range(__index, 0, 1) {
+  return (vector double)__vec[__index];
+}
+
+/*-- vec_splat_s* -----------------------------------------------------------*/
+
+static inline __ATTRS_ai vector signed char
+vec_splat_s8(signed char __scalar)
+  __constant(__scalar) {
+  return (vector signed char)__scalar;
+}
+
+static inline __ATTRS_ai vector signed short
+vec_splat_s16(signed short __scalar)
+  __constant(__scalar) {
+  return (vector signed short)__scalar;
+}
+
+static inline __ATTRS_ai vector signed int
+vec_splat_s32(signed short __scalar)
+  __constant(__scalar) {
+  return (vector signed int)(signed int)__scalar;
+}
+
+static inline __ATTRS_ai vector signed long long
+vec_splat_s64(signed short __scalar)
+  __constant(__scalar) {
+  return (vector signed long long)(signed long)__scalar;
+}
+
+/*-- vec_splat_u* -----------------------------------------------------------*/
+
+static inline __ATTRS_ai vector unsigned char
+vec_splat_u8(unsigned char __scalar)
+  __constant(__scalar) {
+  return (vector unsigned char)__scalar;
+}
+
+static inline __ATTRS_ai vector unsigned short
+vec_splat_u16(unsigned short __scalar)
+  __constant(__scalar) {
+  return (vector unsigned short)__scalar;
+}
+
+static inline __ATTRS_ai vector unsigned int
+vec_splat_u32(signed short __scalar)
+  __constant(__scalar) {
+  return (vector unsigned int)(signed int)__scalar;
+}
+
+static inline __ATTRS_ai vector unsigned long long
+vec_splat_u64(signed short __scalar)
+  __constant(__scalar) {
+  return (vector unsigned long long)(signed long long)__scalar;
+}
+
+/*-- vec_splats -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_splats(signed char __scalar) {
+  return (vector signed char)__scalar;
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_splats(unsigned char __scalar) {
+  return (vector unsigned char)__scalar;
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_splats(signed short __scalar) {
+  return (vector signed short)__scalar;
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_splats(unsigned short __scalar) {
+  return (vector unsigned short)__scalar;
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_splats(signed int __scalar) {
+  return (vector signed int)__scalar;
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_splats(unsigned int __scalar) {
+  return (vector unsigned int)__scalar;
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_splats(signed long long __scalar) {
+  return (vector signed long long)__scalar;
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_splats(unsigned long long __scalar) {
+  return (vector unsigned long long)__scalar;
+}
+
+static inline __ATTRS_o_ai vector double
+vec_splats(double __scalar) {
+  return (vector double)__scalar;
+}
+
+/*-- vec_extend_s64 ---------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed long long
+vec_extend_s64(vector signed char __a) {
+  return (vector signed long long)(__a[7], __a[15]);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_extend_s64(vector signed short __a) {
+  return (vector signed long long)(__a[3], __a[7]);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_extend_s64(vector signed int __a) {
+  return (vector signed long long)(__a[1], __a[3]);
+}
+
+/*-- vec_mergeh -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_mergeh(vector signed char __a, vector signed char __b) {
+  return (vector signed char)(
+    __a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3], __b[3],
+    __a[4], __b[4], __a[5], __b[5], __a[6], __b[6], __a[7], __b[7]);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_mergeh(vector bool char __a, vector bool char __b) {
+  return (vector bool char)(
+    __a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3], __b[3],
+    __a[4], __b[4], __a[5], __b[5], __a[6], __b[6], __a[7], __b[7]);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_mergeh(vector unsigned char __a, vector unsigned char __b) {
+  return (vector unsigned char)(
+    __a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3], __b[3],
+    __a[4], __b[4], __a[5], __b[5], __a[6], __b[6], __a[7], __b[7]);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_mergeh(vector signed short __a, vector signed short __b) {
+  return (vector signed short)(
+    __a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3], __b[3]);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_mergeh(vector bool short __a, vector bool short __b) {
+  return (vector bool short)(
+    __a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3], __b[3]);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_mergeh(vector unsigned short __a, vector unsigned short __b) {
+  return (vector unsigned short)(
+    __a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3], __b[3]);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_mergeh(vector signed int __a, vector signed int __b) {
+  return (vector signed int)(__a[0], __b[0], __a[1], __b[1]);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_mergeh(vector bool int __a, vector bool int __b) {
+  return (vector bool int)(__a[0], __b[0], __a[1], __b[1]);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_mergeh(vector unsigned int __a, vector unsigned int __b) {
+  return (vector unsigned int)(__a[0], __b[0], __a[1], __b[1]);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_mergeh(vector signed long long __a, vector signed long long __b) {
+  return (vector signed long long)(__a[0], __b[0]);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_mergeh(vector bool long long __a, vector bool long long __b) {
+  return (vector bool long long)(__a[0], __b[0]);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_mergeh(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector unsigned long long)(__a[0], __b[0]);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_mergeh(vector double __a, vector double __b) {
+  return (vector double)(__a[0], __b[0]);
+}
+
+/*-- vec_mergel -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_mergel(vector signed char __a, vector signed char __b) {
+  return (vector signed char)(
+    __a[8], __b[8], __a[9], __b[9], __a[10], __b[10], __a[11], __b[11],
+    __a[12], __b[12], __a[13], __b[13], __a[14], __b[14], __a[15], __b[15]);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_mergel(vector bool char __a, vector bool char __b) {
+  return (vector bool char)(
+    __a[8], __b[8], __a[9], __b[9], __a[10], __b[10], __a[11], __b[11],
+    __a[12], __b[12], __a[13], __b[13], __a[14], __b[14], __a[15], __b[15]);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_mergel(vector unsigned char __a, vector unsigned char __b) {
+  return (vector unsigned char)(
+    __a[8], __b[8], __a[9], __b[9], __a[10], __b[10], __a[11], __b[11],
+    __a[12], __b[12], __a[13], __b[13], __a[14], __b[14], __a[15], __b[15]);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_mergel(vector signed short __a, vector signed short __b) {
+  return (vector signed short)(
+    __a[4], __b[4], __a[5], __b[5], __a[6], __b[6], __a[7], __b[7]);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_mergel(vector bool short __a, vector bool short __b) {
+  return (vector bool short)(
+    __a[4], __b[4], __a[5], __b[5], __a[6], __b[6], __a[7], __b[7]);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_mergel(vector unsigned short __a, vector unsigned short __b) {
+  return (vector unsigned short)(
+    __a[4], __b[4], __a[5], __b[5], __a[6], __b[6], __a[7], __b[7]);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_mergel(vector signed int __a, vector signed int __b) {
+  return (vector signed int)(__a[2], __b[2], __a[3], __b[3]);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_mergel(vector bool int __a, vector bool int __b) {
+  return (vector bool int)(__a[2], __b[2], __a[3], __b[3]);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_mergel(vector unsigned int __a, vector unsigned int __b) {
+  return (vector unsigned int)(__a[2], __b[2], __a[3], __b[3]);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_mergel(vector signed long long __a, vector signed long long __b) {
+  return (vector signed long long)(__a[1], __b[1]);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_mergel(vector bool long long __a, vector bool long long __b) {
+  return (vector bool long long)(__a[1], __b[1]);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_mergel(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector unsigned long long)(__a[1], __b[1]);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_mergel(vector double __a, vector double __b) {
+  return (vector double)(__a[1], __b[1]);
+}
+
+/*-- vec_pack ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_pack(vector signed short __a, vector signed short __b) {
+  vector signed char __ac = (vector signed char)__a;
+  vector signed char __bc = (vector signed char)__b;
+  return (vector signed char)(
+    __ac[1], __ac[3], __ac[5], __ac[7], __ac[9], __ac[11], __ac[13], __ac[15],
+    __bc[1], __bc[3], __bc[5], __bc[7], __bc[9], __bc[11], __bc[13], __bc[15]);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_pack(vector bool short __a, vector bool short __b) {
+  vector bool char __ac = (vector bool char)__a;
+  vector bool char __bc = (vector bool char)__b;
+  return (vector bool char)(
+    __ac[1], __ac[3], __ac[5], __ac[7], __ac[9], __ac[11], __ac[13], __ac[15],
+    __bc[1], __bc[3], __bc[5], __bc[7], __bc[9], __bc[11], __bc[13], __bc[15]);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_pack(vector unsigned short __a, vector unsigned short __b) {
+  vector unsigned char __ac = (vector unsigned char)__a;
+  vector unsigned char __bc = (vector unsigned char)__b;
+  return (vector unsigned char)(
+    __ac[1], __ac[3], __ac[5], __ac[7], __ac[9], __ac[11], __ac[13], __ac[15],
+    __bc[1], __bc[3], __bc[5], __bc[7], __bc[9], __bc[11], __bc[13], __bc[15]);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_pack(vector signed int __a, vector signed int __b) {
+  vector signed short __ac = (vector signed short)__a;
+  vector signed short __bc = (vector signed short)__b;
+  return (vector signed short)(
+    __ac[1], __ac[3], __ac[5], __ac[7],
+    __bc[1], __bc[3], __bc[5], __bc[7]);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_pack(vector bool int __a, vector bool int __b) {
+  vector bool short __ac = (vector bool short)__a;
+  vector bool short __bc = (vector bool short)__b;
+  return (vector bool short)(
+    __ac[1], __ac[3], __ac[5], __ac[7],
+    __bc[1], __bc[3], __bc[5], __bc[7]);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_pack(vector unsigned int __a, vector unsigned int __b) {
+  vector unsigned short __ac = (vector unsigned short)__a;
+  vector unsigned short __bc = (vector unsigned short)__b;
+  return (vector unsigned short)(
+    __ac[1], __ac[3], __ac[5], __ac[7],
+    __bc[1], __bc[3], __bc[5], __bc[7]);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_pack(vector signed long long __a, vector signed long long __b) {
+  vector signed int __ac = (vector signed int)__a;
+  vector signed int __bc = (vector signed int)__b;
+  return (vector signed int)(__ac[1], __ac[3], __bc[1], __bc[3]);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_pack(vector bool long long __a, vector bool long long __b) {
+  vector bool int __ac = (vector bool int)__a;
+  vector bool int __bc = (vector bool int)__b;
+  return (vector bool int)(__ac[1], __ac[3], __bc[1], __bc[3]);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_pack(vector unsigned long long __a, vector unsigned long long __b) {
+  vector unsigned int __ac = (vector unsigned int)__a;
+  vector unsigned int __bc = (vector unsigned int)__b;
+  return (vector unsigned int)(__ac[1], __ac[3], __bc[1], __bc[3]);
+}
+
+/*-- vec_packs --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_packs(vector signed short __a, vector signed short __b) {
+  return __builtin_s390_vpksh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_packs(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vpklsh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_packs(vector signed int __a, vector signed int __b) {
+  return __builtin_s390_vpksf(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_packs(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vpklsf(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_packs(vector signed long long __a, vector signed long long __b) {
+  return __builtin_s390_vpksg(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_packs(vector unsigned long long __a, vector unsigned long long __b) {
+  return __builtin_s390_vpklsg(__a, __b);
+}
+
+/*-- vec_packs_cc -----------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_packs_cc(vector signed short __a, vector signed short __b, int *__cc) {
+  return __builtin_s390_vpkshs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_packs_cc(vector unsigned short __a, vector unsigned short __b, int *__cc) {
+  return __builtin_s390_vpklshs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_packs_cc(vector signed int __a, vector signed int __b, int *__cc) {
+  return __builtin_s390_vpksfs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_packs_cc(vector unsigned int __a, vector unsigned int __b, int *__cc) {
+  return __builtin_s390_vpklsfs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_packs_cc(vector signed long long __a, vector signed long long __b,
+             int *__cc) {
+  return __builtin_s390_vpksgs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_packs_cc(vector unsigned long long __a, vector unsigned long long __b,
+             int *__cc) {
+  return __builtin_s390_vpklsgs(__a, __b, __cc);
+}
+
+/*-- vec_packsu -------------------------------------------------------------*/
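+/* Saturating pack to unsigned element types.  The signed-input overloads
+   first clamp negative lanes to zero via the (__a >= __zero) mask before
+   applying the unsigned saturating pack. */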
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_packsu(vector signed short __a, vector signed short __b) {
+  const vector signed short __zero = (vector signed short)0;
+  return __builtin_s390_vpklsh(
+    (vector unsigned short)(__a >= __zero) & (vector unsigned short)__a,
+    (vector unsigned short)(__b >= __zero) & (vector unsigned short)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_packsu(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vpklsh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_packsu(vector signed int __a, vector signed int __b) {
+  const vector signed int __zero = (vector signed int)0;
+  return __builtin_s390_vpklsf(
+    (vector unsigned int)(__a >= __zero) & (vector unsigned int)__a,
+    (vector unsigned int)(__b >= __zero) & (vector unsigned int)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_packsu(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vpklsf(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_packsu(vector signed long long __a, vector signed long long __b) {
+  const vector signed long long __zero = (vector signed long long)0;
+  return __builtin_s390_vpklsg(
+    (vector unsigned long long)(__a >= __zero) &
+    (vector unsigned long long)__a,
+    (vector unsigned long long)(__b >= __zero) &
+    (vector unsigned long long)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_packsu(vector unsigned long long __a, vector unsigned long long __b) {
+  return __builtin_s390_vpklsg(__a, __b);
+}
+
+/*-- vec_packsu_cc ----------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_packsu_cc(vector unsigned short __a, vector unsigned short __b, int *__cc) {
+  return __builtin_s390_vpklshs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_packsu_cc(vector unsigned int __a, vector unsigned int __b, int *__cc) {
+  return __builtin_s390_vpklsfs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_packsu_cc(vector unsigned long long __a, vector unsigned long long __b,
+              int *__cc) {
+  return __builtin_s390_vpklsgs(__a, __b, __cc);
+}
+
+/*-- vec_unpackh ------------------------------------------------------------*/
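+/* Widen the high half of the input vector to the next larger element
+   type: signed inputs are sign-extended (vuph*), unsigned inputs are
+   zero-extended (vuplh*), and bool inputs remain boolean. */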
+
+static inline __ATTRS_o_ai vector signed short
+vec_unpackh(vector signed char __a) {
+  return __builtin_s390_vuphb(__a);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_unpackh(vector bool char __a) {
+  return (vector bool short)__builtin_s390_vuphb((vector signed char)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_unpackh(vector unsigned char __a) {
+  return __builtin_s390_vuplhb(__a);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_unpackh(vector signed short __a) {
+  return __builtin_s390_vuphh(__a);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_unpackh(vector bool short __a) {
+  return (vector bool int)__builtin_s390_vuphh((vector signed short)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_unpackh(vector unsigned short __a) {
+  return __builtin_s390_vuplhh(__a);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_unpackh(vector signed int __a) {
+  return __builtin_s390_vuphf(__a);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_unpackh(vector bool int __a) {
+  return (vector bool long long)__builtin_s390_vuphf((vector signed int)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_unpackh(vector unsigned int __a) {
+  return __builtin_s390_vuplhf(__a);
+}
+
+/*-- vec_unpackl ------------------------------------------------------------*/
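+/* Same as vec_unpackh, but widening the low half of the input vector
+   (vupl* / vupll* builtins). */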
+
+static inline __ATTRS_o_ai vector signed short
+vec_unpackl(vector signed char __a) {
+  return __builtin_s390_vuplb(__a);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_unpackl(vector bool char __a) {
+  return (vector bool short)__builtin_s390_vuplb((vector signed char)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_unpackl(vector unsigned char __a) {
+  return __builtin_s390_vupllb(__a);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_unpackl(vector signed short __a) {
+  return __builtin_s390_vuplhw(__a);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_unpackl(vector bool short __a) {
+  return (vector bool int)__builtin_s390_vuplhw((vector signed short)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_unpackl(vector unsigned short __a) {
+  return __builtin_s390_vupllh(__a);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_unpackl(vector signed int __a) {
+  return __builtin_s390_vuplf(__a);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_unpackl(vector bool int __a) {
+  return (vector bool long long)__builtin_s390_vuplf((vector signed int)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_unpackl(vector unsigned int __a) {
+  return __builtin_s390_vupllf(__a);
+}
+
+/*-- vec_cmpeq --------------------------------------------------------------*/
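+/* vec_cmpeq and the relational variants that follow simply wrap the
+   corresponding vector comparison operators and return a boolean vector
+   of matching element width. */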
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmpeq(vector bool char __a, vector bool char __b) {
+  return (vector bool char)(__a == __b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmpeq(vector signed char __a, vector signed char __b) {
+  return (vector bool char)(__a == __b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmpeq(vector unsigned char __a, vector unsigned char __b) {
+  return (vector bool char)(__a == __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmpeq(vector bool short __a, vector bool short __b) {
+  return (vector bool short)(__a == __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmpeq(vector signed short __a, vector signed short __b) {
+  return (vector bool short)(__a == __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmpeq(vector unsigned short __a, vector unsigned short __b) {
+  return (vector bool short)(__a == __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmpeq(vector bool int __a, vector bool int __b) {
+  return (vector bool int)(__a == __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmpeq(vector signed int __a, vector signed int __b) {
+  return (vector bool int)(__a == __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmpeq(vector unsigned int __a, vector unsigned int __b) {
+  return (vector bool int)(__a == __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmpeq(vector bool long long __a, vector bool long long __b) {
+  return (vector bool long long)(__a == __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmpeq(vector signed long long __a, vector signed long long __b) {
+  return (vector bool long long)(__a == __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmpeq(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector bool long long)(__a == __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmpeq(vector double __a, vector double __b) {
+  return (vector bool long long)(__a == __b);
+}
+
+/*-- vec_cmpge --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmpge(vector signed char __a, vector signed char __b) {
+  return (vector bool char)(__a >= __b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmpge(vector unsigned char __a, vector unsigned char __b) {
+  return (vector bool char)(__a >= __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmpge(vector signed short __a, vector signed short __b) {
+  return (vector bool short)(__a >= __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmpge(vector unsigned short __a, vector unsigned short __b) {
+  return (vector bool short)(__a >= __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmpge(vector signed int __a, vector signed int __b) {
+  return (vector bool int)(__a >= __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmpge(vector unsigned int __a, vector unsigned int __b) {
+  return (vector bool int)(__a >= __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmpge(vector signed long long __a, vector signed long long __b) {
+  return (vector bool long long)(__a >= __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmpge(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector bool long long)(__a >= __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmpge(vector double __a, vector double __b) {
+  return (vector bool long long)(__a >= __b);
+}
+
+/*-- vec_cmpgt --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmpgt(vector signed char __a, vector signed char __b) {
+  return (vector bool char)(__a > __b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmpgt(vector unsigned char __a, vector unsigned char __b) {
+  return (vector bool char)(__a > __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmpgt(vector signed short __a, vector signed short __b) {
+  return (vector bool short)(__a > __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmpgt(vector unsigned short __a, vector unsigned short __b) {
+  return (vector bool short)(__a > __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmpgt(vector signed int __a, vector signed int __b) {
+  return (vector bool int)(__a > __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmpgt(vector unsigned int __a, vector unsigned int __b) {
+  return (vector bool int)(__a > __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmpgt(vector signed long long __a, vector signed long long __b) {
+  return (vector bool long long)(__a > __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmpgt(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector bool long long)(__a > __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmpgt(vector double __a, vector double __b) {
+  return (vector bool long long)(__a > __b);
+}
+
+/*-- vec_cmple --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmple(vector signed char __a, vector signed char __b) {
+  return (vector bool char)(__a <= __b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmple(vector unsigned char __a, vector unsigned char __b) {
+  return (vector bool char)(__a <= __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmple(vector signed short __a, vector signed short __b) {
+  return (vector bool short)(__a <= __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmple(vector unsigned short __a, vector unsigned short __b) {
+  return (vector bool short)(__a <= __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmple(vector signed int __a, vector signed int __b) {
+  return (vector bool int)(__a <= __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmple(vector unsigned int __a, vector unsigned int __b) {
+  return (vector bool int)(__a <= __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmple(vector signed long long __a, vector signed long long __b) {
+  return (vector bool long long)(__a <= __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmple(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector bool long long)(__a <= __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmple(vector double __a, vector double __b) {
+  return (vector bool long long)(__a <= __b);
+}
+
+/*-- vec_cmplt --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmplt(vector signed char __a, vector signed char __b) {
+  return (vector bool char)(__a < __b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmplt(vector unsigned char __a, vector unsigned char __b) {
+  return (vector bool char)(__a < __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmplt(vector signed short __a, vector signed short __b) {
+  return (vector bool short)(__a < __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmplt(vector unsigned short __a, vector unsigned short __b) {
+  return (vector bool short)(__a < __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmplt(vector signed int __a, vector signed int __b) {
+  return (vector bool int)(__a < __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmplt(vector unsigned int __a, vector unsigned int __b) {
+  return (vector bool int)(__a < __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmplt(vector signed long long __a, vector signed long long __b) {
+  return (vector bool long long)(__a < __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmplt(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector bool long long)(__a < __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmplt(vector double __a, vector double __b) {
+  return (vector bool long long)(__a < __b);
+}
+
+/*-- vec_all_eq -------------------------------------------------------------*/
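+/* The vec_all_* predicates below use the compare builtins that also set
+   the condition code.  By the convention these builtins follow, cc == 0
+   means the comparison held for every element and cc == 3 that it held
+   for none; hence vec_all_eq tests cc == 0 while vec_all_ne runs the same
+   equality compare and tests cc == 3. */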
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector signed char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vceqbs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector signed char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vceqbs(__a, (vector signed char)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector bool char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector unsigned char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector unsigned char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector bool char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector bool char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector signed short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vceqhs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector signed short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vceqhs(__a, (vector signed short)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector bool short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector unsigned short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector unsigned short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector bool short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector bool short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector signed int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vceqfs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector signed int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vceqfs(__a, (vector signed int)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector bool int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector unsigned int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector unsigned int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector bool int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector bool int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector signed long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector signed long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs(__a, (vector signed long long)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector bool long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector unsigned long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector unsigned long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector bool long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector bool long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfcedbs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+/*-- vec_all_ne -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector signed char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vceqbs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector signed char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vceqbs(__a, (vector signed char)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector bool char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector unsigned char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector unsigned char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector bool char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector bool char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector signed short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vceqhs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector signed short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vceqhs(__a, (vector signed short)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector bool short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector unsigned short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector unsigned short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector bool short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector bool short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector signed int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vceqfs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector signed int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vceqfs(__a, (vector signed int)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector bool int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector unsigned int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector unsigned int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector bool int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector bool int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector signed long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector signed long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs(__a, (vector signed long long)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector bool long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector unsigned long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector unsigned long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector bool long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector bool long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfcedbs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+/*-- vec_all_ge -------------------------------------------------------------*/
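+/* a >= b holds for all elements exactly when b > a holds for none, so the
+   integer overloads swap the operands of the greater-than builtin and test
+   for cc == 3; the double overload uses the dedicated >= compare
+   (vfchedbs) and tests cc == 0 instead. */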
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector signed char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector signed char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchbs((vector signed char)__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector bool char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__b, (vector signed char)__a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector unsigned char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector unsigned char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector bool char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__b, (vector unsigned char)__a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector bool char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__b,
+                        (vector unsigned char)__a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector signed short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector signed short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchhs((vector signed short)__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector bool short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__b, (vector signed short)__a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector unsigned short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector unsigned short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector bool short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__b, (vector unsigned short)__a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector bool short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__b,
+                        (vector unsigned short)__a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector signed int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector signed int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchfs((vector signed int)__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector bool int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__b, (vector signed int)__a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector unsigned int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector unsigned int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector bool int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__b, (vector unsigned int)__a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector bool int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__b,
+                        (vector unsigned int)__a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector signed long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector signed long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchgs((vector signed long long)__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector bool long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__b, (vector signed long long)__a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector unsigned long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector unsigned long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector bool long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__b, (vector unsigned long long)__a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector bool long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__b,
+                        (vector unsigned long long)__a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchedbs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+/*-- vec_all_gt -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector signed char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector signed char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__a, (vector signed char)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector bool char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs((vector signed char)__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector unsigned char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector unsigned char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__a, (vector unsigned char)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector bool char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector bool char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__a,
+                        (vector unsigned char)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector signed short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector signed short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__a, (vector signed short)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector bool short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs((vector signed short)__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector unsigned short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector unsigned short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__a, (vector unsigned short)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector bool short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector bool short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__a,
+                        (vector unsigned short)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector signed int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector signed int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__a, (vector signed int)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector bool int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs((vector signed int)__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector unsigned int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector unsigned int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__a, (vector unsigned int)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector bool int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector bool int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__a,
+                        (vector unsigned int)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector signed long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector signed long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__a, (vector signed long long)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector bool long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs((vector signed long long)__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector unsigned long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector unsigned long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__a, (vector unsigned long long)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector bool long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector bool long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__a,
+                        (vector unsigned long long)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchdbs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+/*-- vec_all_le -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector signed char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector signed char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__a, (vector signed char)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector bool char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs((vector signed char)__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector unsigned char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector unsigned char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__a, (vector unsigned char)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector bool char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector bool char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__a,
+                        (vector unsigned char)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector signed short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector signed short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__a, (vector signed short)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector bool short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs((vector signed short)__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector unsigned short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector unsigned short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__a, (vector unsigned short)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector bool short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector bool short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__a,
+                        (vector unsigned short)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector signed int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector signed int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__a, (vector signed int)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector bool int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs((vector signed int)__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector unsigned int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector unsigned int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__a, (vector unsigned int)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector bool int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector bool int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__a,
+                        (vector unsigned int)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector signed long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector signed long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__a, (vector signed long long)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector bool long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs((vector signed long long)__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector unsigned long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector unsigned long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__a, (vector unsigned long long)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector bool long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector bool long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__a,
+                        (vector unsigned long long)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchedbs(__b, __a, &__cc);
+  return __cc == 0;
+}
+
+/*-- vec_all_lt -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector signed char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector signed char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchbs((vector signed char)__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector bool char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__b, (vector signed char)__a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector unsigned char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector unsigned char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector bool char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__b, (vector unsigned char)__a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector bool char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__b,
+                        (vector unsigned char)__a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector signed short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector signed short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchhs((vector signed short)__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector bool short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__b, (vector signed short)__a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector unsigned short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector unsigned short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector bool short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__b, (vector unsigned short)__a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector bool short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__b,
+                        (vector unsigned short)__a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector signed int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector signed int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchfs((vector signed int)__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector bool int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__b, (vector signed int)__a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector unsigned int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector unsigned int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector bool int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__b, (vector unsigned int)__a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector bool int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__b,
+                        (vector unsigned int)__a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector signed long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector signed long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchgs((vector signed long long)__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector bool long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__b, (vector signed long long)__a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector unsigned long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector unsigned long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector bool long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__b, (vector unsigned long long)__a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector bool long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__b,
+                        (vector unsigned long long)__a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchdbs(__b, __a, &__cc);
+  return __cc == 0;
+}
+
+/*-- vec_all_nge ------------------------------------------------------------*/
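+/* The n* variants test that the ordered comparison holds for no element
+   (cc == 3).  They differ from the inverted predicates only when NaNs are
+   present: an unordered lane makes a >= b false, so it counts toward
+   vec_all_nge but not toward vec_all_lt. */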
+
+static inline __ATTRS_ai int
+vec_all_nge(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchedbs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+/*-- vec_all_ngt ------------------------------------------------------------*/
+
+static inline __ATTRS_ai int
+vec_all_ngt(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchdbs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+/*-- vec_all_nle ------------------------------------------------------------*/
+
+static inline __ATTRS_ai int
+vec_all_nle(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchedbs(__b, __a, &__cc);
+  return __cc == 3;
+}
+
+/*-- vec_all_nlt ------------------------------------------------------------*/
+
+static inline __ATTRS_ai int
+vec_all_nlt(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchdbs(__b, __a, &__cc);
+  return __cc == 3;
+}
+
+/*-- vec_all_nan ------------------------------------------------------------*/
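+/* vftcidb tests each element against the IEEE data classes selected by
+   the mask; the mask value 15 used here selects the NaN classes, so
+   cc == 0 (all elements match) means every element is a NaN, while
+   vec_all_numeric below tests cc == 3 (no element matches). */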
+
+static inline __ATTRS_ai int
+vec_all_nan(vector double __a) {
+  int __cc;
+  __builtin_s390_vftcidb(__a, 15, &__cc);
+  return __cc == 0;
+}
+
+/*-- vec_all_numeric --------------------------------------------------------*/
+
+static inline __ATTRS_ai int
+vec_all_numeric(vector double __a) {
+  int __cc;
+  __builtin_s390_vftcidb(__a, 15, &__cc);
+  return __cc == 3;
+}
+
+/*-- vec_any_eq -------------------------------------------------------------*/
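+/* The vec_any_* predicates follow the same condition-code convention:
+   cc <= 1 indicates the underlying compare matched at least one element,
+   while cc != 0 (used by vec_any_ne on the same equality compare)
+   indicates it failed for at least one. */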
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector signed char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vceqbs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector signed char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vceqbs(__a, (vector signed char)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector bool char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector unsigned char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector unsigned char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector bool char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector bool char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector signed short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vceqhs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector signed short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vceqhs(__a, (vector signed short)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector bool short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector unsigned short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector unsigned short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector bool short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector bool short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector signed int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vceqfs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector signed int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vceqfs(__a, (vector signed int)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector bool int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector unsigned int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector unsigned int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector bool int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector bool int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector signed long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector signed long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs(__a, (vector signed long long)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector bool long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector unsigned long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector unsigned long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector bool long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector bool long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfcedbs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+/*-- vec_any_ne -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector signed char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vceqbs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector signed char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vceqbs(__a, (vector signed char)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector bool char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector unsigned char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector unsigned char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector bool char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector bool char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector signed short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vceqhs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector signed short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vceqhs(__a, (vector signed short)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector bool short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector unsigned short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector unsigned short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector bool short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector bool short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector signed int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vceqfs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector signed int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vceqfs(__a, (vector signed int)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector bool int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector unsigned int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector unsigned int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector bool int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector bool int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector signed long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector signed long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs(__a, (vector signed long long)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector bool long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector unsigned long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector unsigned long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector bool long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector bool long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfcedbs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+/*-- vec_any_ge -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector signed char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector signed char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchbs((vector signed char)__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector bool char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__b, (vector signed char)__a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector unsigned char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector unsigned char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector bool char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__b, (vector unsigned char)__a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector bool char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__b,
+                        (vector unsigned char)__a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector signed short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector signed short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchhs((vector signed short)__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector bool short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__b, (vector signed short)__a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector unsigned short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector unsigned short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector bool short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__b, (vector unsigned short)__a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector bool short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__b,
+                        (vector unsigned short)__a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector signed int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector signed int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchfs((vector signed int)__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector bool int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__b, (vector signed int)__a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector unsigned int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector unsigned int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector bool int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__b, (vector unsigned int)__a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector bool int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__b,
+                        (vector unsigned int)__a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector signed long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector signed long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchgs((vector signed long long)__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector bool long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__b, (vector signed long long)__a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector unsigned long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector unsigned long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector bool long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__b, (vector unsigned long long)__a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector bool long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__b,
+                        (vector unsigned long long)__a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchedbs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+/*-- vec_any_gt -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector signed char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector signed char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__a, (vector signed char)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector bool char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs((vector signed char)__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector unsigned char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector unsigned char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__a, (vector unsigned char)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector bool char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector bool char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__a,
+                        (vector unsigned char)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector signed short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector signed short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__a, (vector signed short)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector bool short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs((vector signed short)__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector unsigned short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector unsigned short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__a, (vector unsigned short)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector bool short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector bool short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__a,
+                        (vector unsigned short)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector signed int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector signed int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__a, (vector signed int)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector bool int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs((vector signed int)__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector unsigned int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector unsigned int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__a, (vector unsigned int)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector bool int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector bool int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__a,
+                        (vector unsigned int)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector signed long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector signed long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__a, (vector signed long long)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector bool long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs((vector signed long long)__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector unsigned long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector unsigned long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__a, (vector unsigned long long)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector bool long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector bool long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__a,
+                        (vector unsigned long long)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchdbs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+/*-- vec_any_le -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector signed char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector signed char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__a, (vector signed char)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector bool char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs((vector signed char)__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector unsigned char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector unsigned char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__a, (vector unsigned char)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector bool char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector bool char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__a,
+                        (vector unsigned char)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector signed short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector signed short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__a, (vector signed short)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector bool short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs((vector signed short)__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector unsigned short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector unsigned short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__a, (vector unsigned short)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector bool short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector bool short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__a,
+                        (vector unsigned short)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector signed int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector signed int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__a, (vector signed int)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector bool int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs((vector signed int)__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector unsigned int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector unsigned int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__a, (vector unsigned int)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector bool int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector bool int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__a,
+                        (vector unsigned int)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector signed long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector signed long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__a, (vector signed long long)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector bool long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs((vector signed long long)__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector unsigned long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector unsigned long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__a, (vector unsigned long long)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector bool long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector bool long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__a,
+                        (vector unsigned long long)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchedbs(__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+/*-- vec_any_lt -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector signed char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector signed char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchbs((vector signed char)__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector bool char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__b, (vector signed char)__a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector unsigned char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector unsigned char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector bool char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__b, (vector unsigned char)__a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector bool char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__b,
+                        (vector unsigned char)__a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector signed short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector signed short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchhs((vector signed short)__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector bool short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__b, (vector signed short)__a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector unsigned short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector unsigned short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector bool short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__b, (vector unsigned short)__a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector bool short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__b,
+                        (vector unsigned short)__a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector signed int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector signed int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchfs((vector signed int)__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector bool int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__b, (vector signed int)__a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector unsigned int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector unsigned int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector bool int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__b, (vector unsigned int)__a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector bool int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__b,
+                        (vector unsigned int)__a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector signed long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector signed long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchgs((vector signed long long)__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector bool long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__b, (vector signed long long)__a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector unsigned long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector unsigned long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector bool long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__b, (vector unsigned long long)__a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector bool long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__b,
+                        (vector unsigned long long)__a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchdbs(__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+/*-- vec_any_nge ------------------------------------------------------------*/
+
+static inline __ATTRS_ai int
+vec_any_nge(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchedbs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+/*-- vec_any_ngt ------------------------------------------------------------*/
+
+static inline __ATTRS_ai int
+vec_any_ngt(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchdbs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+/*-- vec_any_nle ------------------------------------------------------------*/
+
+static inline __ATTRS_ai int
+vec_any_nle(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchedbs(__b, __a, &__cc);
+  return __cc != 0;
+}
+
+/*-- vec_any_nlt ------------------------------------------------------------*/
+
+static inline __ATTRS_ai int
+vec_any_nlt(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchdbs(__b, __a, &__cc);
+  return __cc != 0;
+}
+
+/*-- vec_any_nan ------------------------------------------------------------*/
+
+static inline __ATTRS_ai int
+vec_any_nan(vector double __a) {
+  int __cc;
+  __builtin_s390_vftcidb(__a, 15, &__cc);
+  return __cc != 3;
+}
+
+/*-- vec_any_numeric --------------------------------------------------------*/
+
+static inline __ATTRS_ai int
+vec_any_numeric(vector double __a) {
+  int __cc;
+  __builtin_s390_vftcidb(__a, 15, &__cc);
+  return __cc != 0;
+}
+
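+/* Editor's note: the helper below is an illustrative sketch added for
+ * readability and is not part of the upstream clang header.  It assumes user
+ * code compiled with -mzvector and only shows how the vec_any_* predicates
+ * above, which fold the condition code set by the compare builtins into a
+ * scalar truth value, read at a call site. */
+static inline __ATTRS_ai int
+__example_any_nan_or_equal(vector double __x, vector double __y) {
+  /* True if any lane of __x is a NaN, or any lane of __x equals __y. */
+  return vec_any_nan(__x) || vec_any_eq(__x, __y);
+}
+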
+/*-- vec_andc ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_andc(vector bool char __a, vector bool char __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_andc(vector signed char __a, vector signed char __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_andc(vector bool char __a, vector signed char __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_andc(vector signed char __a, vector bool char __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_andc(vector unsigned char __a, vector unsigned char __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_andc(vector bool char __a, vector unsigned char __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_andc(vector unsigned char __a, vector bool char __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_andc(vector bool short __a, vector bool short __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_andc(vector signed short __a, vector signed short __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_andc(vector bool short __a, vector signed short __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_andc(vector signed short __a, vector bool short __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_andc(vector unsigned short __a, vector unsigned short __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_andc(vector bool short __a, vector unsigned short __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_andc(vector unsigned short __a, vector bool short __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_andc(vector bool int __a, vector bool int __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_andc(vector signed int __a, vector signed int __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_andc(vector bool int __a, vector signed int __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_andc(vector signed int __a, vector bool int __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_andc(vector unsigned int __a, vector unsigned int __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_andc(vector bool int __a, vector unsigned int __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_andc(vector unsigned int __a, vector bool int __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_andc(vector bool long long __a, vector bool long long __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_andc(vector signed long long __a, vector signed long long __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_andc(vector bool long long __a, vector signed long long __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_andc(vector signed long long __a, vector bool long long __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_andc(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_andc(vector bool long long __a, vector unsigned long long __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_andc(vector unsigned long long __a, vector bool long long __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector double
+vec_andc(vector double __a, vector double __b) {
+  return (vector double)((vector unsigned long long)__a &
+                         ~(vector unsigned long long)__b);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_andc(vector bool long long __a, vector double __b) {
+  return (vector double)((vector unsigned long long)__a &
+                         ~(vector unsigned long long)__b);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_andc(vector double __a, vector bool long long __b) {
+  return (vector double)((vector unsigned long long)__a &
+                         ~(vector unsigned long long)__b);
+}
+
+/*-- vec_nor ----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_nor(vector bool char __a, vector bool char __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_nor(vector signed char __a, vector signed char __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_nor(vector bool char __a, vector signed char __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_nor(vector signed char __a, vector bool char __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_nor(vector unsigned char __a, vector unsigned char __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_nor(vector bool char __a, vector unsigned char __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_nor(vector unsigned char __a, vector bool char __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_nor(vector bool short __a, vector bool short __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_nor(vector signed short __a, vector signed short __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_nor(vector bool short __a, vector signed short __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_nor(vector signed short __a, vector bool short __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_nor(vector unsigned short __a, vector unsigned short __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_nor(vector bool short __a, vector unsigned short __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_nor(vector unsigned short __a, vector bool short __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_nor(vector bool int __a, vector bool int __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_nor(vector signed int __a, vector signed int __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_nor(vector bool int __a, vector signed int __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_nor(vector signed int __a, vector bool int __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_nor(vector unsigned int __a, vector unsigned int __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_nor(vector bool int __a, vector unsigned int __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_nor(vector unsigned int __a, vector bool int __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_nor(vector bool long long __a, vector bool long long __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_nor(vector signed long long __a, vector signed long long __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_nor(vector bool long long __a, vector signed long long __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_nor(vector signed long long __a, vector bool long long __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_nor(vector unsigned long long __a, vector unsigned long long __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_nor(vector bool long long __a, vector unsigned long long __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_nor(vector unsigned long long __a, vector bool long long __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_nor(vector double __a, vector double __b) {
+  return (vector double)~((vector unsigned long long)__a |
+                          (vector unsigned long long)__b);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_nor(vector bool long long __a, vector double __b) {
+  return (vector double)~((vector unsigned long long)__a |
+                          (vector unsigned long long)__b);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_nor(vector double __a, vector bool long long __b) {
+  return (vector double)~((vector unsigned long long)__a |
+                          (vector unsigned long long)__b);
+}
+
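+/* Editor's note: illustrative sketch, not part of the upstream clang header.
+ * vec_andc(a, b) computes a & ~b and vec_nor(a, b) computes ~(a | b) per
+ * element, so together with a bool mask they allow a branch-free select.
+ * The hypothetical helper below assumes user code built with -mzvector. */
+static inline __ATTRS_ai vector unsigned int
+__example_select_uint(vector bool int __mask,
+                      vector unsigned int __then, vector unsigned int __else) {
+  /* Keep __then where the mask lane is all ones, __else where it is zero. */
+  return (__then & (vector unsigned int)__mask) | vec_andc(__else, __mask);
+}
+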
+/*-- vec_cntlz --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cntlz(vector signed char __a) {
+  return __builtin_s390_vclzb((vector unsigned char)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cntlz(vector unsigned char __a) {
+  return __builtin_s390_vclzb(__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cntlz(vector signed short __a) {
+  return __builtin_s390_vclzh((vector unsigned short)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cntlz(vector unsigned short __a) {
+  return __builtin_s390_vclzh(__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cntlz(vector signed int __a) {
+  return __builtin_s390_vclzf((vector unsigned int)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cntlz(vector unsigned int __a) {
+  return __builtin_s390_vclzf(__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_cntlz(vector signed long long __a) {
+  return __builtin_s390_vclzg((vector unsigned long long)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_cntlz(vector unsigned long long __a) {
+  return __builtin_s390_vclzg(__a);
+}
+
+/*-- vec_cnttz --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cnttz(vector signed char __a) {
+  return __builtin_s390_vctzb((vector unsigned char)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cnttz(vector unsigned char __a) {
+  return __builtin_s390_vctzb(__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cnttz(vector signed short __a) {
+  return __builtin_s390_vctzh((vector unsigned short)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cnttz(vector unsigned short __a) {
+  return __builtin_s390_vctzh(__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cnttz(vector signed int __a) {
+  return __builtin_s390_vctzf((vector unsigned int)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cnttz(vector unsigned int __a) {
+  return __builtin_s390_vctzf(__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_cnttz(vector signed long long __a) {
+  return __builtin_s390_vctzg((vector unsigned long long)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_cnttz(vector unsigned long long __a) {
+  return __builtin_s390_vctzg(__a);
+}
+
+/*-- vec_popcnt -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_popcnt(vector signed char __a) {
+  return __builtin_s390_vpopctb((vector unsigned char)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_popcnt(vector unsigned char __a) {
+  return __builtin_s390_vpopctb(__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_popcnt(vector signed short __a) {
+  return __builtin_s390_vpopcth((vector unsigned short)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_popcnt(vector unsigned short __a) {
+  return __builtin_s390_vpopcth(__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_popcnt(vector signed int __a) {
+  return __builtin_s390_vpopctf((vector unsigned int)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_popcnt(vector unsigned int __a) {
+  return __builtin_s390_vpopctf(__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_popcnt(vector signed long long __a) {
+  return __builtin_s390_vpopctg((vector unsigned long long)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_popcnt(vector unsigned long long __a) {
+  return __builtin_s390_vpopctg(__a);
+}
+
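+/* Editor's note: illustrative sketch, not part of the upstream clang header.
+ * vec_cntlz, vec_cnttz and vec_popcnt above return the unsigned vector type
+ * of the same element width regardless of the argument's signedness.  A
+ * hypothetical call site, assuming user code built with -mzvector: */
+static inline __ATTRS_ai vector unsigned int
+__example_bit_width(vector unsigned int __a) {
+  /* Bits needed to represent each element: 32 minus its leading zero count. */
+  return (vector unsigned int){32, 32, 32, 32} - vec_cntlz(__a);
+}
+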
+/*-- vec_rl -----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_rl(vector signed char __a, vector unsigned char __b) {
+  return (vector signed char)__builtin_s390_verllvb(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_rl(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_verllvb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_rl(vector signed short __a, vector unsigned short __b) {
+  return (vector signed short)__builtin_s390_verllvh(
+    (vector unsigned short)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_rl(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_verllvh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_rl(vector signed int __a, vector unsigned int __b) {
+  return (vector signed int)__builtin_s390_verllvf(
+    (vector unsigned int)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_rl(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_verllvf(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_rl(vector signed long long __a, vector unsigned long long __b) {
+  return (vector signed long long)__builtin_s390_verllvg(
+    (vector unsigned long long)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_rl(vector unsigned long long __a, vector unsigned long long __b) {
+  return __builtin_s390_verllvg(__a, __b);
+}
+
+/*-- vec_rli ----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_rli(vector signed char __a, unsigned long __b) {
+  return (vector signed char)__builtin_s390_verllb(
+    (vector unsigned char)__a, (int)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_rli(vector unsigned char __a, unsigned long __b) {
+  return __builtin_s390_verllb(__a, (int)__b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_rli(vector signed short __a, unsigned long __b) {
+  return (vector signed short)__builtin_s390_verllh(
+    (vector unsigned short)__a, (int)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_rli(vector unsigned short __a, unsigned long __b) {
+  return __builtin_s390_verllh(__a, (int)__b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_rli(vector signed int __a, unsigned long __b) {
+  return (vector signed int)__builtin_s390_verllf(
+    (vector unsigned int)__a, (int)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_rli(vector unsigned int __a, unsigned long __b) {
+  return __builtin_s390_verllf(__a, (int)__b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_rli(vector signed long long __a, unsigned long __b) {
+  return (vector signed long long)__builtin_s390_verllg(
+    (vector unsigned long long)__a, (int)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_rli(vector unsigned long long __a, unsigned long __b) {
+  return __builtin_s390_verllg(__a, (int)__b);
+}
+
+/*-- vec_rl_mask ------------------------------------------------------------*/
+
+extern __ATTRS_o vector signed char
+vec_rl_mask(vector signed char __a, vector unsigned char __b,
+            unsigned char __c) __constant(__c);
+
+extern __ATTRS_o vector unsigned char
+vec_rl_mask(vector unsigned char __a, vector unsigned char __b,
+            unsigned char __c) __constant(__c);
+
+extern __ATTRS_o vector signed short
+vec_rl_mask(vector signed short __a, vector unsigned short __b,
+            unsigned char __c) __constant(__c);
+
+extern __ATTRS_o vector unsigned short
+vec_rl_mask(vector unsigned short __a, vector unsigned short __b,
+            unsigned char __c) __constant(__c);
+
+extern __ATTRS_o vector signed int
+vec_rl_mask(vector signed int __a, vector unsigned int __b,
+            unsigned char __c) __constant(__c);
+
+extern __ATTRS_o vector unsigned int
+vec_rl_mask(vector unsigned int __a, vector unsigned int __b,
+            unsigned char __c) __constant(__c);
+
+extern __ATTRS_o vector signed long long
+vec_rl_mask(vector signed long long __a, vector unsigned long long __b,
+            unsigned char __c) __constant(__c);
+
+extern __ATTRS_o vector unsigned long long
+vec_rl_mask(vector unsigned long long __a, vector unsigned long long __b,
+            unsigned char __c) __constant(__c);
+
+#define vec_rl_mask(X, Y, Z) ((__typeof__((vec_rl_mask)((X), (Y), (Z)))) \
+  __extension__ ({ \
+    vector unsigned char __res; \
+    vector unsigned char __x = (vector unsigned char)(X); \
+    vector unsigned char __y = (vector unsigned char)(Y); \
+    switch (sizeof ((X)[0])) { \
+    case 1: __res = (vector unsigned char) __builtin_s390_verimb( \
+             (vector unsigned char)__x, (vector unsigned char)__x, \
+             (vector unsigned char)__y, (Z)); break; \
+    case 2: __res = (vector unsigned char) __builtin_s390_verimh( \
+             (vector unsigned short)__x, (vector unsigned short)__x, \
+             (vector unsigned short)__y, (Z)); break; \
+    case 4: __res = (vector unsigned char) __builtin_s390_verimf( \
+             (vector unsigned int)__x, (vector unsigned int)__x, \
+             (vector unsigned int)__y, (Z)); break; \
+    default: __res = (vector unsigned char) __builtin_s390_verimg( \
+             (vector unsigned long long)__x, (vector unsigned long long)__x, \
+             (vector unsigned long long)__y, (Z)); break; \
+    } __res; }))
+
+/*-- vec_sll ----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_sll(vector signed char __a, vector unsigned char __b) {
+  return (vector signed char)__builtin_s390_vsl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_sll(vector signed char __a, vector unsigned short __b) {
+  return (vector signed char)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_sll(vector signed char __a, vector unsigned int __b) {
+  return (vector signed char)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_sll(vector bool char __a, vector unsigned char __b) {
+  return (vector bool char)__builtin_s390_vsl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_sll(vector bool char __a, vector unsigned short __b) {
+  return (vector bool char)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_sll(vector bool char __a, vector unsigned int __b) {
+  return (vector bool char)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_sll(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vsl(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_sll(vector unsigned char __a, vector unsigned short __b) {
+  return __builtin_s390_vsl(__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_sll(vector unsigned char __a, vector unsigned int __b) {
+  return __builtin_s390_vsl(__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_sll(vector signed short __a, vector unsigned char __b) {
+  return (vector signed short)__builtin_s390_vsl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_sll(vector signed short __a, vector unsigned short __b) {
+  return (vector signed short)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_sll(vector signed short __a, vector unsigned int __b) {
+  return (vector signed short)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_sll(vector bool short __a, vector unsigned char __b) {
+  return (vector bool short)__builtin_s390_vsl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_sll(vector bool short __a, vector unsigned short __b) {
+  return (vector bool short)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_sll(vector bool short __a, vector unsigned int __b) {
+  return (vector bool short)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_sll(vector unsigned short __a, vector unsigned char __b) {
+  return (vector unsigned short)__builtin_s390_vsl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_sll(vector unsigned short __a, vector unsigned short __b) {
+  return (vector unsigned short)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_sll(vector unsigned short __a, vector unsigned int __b) {
+  return (vector unsigned short)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_sll(vector signed int __a, vector unsigned char __b) {
+  return (vector signed int)__builtin_s390_vsl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_sll(vector signed int __a, vector unsigned short __b) {
+  return (vector signed int)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_sll(vector signed int __a, vector unsigned int __b) {
+  return (vector signed int)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_sll(vector bool int __a, vector unsigned char __b) {
+  return (vector bool int)__builtin_s390_vsl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_sll(vector bool int __a, vector unsigned short __b) {
+  return (vector bool int)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_sll(vector bool int __a, vector unsigned int __b) {
+  return (vector bool int)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_sll(vector unsigned int __a, vector unsigned char __b) {
+  return (vector unsigned int)__builtin_s390_vsl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_sll(vector unsigned int __a, vector unsigned short __b) {
+  return (vector unsigned int)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_sll(vector unsigned int __a, vector unsigned int __b) {
+  return (vector unsigned int)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_sll(vector signed long long __a, vector unsigned char __b) {
+  return (vector signed long long)__builtin_s390_vsl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_sll(vector signed long long __a, vector unsigned short __b) {
+  return (vector signed long long)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_sll(vector signed long long __a, vector unsigned int __b) {
+  return (vector signed long long)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_sll(vector bool long long __a, vector unsigned char __b) {
+  return (vector bool long long)__builtin_s390_vsl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_sll(vector bool long long __a, vector unsigned short __b) {
+  return (vector bool long long)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_sll(vector bool long long __a, vector unsigned int __b) {
+  return (vector bool long long)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_sll(vector unsigned long long __a, vector unsigned char __b) {
+  return (vector unsigned long long)__builtin_s390_vsl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_sll(vector unsigned long long __a, vector unsigned short __b) {
+  return (vector unsigned long long)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_sll(vector unsigned long long __a, vector unsigned int __b) {
+  return (vector unsigned long long)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+/*-- vec_slb ----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_slb(vector signed char __a, vector signed char __b) {
+  return (vector signed char)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_slb(vector signed char __a, vector unsigned char __b) {
+  return (vector signed char)__builtin_s390_vslb(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_slb(vector unsigned char __a, vector signed char __b) {
+  return __builtin_s390_vslb(__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_slb(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vslb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_slb(vector signed short __a, vector signed short __b) {
+  return (vector signed short)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_slb(vector signed short __a, vector unsigned short __b) {
+  return (vector signed short)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_slb(vector unsigned short __a, vector signed short __b) {
+  return (vector unsigned short)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_slb(vector unsigned short __a, vector unsigned short __b) {
+  return (vector unsigned short)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_slb(vector signed int __a, vector signed int __b) {
+  return (vector signed int)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_slb(vector signed int __a, vector unsigned int __b) {
+  return (vector signed int)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_slb(vector unsigned int __a, vector signed int __b) {
+  return (vector unsigned int)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_slb(vector unsigned int __a, vector unsigned int __b) {
+  return (vector unsigned int)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_slb(vector signed long long __a, vector signed long long __b) {
+  return (vector signed long long)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_slb(vector signed long long __a, vector unsigned long long __b) {
+  return (vector signed long long)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_slb(vector unsigned long long __a, vector signed long long __b) {
+  return (vector unsigned long long)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_slb(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector unsigned long long)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_slb(vector double __a, vector signed long long __b) {
+  return (vector double)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_slb(vector double __a, vector unsigned long long __b) {
+  return (vector double)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+/*-- vec_sld ----------------------------------------------------------------*/
+
+extern __ATTRS_o vector signed char
+vec_sld(vector signed char __a, vector signed char __b, int __c)
+  __constant_range(__c, 0, 15);
+
+extern __ATTRS_o vector unsigned char
+vec_sld(vector unsigned char __a, vector unsigned char __b, int __c)
+  __constant_range(__c, 0, 15);
+
+extern __ATTRS_o vector signed short
+vec_sld(vector signed short __a, vector signed short __b, int __c)
+  __constant_range(__c, 0, 15);
+
+extern __ATTRS_o vector unsigned short
+vec_sld(vector unsigned short __a, vector unsigned short __b, int __c)
+  __constant_range(__c, 0, 15);
+
+extern __ATTRS_o vector signed int
+vec_sld(vector signed int __a, vector signed int __b, int __c)
+  __constant_range(__c, 0, 15);
+
+extern __ATTRS_o vector unsigned int
+vec_sld(vector unsigned int __a, vector unsigned int __b, int __c)
+  __constant_range(__c, 0, 15);
+
+extern __ATTRS_o vector signed long long
+vec_sld(vector signed long long __a, vector signed long long __b, int __c)
+  __constant_range(__c, 0, 15);
+
+extern __ATTRS_o vector unsigned long long
+vec_sld(vector unsigned long long __a, vector unsigned long long __b, int __c)
+  __constant_range(__c, 0, 15);
+
+extern __ATTRS_o vector double
+vec_sld(vector double __a, vector double __b, int __c)
+  __constant_range(__c, 0, 15);
+
+#define vec_sld(X, Y, Z) ((__typeof__((vec_sld)((X), (Y), (Z)))) \
+  __builtin_s390_vsldb((vector unsigned char)(X), \
+                       (vector unsigned char)(Y), (Z)))
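+
+/* The extern vec_sld declarations above are never defined: they only carry
+ * the __constant_range constraint so an out-of-range byte offset can be
+ * diagnosed, while this macro supplies the implementation.  Taking
+ * __typeof__ of the (unevaluated) call selects the matching overload's
+ * result type.  Illustrative use, with hypothetical vectors __x and __y:
+ *   vector signed int __r = vec_sld(__x, __y, 4);
+ */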
+
+/*-- vec_sldw ---------------------------------------------------------------*/
+
+extern __ATTRS_o vector signed char
+vec_sldw(vector signed char __a, vector signed char __b, int __c)
+  __constant_range(__c, 0, 3);
+
+extern __ATTRS_o vector unsigned char
+vec_sldw(vector unsigned char __a, vector unsigned char __b, int __c)
+  __constant_range(__c, 0, 3);
+
+extern __ATTRS_o vector signed short
+vec_sldw(vector signed short __a, vector signed short __b, int __c)
+  __constant_range(__c, 0, 3);
+
+extern __ATTRS_o vector unsigned short
+vec_sldw(vector unsigned short __a, vector unsigned short __b, int __c)
+  __constant_range(__c, 0, 3);
+
+extern __ATTRS_o vector signed int
+vec_sldw(vector signed int __a, vector signed int __b, int __c)
+  __constant_range(__c, 0, 3);
+
+extern __ATTRS_o vector unsigned int
+vec_sldw(vector unsigned int __a, vector unsigned int __b, int __c)
+  __constant_range(__c, 0, 3);
+
+extern __ATTRS_o vector signed long long
+vec_sldw(vector signed long long __a, vector signed long long __b, int __c)
+  __constant_range(__c, 0, 3);
+
+extern __ATTRS_o vector unsigned long long
+vec_sldw(vector unsigned long long __a, vector unsigned long long __b, int __c)
+  __constant_range(__c, 0, 3);
+
+extern __ATTRS_o vector double
+vec_sldw(vector double __a, vector double __b, int __c)
+  __constant_range(__c, 0, 3);
+
+#define vec_sldw(X, Y, Z) ((__typeof__((vec_sldw)((X), (Y), (Z)))) \
+  __builtin_s390_vsldb((vector unsigned char)(X), \
+                       (vector unsigned char)(Y), (Z) * 4))
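+
+/* vec_sldw follows the same declare-then-macro pattern as vec_sld and reuses
+ * the byte-granular vsldb builtin, scaling the word offset with (Z) * 4.
+ * Illustrative use, with hypothetical vectors __x and __y:
+ *   vector unsigned int __r = vec_sldw(__x, __y, 1);   (shifts in one word)
+ */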
+
+/*-- vec_sral ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_sral(vector signed char __a, vector unsigned char __b) {
+  return (vector signed char)__builtin_s390_vsra(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_sral(vector signed char __a, vector unsigned short __b) {
+  return (vector signed char)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_sral(vector signed char __a, vector unsigned int __b) {
+  return (vector signed char)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_sral(vector bool char __a, vector unsigned char __b) {
+  return (vector bool char)__builtin_s390_vsra(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_sral(vector bool char __a, vector unsigned short __b) {
+  return (vector bool char)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_sral(vector bool char __a, vector unsigned int __b) {
+  return (vector bool char)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_sral(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vsra(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_sral(vector unsigned char __a, vector unsigned short __b) {
+  return __builtin_s390_vsra(__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_sral(vector unsigned char __a, vector unsigned int __b) {
+  return __builtin_s390_vsra(__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_sral(vector signed short __a, vector unsigned char __b) {
+  return (vector signed short)__builtin_s390_vsra(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_sral(vector signed short __a, vector unsigned short __b) {
+  return (vector signed short)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_sral(vector signed short __a, vector unsigned int __b) {
+  return (vector signed short)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_sral(vector bool short __a, vector unsigned char __b) {
+  return (vector bool short)__builtin_s390_vsra(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_sral(vector bool short __a, vector unsigned short __b) {
+  return (vector bool short)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_sral(vector bool short __a, vector unsigned int __b) {
+  return (vector bool short)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_sral(vector unsigned short __a, vector unsigned char __b) {
+  return (vector unsigned short)__builtin_s390_vsra(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_sral(vector unsigned short __a, vector unsigned short __b) {
+  return (vector unsigned short)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_sral(vector unsigned short __a, vector unsigned int __b) {
+  return (vector unsigned short)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_sral(vector signed int __a, vector unsigned char __b) {
+  return (vector signed int)__builtin_s390_vsra(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_sral(vector signed int __a, vector unsigned short __b) {
+  return (vector signed int)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_sral(vector signed int __a, vector unsigned int __b) {
+  return (vector signed int)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_sral(vector bool int __a, vector unsigned char __b) {
+  return (vector bool int)__builtin_s390_vsra(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_sral(vector bool int __a, vector unsigned short __b) {
+  return (vector bool int)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_sral(vector bool int __a, vector unsigned int __b) {
+  return (vector bool int)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_sral(vector unsigned int __a, vector unsigned char __b) {
+  return (vector unsigned int)__builtin_s390_vsra(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_sral(vector unsigned int __a, vector unsigned short __b) {
+  return (vector unsigned int)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_sral(vector unsigned int __a, vector unsigned int __b) {
+  return (vector unsigned int)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_sral(vector signed long long __a, vector unsigned char __b) {
+  return (vector signed long long)__builtin_s390_vsra(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_sral(vector signed long long __a, vector unsigned short __b) {
+  return (vector signed long long)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_sral(vector signed long long __a, vector unsigned int __b) {
+  return (vector signed long long)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_sral(vector bool long long __a, vector unsigned char __b) {
+  return (vector bool long long)__builtin_s390_vsra(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_sral(vector bool long long __a, vector unsigned short __b) {
+  return (vector bool long long)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_sral(vector bool long long __a, vector unsigned int __b) {
+  return (vector bool long long)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_sral(vector unsigned long long __a, vector unsigned char __b) {
+  return (vector unsigned long long)__builtin_s390_vsra(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_sral(vector unsigned long long __a, vector unsigned short __b) {
+  return (vector unsigned long long)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_sral(vector unsigned long long __a, vector unsigned int __b) {
+  return (vector unsigned long long)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+/*-- vec_srab ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_srab(vector signed char __a, vector signed char __b) {
+  return (vector signed char)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_srab(vector signed char __a, vector unsigned char __b) {
+  return (vector signed char)__builtin_s390_vsrab(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_srab(vector unsigned char __a, vector signed char __b) {
+  return __builtin_s390_vsrab(__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_srab(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vsrab(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_srab(vector signed short __a, vector signed short __b) {
+  return (vector signed short)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_srab(vector signed short __a, vector unsigned short __b) {
+  return (vector signed short)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_srab(vector unsigned short __a, vector signed short __b) {
+  return (vector unsigned short)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_srab(vector unsigned short __a, vector unsigned short __b) {
+  return (vector unsigned short)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_srab(vector signed int __a, vector signed int __b) {
+  return (vector signed int)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_srab(vector signed int __a, vector unsigned int __b) {
+  return (vector signed int)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_srab(vector unsigned int __a, vector signed int __b) {
+  return (vector unsigned int)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_srab(vector unsigned int __a, vector unsigned int __b) {
+  return (vector unsigned int)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_srab(vector signed long long __a, vector signed long long __b) {
+  return (vector signed long long)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_srab(vector signed long long __a, vector unsigned long long __b) {
+  return (vector signed long long)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_srab(vector unsigned long long __a, vector signed long long __b) {
+  return (vector unsigned long long)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_srab(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector unsigned long long)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_srab(vector double __a, vector signed long long __b) {
+  return (vector double)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_srab(vector double __a, vector unsigned long long __b) {
+  return (vector double)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+/*-- vec_srl ----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_srl(vector signed char __a, vector unsigned char __b) {
+  return (vector signed char)__builtin_s390_vsrl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_srl(vector signed char __a, vector unsigned short __b) {
+  return (vector signed char)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_srl(vector signed char __a, vector unsigned int __b) {
+  return (vector signed char)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_srl(vector bool char __a, vector unsigned char __b) {
+  return (vector bool char)__builtin_s390_vsrl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_srl(vector bool char __a, vector unsigned short __b) {
+  return (vector bool char)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_srl(vector bool char __a, vector unsigned int __b) {
+  return (vector bool char)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_srl(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vsrl(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_srl(vector unsigned char __a, vector unsigned short __b) {
+  return __builtin_s390_vsrl(__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_srl(vector unsigned char __a, vector unsigned int __b) {
+  return __builtin_s390_vsrl(__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_srl(vector signed short __a, vector unsigned char __b) {
+  return (vector signed short)__builtin_s390_vsrl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_srl(vector signed short __a, vector unsigned short __b) {
+  return (vector signed short)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_srl(vector signed short __a, vector unsigned int __b) {
+  return (vector signed short)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_srl(vector bool short __a, vector unsigned char __b) {
+  return (vector bool short)__builtin_s390_vsrl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_srl(vector bool short __a, vector unsigned short __b) {
+  return (vector bool short)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_srl(vector bool short __a, vector unsigned int __b) {
+  return (vector bool short)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_srl(vector unsigned short __a, vector unsigned char __b) {
+  return (vector unsigned short)__builtin_s390_vsrl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_srl(vector unsigned short __a, vector unsigned short __b) {
+  return (vector unsigned short)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_srl(vector unsigned short __a, vector unsigned int __b) {
+  return (vector unsigned short)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_srl(vector signed int __a, vector unsigned char __b) {
+  return (vector signed int)__builtin_s390_vsrl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_srl(vector signed int __a, vector unsigned short __b) {
+  return (vector signed int)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_srl(vector signed int __a, vector unsigned int __b) {
+  return (vector signed int)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_srl(vector bool int __a, vector unsigned char __b) {
+  return (vector bool int)__builtin_s390_vsrl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_srl(vector bool int __a, vector unsigned short __b) {
+  return (vector bool int)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_srl(vector bool int __a, vector unsigned int __b) {
+  return (vector bool int)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_srl(vector unsigned int __a, vector unsigned char __b) {
+  return (vector unsigned int)__builtin_s390_vsrl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_srl(vector unsigned int __a, vector unsigned short __b) {
+  return (vector unsigned int)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_srl(vector unsigned int __a, vector unsigned int __b) {
+  return (vector unsigned int)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_srl(vector signed long long __a, vector unsigned char __b) {
+  return (vector signed long long)__builtin_s390_vsrl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_srl(vector signed long long __a, vector unsigned short __b) {
+  return (vector signed long long)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_srl(vector signed long long __a, vector unsigned int __b) {
+  return (vector signed long long)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_srl(vector bool long long __a, vector unsigned char __b) {
+  return (vector bool long long)__builtin_s390_vsrl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_srl(vector bool long long __a, vector unsigned short __b) {
+  return (vector bool long long)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_srl(vector bool long long __a, vector unsigned int __b) {
+  return (vector bool long long)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_srl(vector unsigned long long __a, vector unsigned char __b) {
+  return (vector unsigned long long)__builtin_s390_vsrl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_srl(vector unsigned long long __a, vector unsigned short __b) {
+  return (vector unsigned long long)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_srl(vector unsigned long long __a, vector unsigned int __b) {
+  return (vector unsigned long long)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+/*-- vec_srb ----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_srb(vector signed char __a, vector signed char __b) {
+  return (vector signed char)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_srb(vector signed char __a, vector unsigned char __b) {
+  return (vector signed char)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_srb(vector unsigned char __a, vector signed char __b) {
+  return __builtin_s390_vsrlb(__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_srb(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vsrlb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_srb(vector signed short __a, vector signed short __b) {
+  return (vector signed short)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_srb(vector signed short __a, vector unsigned short __b) {
+  return (vector signed short)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_srb(vector unsigned short __a, vector signed short __b) {
+  return (vector unsigned short)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_srb(vector unsigned short __a, vector unsigned short __b) {
+  return (vector unsigned short)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_srb(vector signed int __a, vector signed int __b) {
+  return (vector signed int)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_srb(vector signed int __a, vector unsigned int __b) {
+  return (vector signed int)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_srb(vector unsigned int __a, vector signed int __b) {
+  return (vector unsigned int)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_srb(vector unsigned int __a, vector unsigned int __b) {
+  return (vector unsigned int)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_srb(vector signed long long __a, vector signed long long __b) {
+  return (vector signed long long)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_srb(vector signed long long __a, vector unsigned long long __b) {
+  return (vector signed long long)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_srb(vector unsigned long long __a, vector signed long long __b) {
+  return (vector unsigned long long)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_srb(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector unsigned long long)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_srb(vector double __a, vector signed long long __b) {
+  return (vector double)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_srb(vector double __a, vector unsigned long long __b) {
+  return (vector double)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+/*-- vec_abs ----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_abs(vector signed char __a) {
+  return vec_sel(__a, -__a, vec_cmplt(__a, (vector signed char)0));
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_abs(vector signed short __a) {
+  return vec_sel(__a, -__a, vec_cmplt(__a, (vector signed short)0));
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_abs(vector signed int __a) {
+  return vec_sel(__a, -__a, vec_cmplt(__a, (vector signed int)0));
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_abs(vector signed long long __a) {
+  return vec_sel(__a, -__a, vec_cmplt(__a, (vector signed long long)0));
+}
+
+static inline __ATTRS_o_ai vector double
+vec_abs(vector double __a) {
+  return __builtin_s390_vflpdb(__a);
+}
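+
+/* The integer vec_abs overloads above are built from vec_sel and vec_cmplt
+ * (select -__a wherever an element is negative); only the double overload
+ * uses a dedicated builtin (vflpdb).  Illustrative use, with a hypothetical
+ * vector __v:
+ *   vector signed int __m = vec_abs(__v);
+ */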
+
+/*-- vec_nabs ---------------------------------------------------------------*/
+
+static inline __ATTRS_ai vector double
+vec_nabs(vector double __a) {
+  return __builtin_s390_vflndb(__a);
+}
+
+/*-- vec_max ----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_max(vector signed char __a, vector signed char __b) {
+  return vec_sel(__b, __a, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_max(vector signed char __a, vector bool char __b) {
+  vector signed char __bc = (vector signed char)__b;
+  return vec_sel(__bc, __a, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_max(vector bool char __a, vector signed char __b) {
+  vector signed char __ac = (vector signed char)__a;
+  return vec_sel(__b, __ac, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_max(vector unsigned char __a, vector unsigned char __b) {
+  return vec_sel(__b, __a, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_max(vector unsigned char __a, vector bool char __b) {
+  vector unsigned char __bc = (vector unsigned char)__b;
+  return vec_sel(__bc, __a, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_max(vector bool char __a, vector unsigned char __b) {
+  vector unsigned char __ac = (vector unsigned char)__a;
+  return vec_sel(__b, __ac, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_max(vector signed short __a, vector signed short __b) {
+  return vec_sel(__b, __a, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_max(vector signed short __a, vector bool short __b) {
+  vector signed short __bc = (vector signed short)__b;
+  return vec_sel(__bc, __a, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_max(vector bool short __a, vector signed short __b) {
+  vector signed short __ac = (vector signed short)__a;
+  return vec_sel(__b, __ac, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_max(vector unsigned short __a, vector unsigned short __b) {
+  return vec_sel(__b, __a, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_max(vector unsigned short __a, vector bool short __b) {
+  vector unsigned short __bc = (vector unsigned short)__b;
+  return vec_sel(__bc, __a, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_max(vector bool short __a, vector unsigned short __b) {
+  vector unsigned short __ac = (vector unsigned short)__a;
+  return vec_sel(__b, __ac, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_max(vector signed int __a, vector signed int __b) {
+  return vec_sel(__b, __a, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_max(vector signed int __a, vector bool int __b) {
+  vector signed int __bc = (vector signed int)__b;
+  return vec_sel(__bc, __a, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_max(vector bool int __a, vector signed int __b) {
+  vector signed int __ac = (vector signed int)__a;
+  return vec_sel(__b, __ac, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_max(vector unsigned int __a, vector unsigned int __b) {
+  return vec_sel(__b, __a, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_max(vector unsigned int __a, vector bool int __b) {
+  vector unsigned int __bc = (vector unsigned int)__b;
+  return vec_sel(__bc, __a, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_max(vector bool int __a, vector unsigned int __b) {
+  vector unsigned int __ac = (vector unsigned int)__a;
+  return vec_sel(__b, __ac, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_max(vector signed long long __a, vector signed long long __b) {
+  return vec_sel(__b, __a, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_max(vector signed long long __a, vector bool long long __b) {
+  vector signed long long __bc = (vector signed long long)__b;
+  return vec_sel(__bc, __a, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_max(vector bool long long __a, vector signed long long __b) {
+  vector signed long long __ac = (vector signed long long)__a;
+  return vec_sel(__b, __ac, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_max(vector unsigned long long __a, vector unsigned long long __b) {
+  return vec_sel(__b, __a, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_max(vector unsigned long long __a, vector bool long long __b) {
+  vector unsigned long long __bc = (vector unsigned long long)__b;
+  return vec_sel(__bc, __a, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_max(vector bool long long __a, vector unsigned long long __b) {
+  vector unsigned long long __ac = (vector unsigned long long)__a;
+  return vec_sel(__b, __ac, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector double
+vec_max(vector double __a, vector double __b) {
+  return vec_sel(__b, __a, vec_cmpgt(__a, __b));
+}
+
+/*-- vec_min ----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_min(vector signed char __a, vector signed char __b) {
+  return vec_sel(__a, __b, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_min(vector signed char __a, vector bool char __b) {
+  vector signed char __bc = (vector signed char)__b;
+  return vec_sel(__a, __bc, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_min(vector bool char __a, vector signed char __b) {
+  vector signed char __ac = (vector signed char)__a;
+  return vec_sel(__ac, __b, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_min(vector unsigned char __a, vector unsigned char __b) {
+  return vec_sel(__a, __b, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_min(vector unsigned char __a, vector bool char __b) {
+  vector unsigned char __bc = (vector unsigned char)__b;
+  return vec_sel(__a, __bc, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_min(vector bool char __a, vector unsigned char __b) {
+  vector unsigned char __ac = (vector unsigned char)__a;
+  return vec_sel(__ac, __b, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_min(vector signed short __a, vector signed short __b) {
+  return vec_sel(__a, __b, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_min(vector signed short __a, vector bool short __b) {
+  vector signed short __bc = (vector signed short)__b;
+  return vec_sel(__a, __bc, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_min(vector bool short __a, vector signed short __b) {
+  vector signed short __ac = (vector signed short)__a;
+  return vec_sel(__ac, __b, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_min(vector unsigned short __a, vector unsigned short __b) {
+  return vec_sel(__a, __b, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_min(vector unsigned short __a, vector bool short __b) {
+  vector unsigned short __bc = (vector unsigned short)__b;
+  return vec_sel(__a, __bc, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_min(vector bool short __a, vector unsigned short __b) {
+  vector unsigned short __ac = (vector unsigned short)__a;
+  return vec_sel(__ac, __b, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_min(vector signed int __a, vector signed int __b) {
+  return vec_sel(__a, __b, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_min(vector signed int __a, vector bool int __b) {
+  vector signed int __bc = (vector signed int)__b;
+  return vec_sel(__a, __bc, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_min(vector bool int __a, vector signed int __b) {
+  vector signed int __ac = (vector signed int)__a;
+  return vec_sel(__ac, __b, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_min(vector unsigned int __a, vector unsigned int __b) {
+  return vec_sel(__a, __b, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_min(vector unsigned int __a, vector bool int __b) {
+  vector unsigned int __bc = (vector unsigned int)__b;
+  return vec_sel(__a, __bc, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_min(vector bool int __a, vector unsigned int __b) {
+  vector unsigned int __ac = (vector unsigned int)__a;
+  return vec_sel(__ac, __b, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_min(vector signed long long __a, vector signed long long __b) {
+  return vec_sel(__a, __b, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_min(vector signed long long __a, vector bool long long __b) {
+  vector signed long long __bc = (vector signed long long)__b;
+  return vec_sel(__a, __bc, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_min(vector bool long long __a, vector signed long long __b) {
+  vector signed long long __ac = (vector signed long long)__a;
+  return vec_sel(__ac, __b, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_min(vector unsigned long long __a, vector unsigned long long __b) {
+  return vec_sel(__a, __b, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_min(vector unsigned long long __a, vector bool long long __b) {
+  vector unsigned long long __bc = (vector unsigned long long)__b;
+  return vec_sel(__a, __bc, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_min(vector bool long long __a, vector unsigned long long __b) {
+  vector unsigned long long __ac = (vector unsigned long long)__a;
+  return vec_sel(__ac, __b, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector double
+vec_min(vector double __a, vector double __b) {
+  return vec_sel(__a, __b, vec_cmpgt(__a, __b));
+}
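+
+/* Both vec_max and vec_min are expressed with vec_sel over a vec_cmpgt mask
+ * rather than dedicated builtins; the overloads taking a bool vector cast it
+ * to the matching element type first.  Illustrative use, with hypothetical
+ * vectors __x and __y:
+ *   vector signed short __hi = vec_max(__x, __y);
+ *   vector signed short __lo = vec_min(__x, __y);
+ */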
+
+/*-- vec_add_u128 -----------------------------------------------------------*/
+
+static inline __ATTRS_ai vector unsigned char
+vec_add_u128(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vaq(__a, __b);
+}
+
+/*-- vec_addc ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_addc(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vaccb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_addc(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vacch(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_addc(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vaccf(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_addc(vector unsigned long long __a, vector unsigned long long __b) {
+  return __builtin_s390_vaccg(__a, __b);
+}
+
+/*-- vec_addc_u128 ----------------------------------------------------------*/
+
+static inline __ATTRS_ai vector unsigned char
+vec_addc_u128(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vaccq(__a, __b);
+}
+
+/*-- vec_adde_u128 ----------------------------------------------------------*/
+
+static inline __ATTRS_ai vector unsigned char
+vec_adde_u128(vector unsigned char __a, vector unsigned char __b,
+              vector unsigned char __c) {
+  return __builtin_s390_vacq(__a, __b, __c);
+}
+
+/*-- vec_addec_u128 ---------------------------------------------------------*/
+
+static inline __ATTRS_ai vector unsigned char
+vec_addec_u128(vector unsigned char __a, vector unsigned char __b,
+               vector unsigned char __c) {
+  return __builtin_s390_vacccq(__a, __b, __c);
+}
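+
+/* The *_u128 additions treat each vector as a single 128-bit integer:
+ * vec_add_u128 returns the sum, vec_addc_u128 the carry out, and
+ * vec_adde_u128 / vec_addec_u128 the sum / carry out with a carry in.  A
+ * sketch of a 256-bit add built from these, using hypothetical operands
+ * __a_lo/__a_hi and __b_lo/__b_hi:
+ *   vector unsigned char __carry = vec_addc_u128(__a_lo, __b_lo);
+ *   vector unsigned char __lo    = vec_add_u128(__a_lo, __b_lo);
+ *   vector unsigned char __hi    = vec_adde_u128(__a_hi, __b_hi, __carry);
+ */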
+
+/*-- vec_avg ----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_avg(vector signed char __a, vector signed char __b) {
+  return __builtin_s390_vavgb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_avg(vector signed short __a, vector signed short __b) {
+  return __builtin_s390_vavgh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_avg(vector signed int __a, vector signed int __b) {
+  return __builtin_s390_vavgf(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_avg(vector signed long long __a, vector signed long long __b) {
+  return __builtin_s390_vavgg(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_avg(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vavglb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_avg(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vavglh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_avg(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vavglf(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_avg(vector unsigned long long __a, vector unsigned long long __b) {
+  return __builtin_s390_vavglg(__a, __b);
+}
+
+/*-- vec_checksum -----------------------------------------------------------*/
+
+static inline __ATTRS_ai vector unsigned int
+vec_checksum(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vcksm(__a, __b);
+}
+
+/*-- vec_gfmsum -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_gfmsum(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vgfmb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_gfmsum(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vgfmh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_gfmsum(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vgfmf(__a, __b);
+}
+
+/*-- vec_gfmsum_128 ---------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_gfmsum_128(vector unsigned long long __a, vector unsigned long long __b) {
+  return __builtin_s390_vgfmg(__a, __b);
+}
+
+/*-- vec_gfmsum_accum -------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_gfmsum_accum(vector unsigned char __a, vector unsigned char __b,
+                 vector unsigned short __c) {
+  return __builtin_s390_vgfmab(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_gfmsum_accum(vector unsigned short __a, vector unsigned short __b,
+                 vector unsigned int __c) {
+  return __builtin_s390_vgfmah(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_gfmsum_accum(vector unsigned int __a, vector unsigned int __b,
+                 vector unsigned long long __c) {
+  return __builtin_s390_vgfmaf(__a, __b, __c);
+}
+
+/*-- vec_gfmsum_accum_128 ---------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_gfmsum_accum_128(vector unsigned long long __a,
+                     vector unsigned long long __b,
+                     vector unsigned char __c) {
+  return __builtin_s390_vgfmag(__a, __b, __c);
+}
+
+/*-- vec_mladd --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_mladd(vector signed char __a, vector signed char __b,
+          vector signed char __c) {
+  return __a * __b + __c;
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_mladd(vector unsigned char __a, vector signed char __b,
+          vector signed char __c) {
+  return (vector signed char)__a * __b + __c;
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_mladd(vector signed char __a, vector unsigned char __b,
+          vector unsigned char __c) {
+  return __a * (vector signed char)__b + (vector signed char)__c;
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_mladd(vector unsigned char __a, vector unsigned char __b,
+          vector unsigned char __c) {
+  return __a * __b + __c;
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_mladd(vector signed short __a, vector signed short __b,
+          vector signed short __c) {
+  return __a * __b + __c;
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_mladd(vector unsigned short __a, vector signed short __b,
+          vector signed short __c) {
+  return (vector signed short)__a * __b + __c;
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_mladd(vector signed short __a, vector unsigned short __b,
+          vector unsigned short __c) {
+  return __a * (vector signed short)__b + (vector signed short)__c;
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_mladd(vector unsigned short __a, vector unsigned short __b,
+          vector unsigned short __c) {
+  return __a * __b + __c;
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_mladd(vector signed int __a, vector signed int __b,
+          vector signed int __c) {
+  return __a * __b + __c;
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_mladd(vector unsigned int __a, vector signed int __b,
+          vector signed int __c) {
+  return (vector signed int)__a * __b + __c;
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_mladd(vector signed int __a, vector unsigned int __b,
+          vector unsigned int __c) {
+  return __a * (vector signed int)__b + (vector signed int)__c;
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_mladd(vector unsigned int __a, vector unsigned int __b,
+          vector unsigned int __c) {
+  return __a * __b + __c;
+}
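+
+/* vec_mladd (multiply low and add) needs no builtin: every overload is the
+ * element-wise expression __a * __b + __c, with the mixed-signedness
+ * overloads casting through the signed element type.  Illustrative use, with
+ * hypothetical vectors __a, __b and __c:
+ *   vector signed short __r = vec_mladd(__a, __b, __c);
+ */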
+
+/*-- vec_mhadd --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_mhadd(vector signed char __a, vector signed char __b,
+          vector signed char __c) {
+  return __builtin_s390_vmahb(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_mhadd(vector unsigned char __a, vector unsigned char __b,
+          vector unsigned char __c) {
+  return __builtin_s390_vmalhb(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_mhadd(vector signed short __a, vector signed short __b,
+          vector signed short __c) {
+  return __builtin_s390_vmahh(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_mhadd(vector unsigned short __a, vector unsigned short __b,
+          vector unsigned short __c) {
+  return __builtin_s390_vmalhh(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_mhadd(vector signed int __a, vector signed int __b,
+          vector signed int __c) {
+  return __builtin_s390_vmahf(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_mhadd(vector unsigned int __a, vector unsigned int __b,
+          vector unsigned int __c) {
+  return __builtin_s390_vmalhf(__a, __b, __c);
+}
+
+/*-- vec_meadd --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed short
+vec_meadd(vector signed char __a, vector signed char __b,
+          vector signed short __c) {
+  return __builtin_s390_vmaeb(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_meadd(vector unsigned char __a, vector unsigned char __b,
+          vector unsigned short __c) {
+  return __builtin_s390_vmaleb(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_meadd(vector signed short __a, vector signed short __b,
+          vector signed int __c) {
+  return __builtin_s390_vmaeh(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_meadd(vector unsigned short __a, vector unsigned short __b,
+          vector unsigned int __c) {
+  return __builtin_s390_vmaleh(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_meadd(vector signed int __a, vector signed int __b,
+          vector signed long long __c) {
+  return __builtin_s390_vmaef(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_meadd(vector unsigned int __a, vector unsigned int __b,
+          vector unsigned long long __c) {
+  return __builtin_s390_vmalef(__a, __b, __c);
+}
+
+/*-- vec_moadd --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed short
+vec_moadd(vector signed char __a, vector signed char __b,
+          vector signed short __c) {
+  return __builtin_s390_vmaob(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_moadd(vector unsigned char __a, vector unsigned char __b,
+          vector unsigned short __c) {
+  return __builtin_s390_vmalob(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_moadd(vector signed short __a, vector signed short __b,
+          vector signed int __c) {
+  return __builtin_s390_vmaoh(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_moadd(vector unsigned short __a, vector unsigned short __b,
+          vector unsigned int __c) {
+  return __builtin_s390_vmaloh(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_moadd(vector signed int __a, vector signed int __b,
+          vector signed long long __c) {
+  return __builtin_s390_vmaof(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_moadd(vector unsigned int __a, vector unsigned int __b,
+          vector unsigned long long __c) {
+  return __builtin_s390_vmalof(__a, __b, __c);
+}
+
+/*-- vec_mulh ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_mulh(vector signed char __a, vector signed char __b) {
+  return __builtin_s390_vmhb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_mulh(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vmlhb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_mulh(vector signed short __a, vector signed short __b) {
+  return __builtin_s390_vmhh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_mulh(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vmlhh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_mulh(vector signed int __a, vector signed int __b) {
+  return __builtin_s390_vmhf(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_mulh(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vmlhf(__a, __b);
+}
+
+/*-- vec_mule ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed short
+vec_mule(vector signed char __a, vector signed char __b) {
+  return __builtin_s390_vmeb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_mule(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vmleb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_mule(vector signed short __a, vector signed short __b) {
+  return __builtin_s390_vmeh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_mule(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vmleh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_mule(vector signed int __a, vector signed int __b) {
+  return __builtin_s390_vmef(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_mule(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vmlef(__a, __b);
+}
+
+/*-- vec_mulo ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed short
+vec_mulo(vector signed char __a, vector signed char __b) {
+  return __builtin_s390_vmob(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_mulo(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vmlob(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_mulo(vector signed short __a, vector signed short __b) {
+  return __builtin_s390_vmoh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_mulo(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vmloh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_mulo(vector signed int __a, vector signed int __b) {
+  return __builtin_s390_vmof(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_mulo(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vmlof(__a, __b);
+}
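+
+/* Note: vec_meadd/vec_mule operate on the even-indexed elements of their
+ * inputs and vec_moadd/vec_mulo on the odd-indexed elements, widening each
+ * product to twice the element width; vec_mulh keeps the element width and
+ * returns the high half of each product. */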
+
+/*-- vec_sub_u128 -----------------------------------------------------------*/
+
+static inline __ATTRS_ai vector unsigned char
+vec_sub_u128(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vsq(__a, __b);
+}
+
+/*-- vec_subc ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_subc(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vscbib(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_subc(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vscbih(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_subc(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vscbif(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_subc(vector unsigned long long __a, vector unsigned long long __b) {
+  return __builtin_s390_vscbig(__a, __b);
+}
+
+/*-- vec_subc_u128 ----------------------------------------------------------*/
+
+static inline __ATTRS_ai vector unsigned char
+vec_subc_u128(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vscbiq(__a, __b);
+}
+
+/*-- vec_sube_u128 ----------------------------------------------------------*/
+
+static inline __ATTRS_ai vector unsigned char
+vec_sube_u128(vector unsigned char __a, vector unsigned char __b,
+              vector unsigned char __c) {
+  return __builtin_s390_vsbiq(__a, __b, __c);
+}
+
+/*-- vec_subec_u128 ---------------------------------------------------------*/
+
+static inline __ATTRS_ai vector unsigned char
+vec_subec_u128(vector unsigned char __a, vector unsigned char __b,
+               vector unsigned char __c) {
+  return __builtin_s390_vsbcbiq(__a, __b, __c);
+}
+
+/*-- vec_sum2 ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_sum2(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vsumgh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_sum2(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vsumgf(__a, __b);
+}
+
+/*-- vec_sum_u128 -----------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_sum_u128(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vsumqf(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_sum_u128(vector unsigned long long __a, vector unsigned long long __b) {
+  return __builtin_s390_vsumqg(__a, __b);
+}
+
+/*-- vec_sum4 ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_sum4(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vsumb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_sum4(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vsumh(__a, __b);
+}
+
+/*-- vec_test_mask ----------------------------------------------------------*/
+
+static inline __ATTRS_o_ai int
+vec_test_mask(vector signed char __a, vector unsigned char __b) {
+  return __builtin_s390_vtm((vector unsigned char)__a,
+                            (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai int
+vec_test_mask(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vtm(__a, __b);
+}
+
+static inline __ATTRS_o_ai int
+vec_test_mask(vector signed short __a, vector unsigned short __b) {
+  return __builtin_s390_vtm((vector unsigned char)__a,
+                            (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai int
+vec_test_mask(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vtm((vector unsigned char)__a,
+                            (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai int
+vec_test_mask(vector signed int __a, vector unsigned int __b) {
+  return __builtin_s390_vtm((vector unsigned char)__a,
+                            (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai int
+vec_test_mask(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vtm((vector unsigned char)__a,
+                            (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai int
+vec_test_mask(vector signed long long __a, vector unsigned long long __b) {
+  return __builtin_s390_vtm((vector unsigned char)__a,
+                            (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai int
+vec_test_mask(vector unsigned long long __a, vector unsigned long long __b) {
+  return __builtin_s390_vtm((vector unsigned char)__a,
+                            (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai int
+vec_test_mask(vector double __a, vector unsigned long long __b) {
+  return __builtin_s390_vtm((vector unsigned char)__a,
+                            (vector unsigned char)__b);
+}
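+
+/* Each vec_test_mask overload returns the condition code of the underlying
+ * test-under-mask operation: 0 when all bits selected by __b are zero, 3
+ * when they are all ones, and 1 otherwise; the operands are compared as raw
+ * bytes, which is why the non-byte overloads cast to vector unsigned char. */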
+
+/*-- vec_madd ---------------------------------------------------------------*/
+
+static inline __ATTRS_ai vector double
+vec_madd(vector double __a, vector double __b, vector double __c) {
+  return __builtin_s390_vfmadb(__a, __b, __c);
+}
+
+/*-- vec_msub ---------------------------------------------------------------*/
+
+static inline __ATTRS_ai vector double
+vec_msub(vector double __a, vector double __b, vector double __c) {
+  return __builtin_s390_vfmsdb(__a, __b, __c);
+}
+
+/*-- vec_sqrt ---------------------------------------------------------------*/
+
+static inline __ATTRS_ai vector double
+vec_sqrt(vector double __a) {
+  return __builtin_s390_vfsqdb(__a);
+}
+
+/*-- vec_ld2f ---------------------------------------------------------------*/
+
+static inline __ATTRS_ai vector double
+vec_ld2f(const float *__ptr) {
+  typedef float __v2f32 __attribute__((__vector_size__(8)));
+  return __builtin_convertvector(*(const __v2f32 *)__ptr, vector double);
+}
+
+/*-- vec_st2f ---------------------------------------------------------------*/
+
+static inline __ATTRS_ai void
+vec_st2f(vector double __a, float *__ptr) {
+  typedef float __v2f32 __attribute__((__vector_size__(8)));
+  *(__v2f32 *)__ptr = __builtin_convertvector(__a, __v2f32);
+}
+
+/*-- vec_ctd ----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector double
+vec_ctd(vector signed long long __a, int __b)
+  __constant_range(__b, 0, 31) {
+  vector double __conv = __builtin_convertvector(__a, vector double);
+  __conv *= (vector double)(vector unsigned long long)((0x3ffULL - __b) << 52);
+  return __conv;
+}
+
+static inline __ATTRS_o_ai vector double
+vec_ctd(vector unsigned long long __a, int __b)
+  __constant_range(__b, 0, 31) {
+  vector double __conv = __builtin_convertvector(__a, vector double);
+  __conv *= (vector double)(vector unsigned long long)((0x3ffULL - __b) << 52);
+  return __conv;
+}
+
+/*-- vec_ctsl ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed long long
+vec_ctsl(vector double __a, int __b)
+  __constant_range(__b, 0, 31) {
+  __a *= (vector double)(vector unsigned long long)((0x3ffULL + __b) << 52);
+  return __builtin_convertvector(__a, vector signed long long);
+}
+
+/*-- vec_ctul ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_ctul(vector double __a, int __b)
+  __constant_range(__b, 0, 31) {
+  __a *= (vector double)(vector unsigned long long)((0x3ffULL + __b) << 52);
+  return __builtin_convertvector(__a, vector unsigned long long);
+}
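+
+/* In the conversions above, (0x3ffULL +/- __b) << 52 builds an IEEE-754
+ * double with a zero mantissa and a biased exponent of 1023 +/- __b, i.e.
+ * the constant 2^(+/-__b); multiplying by it applies the fixed-point scale
+ * factor of the conversion without a separate scaling call. */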
+
+/*-- vec_roundp -------------------------------------------------------------*/
+
+static inline __ATTRS_ai vector double
+vec_roundp(vector double __a) {
+  return __builtin_s390_vfidb(__a, 4, 6);
+}
+
+/*-- vec_ceil ---------------------------------------------------------------*/
+
+static inline __ATTRS_ai vector double
+vec_ceil(vector double __a) {
+  // On this platform, vec_ceil never triggers the IEEE-inexact exception.
+  return __builtin_s390_vfidb(__a, 4, 6);
+}
+
+/*-- vec_roundm -------------------------------------------------------------*/
+
+static inline __ATTRS_ai vector double
+vec_roundm(vector double __a) {
+  return __builtin_s390_vfidb(__a, 4, 7);
+}
+
+/*-- vec_floor --------------------------------------------------------------*/
+
+static inline __ATTRS_ai vector double
+vec_floor(vector double __a) {
+  // On this platform, vec_floor never triggers the IEEE-inexact exception.
+  return __builtin_s390_vfidb(__a, 4, 7);
+}
+
+/*-- vec_roundz -------------------------------------------------------------*/
+
+static inline __ATTRS_ai vector double
+vec_roundz(vector double __a) {
+  return __builtin_s390_vfidb(__a, 4, 5);
+}
+
+/*-- vec_trunc --------------------------------------------------------------*/
+
+static inline __ATTRS_ai vector double
+vec_trunc(vector double __a) {
+  // On this platform, vec_trunc never triggers the IEEE-inexact exception.
+  return __builtin_s390_vfidb(__a, 4, 5);
+}
+
+/*-- vec_roundc -------------------------------------------------------------*/
+
+static inline __ATTRS_ai vector double
+vec_roundc(vector double __a) {
+  return __builtin_s390_vfidb(__a, 4, 0);
+}
+
+/*-- vec_round --------------------------------------------------------------*/
+
+static inline __ATTRS_ai vector double
+vec_round(vector double __a) {
+  return __builtin_s390_vfidb(__a, 4, 4);
+}
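+
+/* In the vec_round* family the second __builtin_s390_vfidb operand (4)
+ * suppresses the IEEE-inexact exception, and the third selects the rounding
+ * mode: 0 = current mode, 4 = to nearest (ties to even), 5 = toward zero,
+ * 6 = toward +infinity, 7 = toward -infinity, matching the vec_ceil/
+ * vec_roundp, vec_floor/vec_roundm and vec_trunc/vec_roundz pairs. */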
+
+/*-- vec_fp_test_data_class -------------------------------------------------*/
+
+#define vec_fp_test_data_class(X, Y, Z) \
+  ((vector bool long long)__builtin_s390_vftcidb((X), (Y), (Z)))
+
+/*-- vec_cp_until_zero ------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_cp_until_zero(vector signed char __a) {
+  return (vector signed char)__builtin_s390_vistrb((vector unsigned char)__a);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_cp_until_zero(vector bool char __a) {
+  return (vector bool char)__builtin_s390_vistrb((vector unsigned char)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cp_until_zero(vector unsigned char __a) {
+  return __builtin_s390_vistrb(__a);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_cp_until_zero(vector signed short __a) {
+  return (vector signed short)__builtin_s390_vistrh((vector unsigned short)__a);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cp_until_zero(vector bool short __a) {
+  return (vector bool short)__builtin_s390_vistrh((vector unsigned short)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cp_until_zero(vector unsigned short __a) {
+  return __builtin_s390_vistrh(__a);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_cp_until_zero(vector signed int __a) {
+  return (vector signed int)__builtin_s390_vistrf((vector unsigned int)__a);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cp_until_zero(vector bool int __a) {
+  return (vector bool int)__builtin_s390_vistrf((vector unsigned int)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cp_until_zero(vector unsigned int __a) {
+  return __builtin_s390_vistrf(__a);
+}
+
+/*-- vec_cp_until_zero_cc ---------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_cp_until_zero_cc(vector signed char __a, int *__cc) {
+  return (vector signed char)
+    __builtin_s390_vistrbs((vector unsigned char)__a, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_cp_until_zero_cc(vector bool char __a, int *__cc) {
+  return (vector bool char)
+    __builtin_s390_vistrbs((vector unsigned char)__a, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cp_until_zero_cc(vector unsigned char __a, int *__cc) {
+  return __builtin_s390_vistrbs(__a, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_cp_until_zero_cc(vector signed short __a, int *__cc) {
+  return (vector signed short)
+    __builtin_s390_vistrhs((vector unsigned short)__a, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cp_until_zero_cc(vector bool short __a, int *__cc) {
+  return (vector bool short)
+    __builtin_s390_vistrhs((vector unsigned short)__a, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cp_until_zero_cc(vector unsigned short __a, int *__cc) {
+  return __builtin_s390_vistrhs(__a, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_cp_until_zero_cc(vector signed int __a, int *__cc) {
+  return (vector signed int)
+    __builtin_s390_vistrfs((vector unsigned int)__a, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cp_until_zero_cc(vector bool int __a, int *__cc) {
+  return (vector bool int)__builtin_s390_vistrfs((vector unsigned int)__a,
+                                                 __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cp_until_zero_cc(vector unsigned int __a, int *__cc) {
+  return __builtin_s390_vistrfs(__a, __cc);
+}
+
+/*-- vec_cmpeq_idx ----------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_cmpeq_idx(vector signed char __a, vector signed char __b) {
+  return (vector signed char)
+    __builtin_s390_vfeeb((vector unsigned char)__a,
+                         (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpeq_idx(vector bool char __a, vector bool char __b) {
+  return __builtin_s390_vfeeb((vector unsigned char)__a,
+                              (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpeq_idx(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vfeeb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_cmpeq_idx(vector signed short __a, vector signed short __b) {
+  return (vector signed short)
+    __builtin_s390_vfeeh((vector unsigned short)__a,
+                         (vector unsigned short)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpeq_idx(vector bool short __a, vector bool short __b) {
+  return __builtin_s390_vfeeh((vector unsigned short)__a,
+                              (vector unsigned short)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpeq_idx(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vfeeh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_cmpeq_idx(vector signed int __a, vector signed int __b) {
+  return (vector signed int)
+    __builtin_s390_vfeef((vector unsigned int)__a,
+                         (vector unsigned int)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpeq_idx(vector bool int __a, vector bool int __b) {
+  return __builtin_s390_vfeef((vector unsigned int)__a,
+                              (vector unsigned int)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpeq_idx(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vfeef(__a, __b);
+}
+
+/*-- vec_cmpeq_idx_cc -------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_cmpeq_idx_cc(vector signed char __a, vector signed char __b, int *__cc) {
+  return (vector signed char)
+    __builtin_s390_vfeebs((vector unsigned char)__a,
+                          (vector unsigned char)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpeq_idx_cc(vector bool char __a, vector bool char __b, int *__cc) {
+  return __builtin_s390_vfeebs((vector unsigned char)__a,
+                               (vector unsigned char)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpeq_idx_cc(vector unsigned char __a, vector unsigned char __b,
+                 int *__cc) {
+  return __builtin_s390_vfeebs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_cmpeq_idx_cc(vector signed short __a, vector signed short __b, int *__cc) {
+  return (vector signed short)
+    __builtin_s390_vfeehs((vector unsigned short)__a,
+                          (vector unsigned short)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpeq_idx_cc(vector bool short __a, vector bool short __b, int *__cc) {
+  return __builtin_s390_vfeehs((vector unsigned short)__a,
+                               (vector unsigned short)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpeq_idx_cc(vector unsigned short __a, vector unsigned short __b,
+                 int *__cc) {
+  return __builtin_s390_vfeehs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_cmpeq_idx_cc(vector signed int __a, vector signed int __b, int *__cc) {
+  return (vector signed int)
+    __builtin_s390_vfeefs((vector unsigned int)__a,
+                          (vector unsigned int)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpeq_idx_cc(vector bool int __a, vector bool int __b, int *__cc) {
+  return __builtin_s390_vfeefs((vector unsigned int)__a,
+                               (vector unsigned int)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpeq_idx_cc(vector unsigned int __a, vector unsigned int __b, int *__cc) {
+  return __builtin_s390_vfeefs(__a, __b, __cc);
+}
+
+/*-- vec_cmpeq_or_0_idx -----------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_cmpeq_or_0_idx(vector signed char __a, vector signed char __b) {
+  return (vector signed char)
+    __builtin_s390_vfeezb((vector unsigned char)__a,
+                          (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpeq_or_0_idx(vector bool char __a, vector bool char __b) {
+  return __builtin_s390_vfeezb((vector unsigned char)__a,
+                               (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpeq_or_0_idx(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vfeezb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_cmpeq_or_0_idx(vector signed short __a, vector signed short __b) {
+  return (vector signed short)
+    __builtin_s390_vfeezh((vector unsigned short)__a,
+                          (vector unsigned short)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpeq_or_0_idx(vector bool short __a, vector bool short __b) {
+  return __builtin_s390_vfeezh((vector unsigned short)__a,
+                               (vector unsigned short)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpeq_or_0_idx(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vfeezh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_cmpeq_or_0_idx(vector signed int __a, vector signed int __b) {
+  return (vector signed int)
+    __builtin_s390_vfeezf((vector unsigned int)__a,
+                          (vector unsigned int)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpeq_or_0_idx(vector bool int __a, vector bool int __b) {
+  return __builtin_s390_vfeezf((vector unsigned int)__a,
+                               (vector unsigned int)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpeq_or_0_idx(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vfeezf(__a, __b);
+}
+
+/*-- vec_cmpeq_or_0_idx_cc --------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_cmpeq_or_0_idx_cc(vector signed char __a, vector signed char __b,
+                      int *__cc) {
+  return (vector signed char)
+    __builtin_s390_vfeezbs((vector unsigned char)__a,
+                           (vector unsigned char)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpeq_or_0_idx_cc(vector bool char __a, vector bool char __b, int *__cc) {
+  return __builtin_s390_vfeezbs((vector unsigned char)__a,
+                                (vector unsigned char)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpeq_or_0_idx_cc(vector unsigned char __a, vector unsigned char __b,
+                      int *__cc) {
+  return __builtin_s390_vfeezbs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_cmpeq_or_0_idx_cc(vector signed short __a, vector signed short __b,
+                      int *__cc) {
+  return (vector signed short)
+    __builtin_s390_vfeezhs((vector unsigned short)__a,
+                           (vector unsigned short)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpeq_or_0_idx_cc(vector bool short __a, vector bool short __b, int *__cc) {
+  return __builtin_s390_vfeezhs((vector unsigned short)__a,
+                                (vector unsigned short)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpeq_or_0_idx_cc(vector unsigned short __a, vector unsigned short __b,
+                      int *__cc) {
+  return __builtin_s390_vfeezhs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_cmpeq_or_0_idx_cc(vector signed int __a, vector signed int __b, int *__cc) {
+  return (vector signed int)
+    __builtin_s390_vfeezfs((vector unsigned int)__a,
+                           (vector unsigned int)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpeq_or_0_idx_cc(vector bool int __a, vector bool int __b, int *__cc) {
+  return __builtin_s390_vfeezfs((vector unsigned int)__a,
+                                (vector unsigned int)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpeq_or_0_idx_cc(vector unsigned int __a, vector unsigned int __b,
+                      int *__cc) {
+  return __builtin_s390_vfeezfs(__a, __b, __cc);
+}
+
+/*-- vec_cmpne_idx ----------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_cmpne_idx(vector signed char __a, vector signed char __b) {
+  return (vector signed char)
+    __builtin_s390_vfeneb((vector unsigned char)__a,
+                          (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpne_idx(vector bool char __a, vector bool char __b) {
+  return __builtin_s390_vfeneb((vector unsigned char)__a,
+                               (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpne_idx(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vfeneb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_cmpne_idx(vector signed short __a, vector signed short __b) {
+  return (vector signed short)
+    __builtin_s390_vfeneh((vector unsigned short)__a,
+                          (vector unsigned short)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpne_idx(vector bool short __a, vector bool short __b) {
+  return __builtin_s390_vfeneh((vector unsigned short)__a,
+                               (vector unsigned short)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpne_idx(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vfeneh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_cmpne_idx(vector signed int __a, vector signed int __b) {
+  return (vector signed int)
+    __builtin_s390_vfenef((vector unsigned int)__a,
+                          (vector unsigned int)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpne_idx(vector bool int __a, vector bool int __b) {
+  return __builtin_s390_vfenef((vector unsigned int)__a,
+                               (vector unsigned int)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpne_idx(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vfenef(__a, __b);
+}
+
+/*-- vec_cmpne_idx_cc -------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_cmpne_idx_cc(vector signed char __a, vector signed char __b, int *__cc) {
+  return (vector signed char)
+    __builtin_s390_vfenebs((vector unsigned char)__a,
+                           (vector unsigned char)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpne_idx_cc(vector bool char __a, vector bool char __b, int *__cc) {
+  return __builtin_s390_vfenebs((vector unsigned char)__a,
+                                (vector unsigned char)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpne_idx_cc(vector unsigned char __a, vector unsigned char __b,
+                 int *__cc) {
+  return __builtin_s390_vfenebs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_cmpne_idx_cc(vector signed short __a, vector signed short __b, int *__cc) {
+  return (vector signed short)
+    __builtin_s390_vfenehs((vector unsigned short)__a,
+                           (vector unsigned short)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpne_idx_cc(vector bool short __a, vector bool short __b, int *__cc) {
+  return __builtin_s390_vfenehs((vector unsigned short)__a,
+                                (vector unsigned short)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpne_idx_cc(vector unsigned short __a, vector unsigned short __b,
+                 int *__cc) {
+  return __builtin_s390_vfenehs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_cmpne_idx_cc(vector signed int __a, vector signed int __b, int *__cc) {
+  return (vector signed int)
+    __builtin_s390_vfenefs((vector unsigned int)__a,
+                           (vector unsigned int)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpne_idx_cc(vector bool int __a, vector bool int __b, int *__cc) {
+  return __builtin_s390_vfenefs((vector unsigned int)__a,
+                                (vector unsigned int)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpne_idx_cc(vector unsigned int __a, vector unsigned int __b, int *__cc) {
+  return __builtin_s390_vfenefs(__a, __b, __cc);
+}
+
+/*-- vec_cmpne_or_0_idx -----------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_cmpne_or_0_idx(vector signed char __a, vector signed char __b) {
+  return (vector signed char)
+    __builtin_s390_vfenezb((vector unsigned char)__a,
+                           (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpne_or_0_idx(vector bool char __a, vector bool char __b) {
+  return __builtin_s390_vfenezb((vector unsigned char)__a,
+                                (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpne_or_0_idx(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vfenezb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_cmpne_or_0_idx(vector signed short __a, vector signed short __b) {
+  return (vector signed short)
+    __builtin_s390_vfenezh((vector unsigned short)__a,
+                           (vector unsigned short)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpne_or_0_idx(vector bool short __a, vector bool short __b) {
+  return __builtin_s390_vfenezh((vector unsigned short)__a,
+                                (vector unsigned short)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpne_or_0_idx(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vfenezh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_cmpne_or_0_idx(vector signed int __a, vector signed int __b) {
+  return (vector signed int)
+    __builtin_s390_vfenezf((vector unsigned int)__a,
+                           (vector unsigned int)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpne_or_0_idx(vector bool int __a, vector bool int __b) {
+  return __builtin_s390_vfenezf((vector unsigned int)__a,
+                                (vector unsigned int)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpne_or_0_idx(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vfenezf(__a, __b);
+}
+
+/*-- vec_cmpne_or_0_idx_cc --------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_cmpne_or_0_idx_cc(vector signed char __a, vector signed char __b,
+                      int *__cc) {
+  return (vector signed char)
+    __builtin_s390_vfenezbs((vector unsigned char)__a,
+                            (vector unsigned char)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpne_or_0_idx_cc(vector bool char __a, vector bool char __b, int *__cc) {
+  return __builtin_s390_vfenezbs((vector unsigned char)__a,
+                                 (vector unsigned char)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpne_or_0_idx_cc(vector unsigned char __a, vector unsigned char __b,
+                      int *__cc) {
+  return __builtin_s390_vfenezbs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_cmpne_or_0_idx_cc(vector signed short __a, vector signed short __b,
+                      int *__cc) {
+  return (vector signed short)
+    __builtin_s390_vfenezhs((vector unsigned short)__a,
+                            (vector unsigned short)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpne_or_0_idx_cc(vector bool short __a, vector bool short __b, int *__cc) {
+  return __builtin_s390_vfenezhs((vector unsigned short)__a,
+                                 (vector unsigned short)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpne_or_0_idx_cc(vector unsigned short __a, vector unsigned short __b,
+                      int *__cc) {
+  return __builtin_s390_vfenezhs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_cmpne_or_0_idx_cc(vector signed int __a, vector signed int __b, int *__cc) {
+  return (vector signed int)
+    __builtin_s390_vfenezfs((vector unsigned int)__a,
+                            (vector unsigned int)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpne_or_0_idx_cc(vector bool int __a, vector bool int __b, int *__cc) {
+  return __builtin_s390_vfenezfs((vector unsigned int)__a,
+                                 (vector unsigned int)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpne_or_0_idx_cc(vector unsigned int __a, vector unsigned int __b,
+                      int *__cc) {
+  return __builtin_s390_vfenezfs(__a, __b, __cc);
+}
+
+/*-- vec_cmprg --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmprg(vector unsigned char __a, vector unsigned char __b,
+          vector unsigned char __c) {
+  return (vector bool char)__builtin_s390_vstrcb(__a, __b, __c, 4);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmprg(vector unsigned short __a, vector unsigned short __b,
+          vector unsigned short __c) {
+  return (vector bool short)__builtin_s390_vstrch(__a, __b, __c, 4);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmprg(vector unsigned int __a, vector unsigned int __b,
+          vector unsigned int __c) {
+  return (vector bool int)__builtin_s390_vstrcf(__a, __b, __c, 4);
+}
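+
+/* For the vec_cmprg/vec_cmpnrg families the trailing immediate of the
+ * __builtin_s390_vstrc* builtins encodes flag bits: 4 requests a boolean
+ * result mask instead of an index and 8 inverts the range test, so the
+ * variants below pass 4 (in range), 12 (not in range), or 0/8 (index). */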
+
+/*-- vec_cmprg_cc -----------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmprg_cc(vector unsigned char __a, vector unsigned char __b,
+             vector unsigned char __c, int *__cc) {
+  return (vector bool char)__builtin_s390_vstrcbs(__a, __b, __c, 4, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmprg_cc(vector unsigned short __a, vector unsigned short __b,
+             vector unsigned short __c, int *__cc) {
+  return (vector bool short)__builtin_s390_vstrchs(__a, __b, __c, 4, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmprg_cc(vector unsigned int __a, vector unsigned int __b,
+             vector unsigned int __c, int *__cc) {
+  return (vector bool int)__builtin_s390_vstrcfs(__a, __b, __c, 4, __cc);
+}
+
+/*-- vec_cmprg_idx ----------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmprg_idx(vector unsigned char __a, vector unsigned char __b,
+              vector unsigned char __c) {
+  return __builtin_s390_vstrcb(__a, __b, __c, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmprg_idx(vector unsigned short __a, vector unsigned short __b,
+              vector unsigned short __c) {
+  return __builtin_s390_vstrch(__a, __b, __c, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmprg_idx(vector unsigned int __a, vector unsigned int __b,
+              vector unsigned int __c) {
+  return __builtin_s390_vstrcf(__a, __b, __c, 0);
+}
+
+/*-- vec_cmprg_idx_cc -------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmprg_idx_cc(vector unsigned char __a, vector unsigned char __b,
+                 vector unsigned char __c, int *__cc) {
+  return __builtin_s390_vstrcbs(__a, __b, __c, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmprg_idx_cc(vector unsigned short __a, vector unsigned short __b,
+                 vector unsigned short __c, int *__cc) {
+  return __builtin_s390_vstrchs(__a, __b, __c, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmprg_idx_cc(vector unsigned int __a, vector unsigned int __b,
+                 vector unsigned int __c, int *__cc) {
+  return __builtin_s390_vstrcfs(__a, __b, __c, 0, __cc);
+}
+
+/*-- vec_cmprg_or_0_idx -----------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmprg_or_0_idx(vector unsigned char __a, vector unsigned char __b,
+                   vector unsigned char __c) {
+  return __builtin_s390_vstrczb(__a, __b, __c, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmprg_or_0_idx(vector unsigned short __a, vector unsigned short __b,
+                   vector unsigned short __c) {
+  return __builtin_s390_vstrczh(__a, __b, __c, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmprg_or_0_idx(vector unsigned int __a, vector unsigned int __b,
+                   vector unsigned int __c) {
+  return __builtin_s390_vstrczf(__a, __b, __c, 0);
+}
+
+/*-- vec_cmprg_or_0_idx_cc --------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmprg_or_0_idx_cc(vector unsigned char __a, vector unsigned char __b,
+                      vector unsigned char __c, int *__cc) {
+  return __builtin_s390_vstrczbs(__a, __b, __c, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmprg_or_0_idx_cc(vector unsigned short __a, vector unsigned short __b,
+                      vector unsigned short __c, int *__cc) {
+  return __builtin_s390_vstrczhs(__a, __b, __c, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmprg_or_0_idx_cc(vector unsigned int __a, vector unsigned int __b,
+                      vector unsigned int __c, int *__cc) {
+  return __builtin_s390_vstrczfs(__a, __b, __c, 0, __cc);
+}
+
+/*-- vec_cmpnrg -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmpnrg(vector unsigned char __a, vector unsigned char __b,
+           vector unsigned char __c) {
+  return (vector bool char)__builtin_s390_vstrcb(__a, __b, __c, 12);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmpnrg(vector unsigned short __a, vector unsigned short __b,
+           vector unsigned short __c) {
+  return (vector bool short)__builtin_s390_vstrch(__a, __b, __c, 12);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmpnrg(vector unsigned int __a, vector unsigned int __b,
+           vector unsigned int __c) {
+  return (vector bool int)__builtin_s390_vstrcf(__a, __b, __c, 12);
+}
+
+/*-- vec_cmpnrg_cc ----------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmpnrg_cc(vector unsigned char __a, vector unsigned char __b,
+              vector unsigned char __c, int *__cc) {
+  return (vector bool char)__builtin_s390_vstrcbs(__a, __b, __c, 12, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmpnrg_cc(vector unsigned short __a, vector unsigned short __b,
+              vector unsigned short __c, int *__cc) {
+  return (vector bool short)__builtin_s390_vstrchs(__a, __b, __c, 12, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmpnrg_cc(vector unsigned int __a, vector unsigned int __b,
+              vector unsigned int __c, int *__cc) {
+  return (vector bool int)__builtin_s390_vstrcfs(__a, __b, __c, 12, __cc);
+}
+
+/*-- vec_cmpnrg_idx ---------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpnrg_idx(vector unsigned char __a, vector unsigned char __b,
+               vector unsigned char __c) {
+  return __builtin_s390_vstrcb(__a, __b, __c, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpnrg_idx(vector unsigned short __a, vector unsigned short __b,
+               vector unsigned short __c) {
+  return __builtin_s390_vstrch(__a, __b, __c, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpnrg_idx(vector unsigned int __a, vector unsigned int __b,
+               vector unsigned int __c) {
+  return __builtin_s390_vstrcf(__a, __b, __c, 8);
+}
+
+/*-- vec_cmpnrg_idx_cc ------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpnrg_idx_cc(vector unsigned char __a, vector unsigned char __b,
+                  vector unsigned char __c, int *__cc) {
+  return __builtin_s390_vstrcbs(__a, __b, __c, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpnrg_idx_cc(vector unsigned short __a, vector unsigned short __b,
+                  vector unsigned short __c, int *__cc) {
+  return __builtin_s390_vstrchs(__a, __b, __c, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpnrg_idx_cc(vector unsigned int __a, vector unsigned int __b,
+                  vector unsigned int __c, int *__cc) {
+  return __builtin_s390_vstrcfs(__a, __b, __c, 8, __cc);
+}
+
+/*-- vec_cmpnrg_or_0_idx ----------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpnrg_or_0_idx(vector unsigned char __a, vector unsigned char __b,
+                    vector unsigned char __c) {
+  return __builtin_s390_vstrczb(__a, __b, __c, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpnrg_or_0_idx(vector unsigned short __a, vector unsigned short __b,
+                    vector unsigned short __c) {
+  return __builtin_s390_vstrczh(__a, __b, __c, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpnrg_or_0_idx(vector unsigned int __a, vector unsigned int __b,
+                    vector unsigned int __c) {
+  return __builtin_s390_vstrczf(__a, __b, __c, 8);
+}
+
+/*-- vec_cmpnrg_or_0_idx_cc -------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpnrg_or_0_idx_cc(vector unsigned char __a, vector unsigned char __b,
+                       vector unsigned char __c, int *__cc) {
+  return __builtin_s390_vstrczbs(__a, __b, __c, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpnrg_or_0_idx_cc(vector unsigned short __a, vector unsigned short __b,
+                       vector unsigned short __c, int *__cc) {
+  return __builtin_s390_vstrczhs(__a, __b, __c, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpnrg_or_0_idx_cc(vector unsigned int __a, vector unsigned int __b,
+                       vector unsigned int __c, int *__cc) {
+  return __builtin_s390_vstrczfs(__a, __b, __c, 8, __cc);
+}
+
+/*-- vec_find_any_eq --------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_find_any_eq(vector signed char __a, vector signed char __b) {
+  return (vector bool char)
+    __builtin_s390_vfaeb((vector unsigned char)__a,
+                         (vector unsigned char)__b, 4);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_find_any_eq(vector bool char __a, vector bool char __b) {
+  return (vector bool char)
+    __builtin_s390_vfaeb((vector unsigned char)__a,
+                         (vector unsigned char)__b, 4);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_find_any_eq(vector unsigned char __a, vector unsigned char __b) {
+  return (vector bool char)__builtin_s390_vfaeb(__a, __b, 4);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_find_any_eq(vector signed short __a, vector signed short __b) {
+  return (vector bool short)
+    __builtin_s390_vfaeh((vector unsigned short)__a,
+                         (vector unsigned short)__b, 4);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_find_any_eq(vector bool short __a, vector bool short __b) {
+  return (vector bool short)
+    __builtin_s390_vfaeh((vector unsigned short)__a,
+                         (vector unsigned short)__b, 4);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_find_any_eq(vector unsigned short __a, vector unsigned short __b) {
+  return (vector bool short)__builtin_s390_vfaeh(__a, __b, 4);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_find_any_eq(vector signed int __a, vector signed int __b) {
+  return (vector bool int)
+    __builtin_s390_vfaef((vector unsigned int)__a,
+                         (vector unsigned int)__b, 4);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_find_any_eq(vector bool int __a, vector bool int __b) {
+  return (vector bool int)
+    __builtin_s390_vfaef((vector unsigned int)__a,
+                         (vector unsigned int)__b, 4);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_find_any_eq(vector unsigned int __a, vector unsigned int __b) {
+  return (vector bool int)__builtin_s390_vfaef(__a, __b, 4);
+}
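+
+/* The vec_find_any_* families use the same flag encoding in the trailing
+ * immediate of the __builtin_s390_vfae* builtins: 4 returns a boolean mask,
+ * 12 additionally inverts the comparison (hence vec_find_any_ne), and the
+ * *_idx variants clear the 4 bit to return an element index instead. */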
+
+/*-- vec_find_any_eq_cc -----------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_find_any_eq_cc(vector signed char __a, vector signed char __b, int *__cc) {
+  return (vector bool char)
+    __builtin_s390_vfaebs((vector unsigned char)__a,
+                          (vector unsigned char)__b, 4, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_find_any_eq_cc(vector bool char __a, vector bool char __b, int *__cc) {
+  return (vector bool char)
+    __builtin_s390_vfaebs((vector unsigned char)__a,
+                          (vector unsigned char)__b, 4, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_find_any_eq_cc(vector unsigned char __a, vector unsigned char __b,
+                   int *__cc) {
+  return (vector bool char)__builtin_s390_vfaebs(__a, __b, 4, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_find_any_eq_cc(vector signed short __a, vector signed short __b,
+                   int *__cc) {
+  return (vector bool short)
+    __builtin_s390_vfaehs((vector unsigned short)__a,
+                          (vector unsigned short)__b, 4, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_find_any_eq_cc(vector bool short __a, vector bool short __b, int *__cc) {
+  return (vector bool short)
+    __builtin_s390_vfaehs((vector unsigned short)__a,
+                          (vector unsigned short)__b, 4, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_find_any_eq_cc(vector unsigned short __a, vector unsigned short __b,
+                   int *__cc) {
+  return (vector bool short)__builtin_s390_vfaehs(__a, __b, 4, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_find_any_eq_cc(vector signed int __a, vector signed int __b, int *__cc) {
+  return (vector bool int)
+    __builtin_s390_vfaefs((vector unsigned int)__a,
+                          (vector unsigned int)__b, 4, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_find_any_eq_cc(vector bool int __a, vector bool int __b, int *__cc) {
+  return (vector bool int)
+    __builtin_s390_vfaefs((vector unsigned int)__a,
+                          (vector unsigned int)__b, 4, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_find_any_eq_cc(vector unsigned int __a, vector unsigned int __b,
+                   int *__cc) {
+  return (vector bool int)__builtin_s390_vfaefs(__a, __b, 4, __cc);
+}
+
+/*-- vec_find_any_eq_idx ----------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_find_any_eq_idx(vector signed char __a, vector signed char __b) {
+  return (vector signed char)
+    __builtin_s390_vfaeb((vector unsigned char)__a,
+                         (vector unsigned char)__b, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_eq_idx(vector bool char __a, vector bool char __b) {
+  return __builtin_s390_vfaeb((vector unsigned char)__a,
+                              (vector unsigned char)__b, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_eq_idx(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vfaeb(__a, __b, 0);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_find_any_eq_idx(vector signed short __a, vector signed short __b) {
+  return (vector signed short)
+    __builtin_s390_vfaeh((vector unsigned short)__a,
+                         (vector unsigned short)__b, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_eq_idx(vector bool short __a, vector bool short __b) {
+  return __builtin_s390_vfaeh((vector unsigned short)__a,
+                              (vector unsigned short)__b, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_eq_idx(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vfaeh(__a, __b, 0);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_find_any_eq_idx(vector signed int __a, vector signed int __b) {
+  return (vector signed int)
+    __builtin_s390_vfaef((vector unsigned int)__a,
+                         (vector unsigned int)__b, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_eq_idx(vector bool int __a, vector bool int __b) {
+  return __builtin_s390_vfaef((vector unsigned int)__a,
+                              (vector unsigned int)__b, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_eq_idx(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vfaef(__a, __b, 0);
+}
+
+/*-- vec_find_any_eq_idx_cc -------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_find_any_eq_idx_cc(vector signed char __a, vector signed char __b,
+                       int *__cc) {
+  return (vector signed char)
+    __builtin_s390_vfaebs((vector unsigned char)__a,
+                          (vector unsigned char)__b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_eq_idx_cc(vector bool char __a, vector bool char __b, int *__cc) {
+  return __builtin_s390_vfaebs((vector unsigned char)__a,
+                               (vector unsigned char)__b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_eq_idx_cc(vector unsigned char __a, vector unsigned char __b,
+                       int *__cc) {
+  return __builtin_s390_vfaebs(__a, __b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_find_any_eq_idx_cc(vector signed short __a, vector signed short __b,
+                       int *__cc) {
+  return (vector signed short)
+    __builtin_s390_vfaehs((vector unsigned short)__a,
+                          (vector unsigned short)__b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_eq_idx_cc(vector bool short __a, vector bool short __b,
+                       int *__cc) {
+  return __builtin_s390_vfaehs((vector unsigned short)__a,
+                               (vector unsigned short)__b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_eq_idx_cc(vector unsigned short __a, vector unsigned short __b,
+                       int *__cc) {
+  return __builtin_s390_vfaehs(__a, __b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_find_any_eq_idx_cc(vector signed int __a, vector signed int __b,
+                       int *__cc) {
+  return (vector signed int)
+    __builtin_s390_vfaefs((vector unsigned int)__a,
+                          (vector unsigned int)__b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_eq_idx_cc(vector bool int __a, vector bool int __b, int *__cc) {
+  return __builtin_s390_vfaefs((vector unsigned int)__a,
+                               (vector unsigned int)__b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_eq_idx_cc(vector unsigned int __a, vector unsigned int __b,
+                       int *__cc) {
+  return __builtin_s390_vfaefs(__a, __b, 0, __cc);
+}
+
+/*-- vec_find_any_eq_or_0_idx -----------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_find_any_eq_or_0_idx(vector signed char __a, vector signed char __b) {
+  return (vector signed char)
+    __builtin_s390_vfaezb((vector unsigned char)__a,
+                          (vector unsigned char)__b, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_eq_or_0_idx(vector bool char __a, vector bool char __b) {
+  return __builtin_s390_vfaezb((vector unsigned char)__a,
+                               (vector unsigned char)__b, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_eq_or_0_idx(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vfaezb(__a, __b, 0);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_find_any_eq_or_0_idx(vector signed short __a, vector signed short __b) {
+  return (vector signed short)
+    __builtin_s390_vfaezh((vector unsigned short)__a,
+                          (vector unsigned short)__b, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_eq_or_0_idx(vector bool short __a, vector bool short __b) {
+  return __builtin_s390_vfaezh((vector unsigned short)__a,
+                               (vector unsigned short)__b, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_eq_or_0_idx(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vfaezh(__a, __b, 0);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_find_any_eq_or_0_idx(vector signed int __a, vector signed int __b) {
+  return (vector signed int)
+    __builtin_s390_vfaezf((vector unsigned int)__a,
+                          (vector unsigned int)__b, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_eq_or_0_idx(vector bool int __a, vector bool int __b) {
+  return __builtin_s390_vfaezf((vector unsigned int)__a,
+                               (vector unsigned int)__b, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_eq_or_0_idx(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vfaezf(__a, __b, 0);
+}
+
+/*-- vec_find_any_eq_or_0_idx_cc --------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_find_any_eq_or_0_idx_cc(vector signed char __a, vector signed char __b,
+                            int *__cc) {
+  return (vector signed char)
+    __builtin_s390_vfaezbs((vector unsigned char)__a,
+                           (vector unsigned char)__b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_eq_or_0_idx_cc(vector bool char __a, vector bool char __b,
+                            int *__cc) {
+  return __builtin_s390_vfaezbs((vector unsigned char)__a,
+                                (vector unsigned char)__b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_eq_or_0_idx_cc(vector unsigned char __a, vector unsigned char __b,
+                            int *__cc) {
+  return __builtin_s390_vfaezbs(__a, __b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_find_any_eq_or_0_idx_cc(vector signed short __a, vector signed short __b,
+                            int *__cc) {
+  return (vector signed short)
+    __builtin_s390_vfaezhs((vector unsigned short)__a,
+                           (vector unsigned short)__b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_eq_or_0_idx_cc(vector bool short __a, vector bool short __b,
+                            int *__cc) {
+  return __builtin_s390_vfaezhs((vector unsigned short)__a,
+                                (vector unsigned short)__b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_eq_or_0_idx_cc(vector unsigned short __a,
+                            vector unsigned short __b, int *__cc) {
+  return __builtin_s390_vfaezhs(__a, __b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_find_any_eq_or_0_idx_cc(vector signed int __a, vector signed int __b,
+                            int *__cc) {
+  return (vector signed int)
+    __builtin_s390_vfaezfs((vector unsigned int)__a,
+                           (vector unsigned int)__b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_eq_or_0_idx_cc(vector bool int __a, vector bool int __b,
+                            int *__cc) {
+  return __builtin_s390_vfaezfs((vector unsigned int)__a,
+                                (vector unsigned int)__b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_eq_or_0_idx_cc(vector unsigned int __a, vector unsigned int __b,
+                            int *__cc) {
+  return __builtin_s390_vfaezfs(__a, __b, 0, __cc);
+}
+
+/*-- vec_find_any_ne --------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_find_any_ne(vector signed char __a, vector signed char __b) {
+  return (vector bool char)
+    __builtin_s390_vfaeb((vector unsigned char)__a,
+                         (vector unsigned char)__b, 12);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_find_any_ne(vector bool char __a, vector bool char __b) {
+  return (vector bool char)
+    __builtin_s390_vfaeb((vector unsigned char)__a,
+                         (vector unsigned char)__b, 12);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_find_any_ne(vector unsigned char __a, vector unsigned char __b) {
+  return (vector bool char)__builtin_s390_vfaeb(__a, __b, 12);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_find_any_ne(vector signed short __a, vector signed short __b) {
+  return (vector bool short)
+    __builtin_s390_vfaeh((vector unsigned short)__a,
+                         (vector unsigned short)__b, 12);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_find_any_ne(vector bool short __a, vector bool short __b) {
+  return (vector bool short)
+    __builtin_s390_vfaeh((vector unsigned short)__a,
+                         (vector unsigned short)__b, 12);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_find_any_ne(vector unsigned short __a, vector unsigned short __b) {
+  return (vector bool short)__builtin_s390_vfaeh(__a, __b, 12);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_find_any_ne(vector signed int __a, vector signed int __b) {
+  return (vector bool int)
+    __builtin_s390_vfaef((vector unsigned int)__a,
+                         (vector unsigned int)__b, 12);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_find_any_ne(vector bool int __a, vector bool int __b) {
+  return (vector bool int)
+    __builtin_s390_vfaef((vector unsigned int)__a,
+                         (vector unsigned int)__b, 12);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_find_any_ne(vector unsigned int __a, vector unsigned int __b) {
+  return (vector bool int)__builtin_s390_vfaef(__a, __b, 12);
+}
+
+/*-- vec_find_any_ne_cc -----------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_find_any_ne_cc(vector signed char __a, vector signed char __b, int *__cc) {
+  return (vector bool char)
+    __builtin_s390_vfaebs((vector unsigned char)__a,
+                          (vector unsigned char)__b, 12, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_find_any_ne_cc(vector bool char __a, vector bool char __b, int *__cc) {
+  return (vector bool char)
+    __builtin_s390_vfaebs((vector unsigned char)__a,
+                          (vector unsigned char)__b, 12, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_find_any_ne_cc(vector unsigned char __a, vector unsigned char __b,
+                   int *__cc) {
+  return (vector bool char)__builtin_s390_vfaebs(__a, __b, 12, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_find_any_ne_cc(vector signed short __a, vector signed short __b,
+                   int *__cc) {
+  return (vector bool short)
+    __builtin_s390_vfaehs((vector unsigned short)__a,
+                          (vector unsigned short)__b, 12, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_find_any_ne_cc(vector bool short __a, vector bool short __b, int *__cc) {
+  return (vector bool short)
+    __builtin_s390_vfaehs((vector unsigned short)__a,
+                          (vector unsigned short)__b, 12, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_find_any_ne_cc(vector unsigned short __a, vector unsigned short __b,
+                   int *__cc) {
+  return (vector bool short)__builtin_s390_vfaehs(__a, __b, 12, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_find_any_ne_cc(vector signed int __a, vector signed int __b, int *__cc) {
+  return (vector bool int)
+    __builtin_s390_vfaefs((vector unsigned int)__a,
+                          (vector unsigned int)__b, 12, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_find_any_ne_cc(vector bool int __a, vector bool int __b, int *__cc) {
+  return (vector bool int)
+    __builtin_s390_vfaefs((vector unsigned int)__a,
+                          (vector unsigned int)__b, 12, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_find_any_ne_cc(vector unsigned int __a, vector unsigned int __b,
+                   int *__cc) {
+  return (vector bool int)__builtin_s390_vfaefs(__a, __b, 12, __cc);
+}
+
+/*-- vec_find_any_ne_idx ----------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_find_any_ne_idx(vector signed char __a, vector signed char __b) {
+  return (vector signed char)
+    __builtin_s390_vfaeb((vector unsigned char)__a,
+                         (vector unsigned char)__b, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_ne_idx(vector bool char __a, vector bool char __b) {
+  return __builtin_s390_vfaeb((vector unsigned char)__a,
+                              (vector unsigned char)__b, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_ne_idx(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vfaeb(__a, __b, 8);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_find_any_ne_idx(vector signed short __a, vector signed short __b) {
+  return (vector signed short)
+    __builtin_s390_vfaeh((vector unsigned short)__a,
+                         (vector unsigned short)__b, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_ne_idx(vector bool short __a, vector bool short __b) {
+  return __builtin_s390_vfaeh((vector unsigned short)__a,
+                              (vector unsigned short)__b, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_ne_idx(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vfaeh(__a, __b, 8);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_find_any_ne_idx(vector signed int __a, vector signed int __b) {
+  return (vector signed int)
+    __builtin_s390_vfaef((vector unsigned int)__a,
+                         (vector unsigned int)__b, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_ne_idx(vector bool int __a, vector bool int __b) {
+  return __builtin_s390_vfaef((vector unsigned int)__a,
+                              (vector unsigned int)__b, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_ne_idx(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vfaef(__a, __b, 8);
+}
+
+/*-- vec_find_any_ne_idx_cc -------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_find_any_ne_idx_cc(vector signed char __a, vector signed char __b,
+                       int *__cc) {
+  return (vector signed char)
+    __builtin_s390_vfaebs((vector unsigned char)__a,
+                          (vector unsigned char)__b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_ne_idx_cc(vector bool char __a, vector bool char __b, int *__cc) {
+  return __builtin_s390_vfaebs((vector unsigned char)__a,
+                               (vector unsigned char)__b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_ne_idx_cc(vector unsigned char __a, vector unsigned char __b,
+                       int *__cc) {
+  return __builtin_s390_vfaebs(__a, __b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_find_any_ne_idx_cc(vector signed short __a, vector signed short __b,
+                       int *__cc) {
+  return (vector signed short)
+    __builtin_s390_vfaehs((vector unsigned short)__a,
+                          (vector unsigned short)__b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_ne_idx_cc(vector bool short __a, vector bool short __b,
+                       int *__cc) {
+  return __builtin_s390_vfaehs((vector unsigned short)__a,
+                               (vector unsigned short)__b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_ne_idx_cc(vector unsigned short __a, vector unsigned short __b,
+                       int *__cc) {
+  return __builtin_s390_vfaehs(__a, __b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_find_any_ne_idx_cc(vector signed int __a, vector signed int __b,
+                       int *__cc) {
+  return (vector signed int)
+    __builtin_s390_vfaefs((vector unsigned int)__a,
+                          (vector unsigned int)__b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_ne_idx_cc(vector bool int __a, vector bool int __b, int *__cc) {
+  return __builtin_s390_vfaefs((vector unsigned int)__a,
+                               (vector unsigned int)__b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_ne_idx_cc(vector unsigned int __a, vector unsigned int __b,
+                       int *__cc) {
+  return __builtin_s390_vfaefs(__a, __b, 8, __cc);
+}
+
+/*-- vec_find_any_ne_or_0_idx -----------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_find_any_ne_or_0_idx(vector signed char __a, vector signed char __b) {
+  return (vector signed char)
+    __builtin_s390_vfaezb((vector unsigned char)__a,
+                          (vector unsigned char)__b, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_ne_or_0_idx(vector bool char __a, vector bool char __b) {
+  return __builtin_s390_vfaezb((vector unsigned char)__a,
+                               (vector unsigned char)__b, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_ne_or_0_idx(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vfaezb(__a, __b, 8);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_find_any_ne_or_0_idx(vector signed short __a, vector signed short __b) {
+  return (vector signed short)
+    __builtin_s390_vfaezh((vector unsigned short)__a,
+                          (vector unsigned short)__b, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_ne_or_0_idx(vector bool short __a, vector bool short __b) {
+  return __builtin_s390_vfaezh((vector unsigned short)__a,
+                               (vector unsigned short)__b, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_ne_or_0_idx(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vfaezh(__a, __b, 8);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_find_any_ne_or_0_idx(vector signed int __a, vector signed int __b) {
+  return (vector signed int)
+    __builtin_s390_vfaezf((vector unsigned int)__a,
+                          (vector unsigned int)__b, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_ne_or_0_idx(vector bool int __a, vector bool int __b) {
+  return __builtin_s390_vfaezf((vector unsigned int)__a,
+                               (vector unsigned int)__b, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_ne_or_0_idx(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vfaezf(__a, __b, 8);
+}
+
+/*-- vec_find_any_ne_or_0_idx_cc --------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_find_any_ne_or_0_idx_cc(vector signed char __a, vector signed char __b,
+                            int *__cc) {
+  return (vector signed char)
+    __builtin_s390_vfaezbs((vector unsigned char)__a,
+                           (vector unsigned char)__b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_ne_or_0_idx_cc(vector bool char __a, vector bool char __b,
+                            int *__cc) {
+  return __builtin_s390_vfaezbs((vector unsigned char)__a,
+                                (vector unsigned char)__b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_ne_or_0_idx_cc(vector unsigned char __a, vector unsigned char __b,
+                            int *__cc) {
+  return __builtin_s390_vfaezbs(__a, __b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_find_any_ne_or_0_idx_cc(vector signed short __a, vector signed short __b,
+                            int *__cc) {
+  return (vector signed short)
+    __builtin_s390_vfaezhs((vector unsigned short)__a,
+                           (vector unsigned short)__b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_ne_or_0_idx_cc(vector bool short __a, vector bool short __b,
+                            int *__cc) {
+  return __builtin_s390_vfaezhs((vector unsigned short)__a,
+                                (vector unsigned short)__b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_ne_or_0_idx_cc(vector unsigned short __a,
+                            vector unsigned short __b, int *__cc) {
+  return __builtin_s390_vfaezhs(__a, __b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_find_any_ne_or_0_idx_cc(vector signed int __a, vector signed int __b,
+                            int *__cc) {
+  return (vector signed int)
+    __builtin_s390_vfaezfs((vector unsigned int)__a,
+                           (vector unsigned int)__b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_ne_or_0_idx_cc(vector bool int __a, vector bool int __b,
+                            int *__cc) {
+  return __builtin_s390_vfaezfs((vector unsigned int)__a,
+                                (vector unsigned int)__b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_ne_or_0_idx_cc(vector unsigned int __a, vector unsigned int __b,
+                            int *__cc) {
+  return __builtin_s390_vfaezfs(__a, __b, 8, __cc);
+}
+
+#undef __constant_pow2_range
+#undef __constant_range
+#undef __constant
+#undef __ATTRS_o
+#undef __ATTRS_o_ai
+#undef __ATTRS_ai
+
+#else
+
+#error "Use -fzvector to enable vector extensions"
+
+#endif
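
For reference, the SystemZ string-search overloads in the vecintrin.h hunk above are consumed roughly like the sketch below (illustrative only; it assumes clang invoked with -fzvector for a z13-class target, which is what keeps the #else/#error branch above from firing, and the "any not-equal" mask semantics of the underlying VFAE instruction):

    /* Minimal sketch, not part of the patch. Build with: clang -fzvector -march=z13 */
    #include <vecintrin.h>

    /* Any-compare "not equal": each result element is all-ones when the
       corresponding element of a compares not-equal against the elements of b. */
    vector bool char find_any_ne_mask(vector unsigned char a,
                                      vector unsigned char b) {
      return vec_find_any_ne(a, b);
    }
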
diff --git a/24.0.3/clang-include/wmmintrin.h b/24.0.3/clang-include/wmmintrin.h
new file mode 100644
index 0000000..a2d9310
--- /dev/null
+++ b/24.0.3/clang-include/wmmintrin.h
@@ -0,0 +1,33 @@
+/*===---- wmmintrin.h - AES intrinsics ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef _WMMINTRIN_H
+#define _WMMINTRIN_H
+
+#include <emmintrin.h>
+
+#include <__wmmintrin_aes.h>
+
+#include <__wmmintrin_pclmul.h>
+
+#endif /* _WMMINTRIN_H */
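
A minimal consumer of the wmmintrin.h hunk above might look like this (a sketch, assuming an x86 target compiled with -maes so the AES-NI builtins pulled in via __wmmintrin_aes.h are usable):

    /* Illustrative only; requires -maes (SSE2 comes in through <emmintrin.h>). */
    #include <wmmintrin.h>

    /* One AES encryption round: SubBytes/ShiftRows/MixColumns + AddRoundKey. */
    __m128i aes_one_round(__m128i state, __m128i round_key) {
      return _mm_aesenc_si128(state, round_key);
    }
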
diff --git a/24.0.3/clang-include/x86intrin.h b/24.0.3/clang-include/x86intrin.h
new file mode 100644
index 0000000..4d8077e
--- /dev/null
+++ b/24.0.3/clang-include/x86intrin.h
@@ -0,0 +1,57 @@
+/*===---- x86intrin.h - X86 intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __X86INTRIN_H
+#define __X86INTRIN_H
+
+#include <ia32intrin.h>
+
+#include <immintrin.h>
+
+#include <mm3dnow.h>
+
+#include <bmiintrin.h>
+
+#include <bmi2intrin.h>
+
+#include <lzcntintrin.h>
+
+#include <popcntintrin.h>
+
+#include <rdseedintrin.h>
+
+#include <prfchwintrin.h>
+
+#include <ammintrin.h>
+
+#include <fma4intrin.h>
+
+#include <xopintrin.h>
+
+#include <tbmintrin.h>
+
+#include <f16cintrin.h>
+
+/* FIXME: LWP */
+
+#endif /* __X86INTRIN_H */
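
Note that several of the headers pulled in here (for example xopintrin.h further down in this change) reject direct inclusion, so client code is expected to go through this umbrella header. A small sketch, assuming a target built with -mxop:

    /* Illustrative only; XOP intrinsics need -mxop on the command line. */
    #include <x86intrin.h>

    /* Per-element byte rotate, defined in xopintrin.h (vprotb). */
    __m128i rotate_bytes(__m128i a, __m128i counts) {
      return _mm_rot_epi8(a, counts);
    }
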
diff --git a/24.0.3/clang-include/xmmintrin.h b/24.0.3/clang-include/xmmintrin.h
new file mode 100644
index 0000000..ae0b2cd
--- /dev/null
+++ b/24.0.3/clang-include/xmmintrin.h
@@ -0,0 +1,1010 @@
+/*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __XMMINTRIN_H
+#define __XMMINTRIN_H
+
+#include <mmintrin.h>
+
+typedef int __v4si __attribute__((__vector_size__(16)));
+typedef float __v4sf __attribute__((__vector_size__(16)));
+typedef float __m128 __attribute__((__vector_size__(16)));
+
+/* This header should only be included in a hosted environment as it depends on
+ * a standard library to provide allocation routines. */
+#if __STDC_HOSTED__
+#include <mm_malloc.h>
+#endif
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse")))
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_add_ss(__m128 __a, __m128 __b)
+{
+  __a[0] += __b[0];
+  return __a;
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_add_ps(__m128 __a, __m128 __b)
+{
+  return __a + __b;
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_sub_ss(__m128 __a, __m128 __b)
+{
+  __a[0] -= __b[0];
+  return __a;
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_sub_ps(__m128 __a, __m128 __b)
+{
+  return __a - __b;
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mul_ss(__m128 __a, __m128 __b)
+{
+  __a[0] *= __b[0];
+  return __a;
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mul_ps(__m128 __a, __m128 __b)
+{
+  return __a * __b;
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_div_ss(__m128 __a, __m128 __b)
+{
+  __a[0] /= __b[0];
+  return __a;
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_div_ps(__m128 __a, __m128 __b)
+{
+  return __a / __b;
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_sqrt_ss(__m128 __a)
+{
+  __m128 __c = __builtin_ia32_sqrtss(__a);
+  return (__m128) { __c[0], __a[1], __a[2], __a[3] };
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_sqrt_ps(__m128 __a)
+{
+  return __builtin_ia32_sqrtps(__a);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_rcp_ss(__m128 __a)
+{
+  __m128 __c = __builtin_ia32_rcpss(__a);
+  return (__m128) { __c[0], __a[1], __a[2], __a[3] };
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_rcp_ps(__m128 __a)
+{
+  return __builtin_ia32_rcpps(__a);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_rsqrt_ss(__m128 __a)
+{
+  __m128 __c = __builtin_ia32_rsqrtss(__a);
+  return (__m128) { __c[0], __a[1], __a[2], __a[3] };
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_rsqrt_ps(__m128 __a)
+{
+  return __builtin_ia32_rsqrtps(__a);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_min_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_minss(__a, __b);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_min_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_minps(__a, __b);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_max_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_maxss(__a, __b);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_max_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_maxps(__a, __b);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_and_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)((__v4si)__a & (__v4si)__b);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_andnot_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)(~(__v4si)__a & (__v4si)__b);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_or_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)((__v4si)__a | (__v4si)__b);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_xor_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)((__v4si)__a ^ (__v4si)__b);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpeq_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpeqss(__a, __b);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpeq_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpeqps(__a, __b);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmplt_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpltss(__a, __b);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmplt_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpltps(__a, __b);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmple_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpless(__a, __b);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmple_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpleps(__a, __b);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpgt_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_shufflevector(__a,
+                                         __builtin_ia32_cmpltss(__b, __a),
+                                         4, 1, 2, 3);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpgt_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpltps(__b, __a);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpge_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_shufflevector(__a,
+                                         __builtin_ia32_cmpless(__b, __a),
+                                         4, 1, 2, 3);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpge_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpleps(__b, __a);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpneq_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpneqss(__a, __b);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpneq_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpneqps(__a, __b);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpnlt_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpnltss(__a, __b);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpnlt_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpnltps(__a, __b);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpnle_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpnless(__a, __b);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpnle_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpnleps(__a, __b);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpngt_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_shufflevector(__a,
+                                         __builtin_ia32_cmpnltss(__b, __a),
+                                         4, 1, 2, 3);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpngt_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpnltps(__b, __a);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpnge_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_shufflevector(__a,
+                                         __builtin_ia32_cmpnless(__b, __a),
+                                         4, 1, 2, 3);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpnge_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpnleps(__b, __a);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpord_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpordss(__a, __b);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpord_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpordps(__a, __b);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpunord_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpunordss(__a, __b);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpunord_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpunordps(__a, __b);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_comieq_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_comieq(__a, __b);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_comilt_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_comilt(__a, __b);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_comile_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_comile(__a, __b);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_comigt_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_comigt(__a, __b);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_comige_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_comige(__a, __b);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_comineq_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_comineq(__a, __b);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_ucomieq_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_ucomieq(__a, __b);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_ucomilt_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_ucomilt(__a, __b);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_ucomile_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_ucomile(__a, __b);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_ucomigt_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_ucomigt(__a, __b);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_ucomige_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_ucomige(__a, __b);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_ucomineq_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_ucomineq(__a, __b);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_cvtss_si32(__m128 __a)
+{
+  return __builtin_ia32_cvtss2si(__a);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_cvt_ss2si(__m128 __a)
+{
+  return _mm_cvtss_si32(__a);
+}
+
+#ifdef __x86_64__
+
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm_cvtss_si64(__m128 __a)
+{
+  return __builtin_ia32_cvtss2si64(__a);
+}
+
+#endif
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cvtps_pi32(__m128 __a)
+{
+  return (__m64)__builtin_ia32_cvtps2pi(__a);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cvt_ps2pi(__m128 __a)
+{
+  return _mm_cvtps_pi32(__a);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_cvttss_si32(__m128 __a)
+{
+  return __a[0];
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_cvtt_ss2si(__m128 __a)
+{
+  return _mm_cvttss_si32(__a);
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm_cvttss_si64(__m128 __a)
+{
+  return __a[0];
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cvttps_pi32(__m128 __a)
+{
+  return (__m64)__builtin_ia32_cvttps2pi(__a);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cvtt_ps2pi(__m128 __a)
+{
+  return _mm_cvttps_pi32(__a);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtsi32_ss(__m128 __a, int __b)
+{
+  __a[0] = __b;
+  return __a;
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvt_si2ss(__m128 __a, int __b)
+{
+  return _mm_cvtsi32_ss(__a, __b);
+}
+
+#ifdef __x86_64__
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtsi64_ss(__m128 __a, long long __b)
+{
+  __a[0] = __b;
+  return __a;
+}
+
+#endif
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtpi32_ps(__m128 __a, __m64 __b)
+{
+  return __builtin_ia32_cvtpi2ps(__a, (__v2si)__b);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvt_pi2ps(__m128 __a, __m64 __b)
+{
+  return _mm_cvtpi32_ps(__a, __b);
+}
+
+static __inline__ float __DEFAULT_FN_ATTRS
+_mm_cvtss_f32(__m128 __a)
+{
+  return __a[0];
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_loadh_pi(__m128 __a, const __m64 *__p)
+{
+  typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
+  struct __mm_loadh_pi_struct {
+    __mm_loadh_pi_v2f32 __u;
+  } __attribute__((__packed__, __may_alias__));
+  __mm_loadh_pi_v2f32 __b = ((struct __mm_loadh_pi_struct*)__p)->__u;
+  __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
+  return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_loadl_pi(__m128 __a, const __m64 *__p)
+{
+  typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
+  struct __mm_loadl_pi_struct {
+    __mm_loadl_pi_v2f32 __u;
+  } __attribute__((__packed__, __may_alias__));
+  __mm_loadl_pi_v2f32 __b = ((struct __mm_loadl_pi_struct*)__p)->__u;
+  __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
+  return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_load_ss(const float *__p)
+{
+  struct __mm_load_ss_struct {
+    float __u;
+  } __attribute__((__packed__, __may_alias__));
+  float __u = ((struct __mm_load_ss_struct*)__p)->__u;
+  return (__m128){ __u, 0, 0, 0 };
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_load1_ps(const float *__p)
+{
+  struct __mm_load1_ps_struct {
+    float __u;
+  } __attribute__((__packed__, __may_alias__));
+  float __u = ((struct __mm_load1_ps_struct*)__p)->__u;
+  return (__m128){ __u, __u, __u, __u };
+}
+
+#define        _mm_load_ps1(p) _mm_load1_ps(p)
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_load_ps(const float *__p)
+{
+  return *(__m128*)__p;
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_loadu_ps(const float *__p)
+{
+  struct __loadu_ps {
+    __m128 __v;
+  } __attribute__((__packed__, __may_alias__));
+  return ((struct __loadu_ps*)__p)->__v;
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_loadr_ps(const float *__p)
+{
+  __m128 __a = _mm_load_ps(__p);
+  return __builtin_shufflevector(__a, __a, 3, 2, 1, 0);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_undefined_ps()
+{
+  return (__m128)__builtin_ia32_undef128();
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_set_ss(float __w)
+{
+  return (__m128){ __w, 0, 0, 0 };
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_set1_ps(float __w)
+{
+  return (__m128){ __w, __w, __w, __w };
+}
+
+/* Microsoft specific. */
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_set_ps1(float __w)
+{
+    return _mm_set1_ps(__w);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_set_ps(float __z, float __y, float __x, float __w)
+{
+  return (__m128){ __w, __x, __y, __z };
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_setr_ps(float __z, float __y, float __x, float __w)
+{
+  return (__m128){ __z, __y, __x, __w };
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_setzero_ps(void)
+{
+  return (__m128){ 0, 0, 0, 0 };
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_storeh_pi(__m64 *__p, __m128 __a)
+{
+  __builtin_ia32_storehps((__v2si *)__p, __a);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_storel_pi(__m64 *__p, __m128 __a)
+{
+  __builtin_ia32_storelps((__v2si *)__p, __a);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_store_ss(float *__p, __m128 __a)
+{
+  struct __mm_store_ss_struct {
+    float __u;
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __mm_store_ss_struct*)__p)->__u = __a[0];
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_storeu_ps(float *__p, __m128 __a)
+{
+  __builtin_ia32_storeups(__p, __a);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_store1_ps(float *__p, __m128 __a)
+{
+  __a = __builtin_shufflevector(__a, __a, 0, 0, 0, 0);
+  _mm_storeu_ps(__p, __a);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_store_ps1(float *__p, __m128 __a)
+{
+    return _mm_store1_ps(__p, __a);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_store_ps(float *__p, __m128 __a)
+{
+  *(__m128 *)__p = __a;
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_storer_ps(float *__p, __m128 __a)
+{
+  __a = __builtin_shufflevector(__a, __a, 3, 2, 1, 0);
+  _mm_store_ps(__p, __a);
+}
+
+#define _MM_HINT_T0 3
+#define _MM_HINT_T1 2
+#define _MM_HINT_T2 1
+#define _MM_HINT_NTA 0
+
+#ifndef _MSC_VER
+/* FIXME: We have to #define this because "sel" must be a constant integer, and
+   Sema doesn't do any form of constant propagation yet. */
+
+#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), 0, (sel)))
+#endif
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_stream_pi(__m64 *__p, __m64 __a)
+{
+  __builtin_ia32_movntq(__p, __a);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_stream_ps(float *__p, __m128 __a)
+{
+  __builtin_ia32_movntps(__p, __a);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_sfence(void)
+{
+  __builtin_ia32_sfence();
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_extract_pi16(__m64 __a, int __n)
+{
+  __v4hi __b = (__v4hi)__a;
+  return (unsigned short)__b[__n & 3];
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_insert_pi16(__m64 __a, int __d, int __n)
+{
+   __v4hi __b = (__v4hi)__a;
+   __b[__n & 3] = __d;
+   return (__m64)__b;
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_max_pi16(__m64 __a, __m64 __b)
+{
+  return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_max_pu8(__m64 __a, __m64 __b)
+{
+  return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_min_pi16(__m64 __a, __m64 __b)
+{
+  return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_min_pu8(__m64 __a, __m64 __b)
+{
+  return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_movemask_pi8(__m64 __a)
+{
+  return __builtin_ia32_pmovmskb((__v8qi)__a);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_mulhi_pu16(__m64 __a, __m64 __b)
+{
+  return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b);
+}
+
+#define _mm_shuffle_pi16(a, n) __extension__ ({ \
+  (__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)); })
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
+{
+  __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_avg_pu8(__m64 __a, __m64 __b)
+{
+  return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_avg_pu16(__m64 __a, __m64 __b)
+{
+  return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_sad_pu8(__m64 __a, __m64 __b)
+{
+  return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_mm_getcsr(void)
+{
+  return __builtin_ia32_stmxcsr();
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_setcsr(unsigned int __i)
+{
+  __builtin_ia32_ldmxcsr(__i);
+}
+
+#define _mm_shuffle_ps(a, b, mask) __extension__ ({ \
+  (__m128)__builtin_shufflevector((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \
+                                  (mask) & 0x3, ((mask) & 0xc) >> 2, \
+                                  (((mask) & 0x30) >> 4) + 4, \
+                                  (((mask) & 0xc0) >> 6) + 4); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_unpackhi_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_shufflevector(__a, __b, 2, 6, 3, 7);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_unpacklo_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_shufflevector(__a, __b, 0, 4, 1, 5);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_move_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_shufflevector(__a, __b, 4, 1, 2, 3);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_movehl_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_shufflevector(__a, __b, 6, 7, 2, 3);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_movelh_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_shufflevector(__a, __b, 0, 1, 4, 5);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtpi16_ps(__m64 __a)
+{
+  __m64 __b, __c;
+  __m128 __r;
+
+  __b = _mm_setzero_si64();
+  __b = _mm_cmpgt_pi16(__b, __a);
+  __c = _mm_unpackhi_pi16(__a, __b);
+  __r = _mm_setzero_ps();
+  __r = _mm_cvtpi32_ps(__r, __c);
+  __r = _mm_movelh_ps(__r, __r);
+  __c = _mm_unpacklo_pi16(__a, __b);
+  __r = _mm_cvtpi32_ps(__r, __c);
+
+  return __r;
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtpu16_ps(__m64 __a)
+{
+  __m64 __b, __c;
+  __m128 __r;
+
+  __b = _mm_setzero_si64();
+  __c = _mm_unpackhi_pi16(__a, __b);
+  __r = _mm_setzero_ps();
+  __r = _mm_cvtpi32_ps(__r, __c);
+  __r = _mm_movelh_ps(__r, __r);
+  __c = _mm_unpacklo_pi16(__a, __b);
+  __r = _mm_cvtpi32_ps(__r, __c);
+
+  return __r;
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtpi8_ps(__m64 __a)
+{
+  __m64 __b;
+
+  __b = _mm_setzero_si64();
+  __b = _mm_cmpgt_pi8(__b, __a);
+  __b = _mm_unpacklo_pi8(__a, __b);
+
+  return _mm_cvtpi16_ps(__b);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtpu8_ps(__m64 __a)
+{
+  __m64 __b;
+
+  __b = _mm_setzero_si64();
+  __b = _mm_unpacklo_pi8(__a, __b);
+
+  return _mm_cvtpi16_ps(__b);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
+{
+  __m128 __c;
+
+  __c = _mm_setzero_ps();
+  __c = _mm_cvtpi32_ps(__c, __b);
+  __c = _mm_movelh_ps(__c, __c);
+
+  return _mm_cvtpi32_ps(__c, __a);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cvtps_pi16(__m128 __a)
+{
+  __m64 __b, __c;
+
+  __b = _mm_cvtps_pi32(__a);
+  __a = _mm_movehl_ps(__a, __a);
+  __c = _mm_cvtps_pi32(__a);
+
+  return _mm_packs_pi32(__b, __c);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cvtps_pi8(__m128 __a)
+{
+  __m64 __b, __c;
+
+  __b = _mm_cvtps_pi16(__a);
+  __c = _mm_setzero_si64();
+
+  return _mm_packs_pi16(__b, __c);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_movemask_ps(__m128 __a)
+{
+  return __builtin_ia32_movmskps(__a);
+}
+
+
+#ifdef _MSC_VER
+#define _MM_ALIGN16 __declspec(align(16))
+#endif
+
+#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
+
+#define _MM_EXCEPT_INVALID    (0x0001)
+#define _MM_EXCEPT_DENORM     (0x0002)
+#define _MM_EXCEPT_DIV_ZERO   (0x0004)
+#define _MM_EXCEPT_OVERFLOW   (0x0008)
+#define _MM_EXCEPT_UNDERFLOW  (0x0010)
+#define _MM_EXCEPT_INEXACT    (0x0020)
+#define _MM_EXCEPT_MASK       (0x003f)
+
+#define _MM_MASK_INVALID      (0x0080)
+#define _MM_MASK_DENORM       (0x0100)
+#define _MM_MASK_DIV_ZERO     (0x0200)
+#define _MM_MASK_OVERFLOW     (0x0400)
+#define _MM_MASK_UNDERFLOW    (0x0800)
+#define _MM_MASK_INEXACT      (0x1000)
+#define _MM_MASK_MASK         (0x1f80)
+
+#define _MM_ROUND_NEAREST     (0x0000)
+#define _MM_ROUND_DOWN        (0x2000)
+#define _MM_ROUND_UP          (0x4000)
+#define _MM_ROUND_TOWARD_ZERO (0x6000)
+#define _MM_ROUND_MASK        (0x6000)
+
+#define _MM_FLUSH_ZERO_MASK   (0x8000)
+#define _MM_FLUSH_ZERO_ON     (0x8000)
+#define _MM_FLUSH_ZERO_OFF    (0x0000)
+
+#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK)
+#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
+#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
+#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK)
+
+#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))
+#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))
+#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x)))
+#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))
+
+#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
+do { \
+  __m128 tmp3, tmp2, tmp1, tmp0; \
+  tmp0 = _mm_unpacklo_ps((row0), (row1)); \
+  tmp2 = _mm_unpacklo_ps((row2), (row3)); \
+  tmp1 = _mm_unpackhi_ps((row0), (row1)); \
+  tmp3 = _mm_unpackhi_ps((row2), (row3)); \
+  (row0) = _mm_movelh_ps(tmp0, tmp2); \
+  (row1) = _mm_movehl_ps(tmp2, tmp0); \
+  (row2) = _mm_movelh_ps(tmp1, tmp3); \
+  (row3) = _mm_movehl_ps(tmp3, tmp1); \
+} while (0)
+
+/* Aliases for compatibility. */
+#define _m_pextrw _mm_extract_pi16
+#define _m_pinsrw _mm_insert_pi16
+#define _m_pmaxsw _mm_max_pi16
+#define _m_pmaxub _mm_max_pu8
+#define _m_pminsw _mm_min_pi16
+#define _m_pminub _mm_min_pu8
+#define _m_pmovmskb _mm_movemask_pi8
+#define _m_pmulhuw _mm_mulhi_pu16
+#define _m_pshufw _mm_shuffle_pi16
+#define _m_maskmovq _mm_maskmove_si64
+#define _m_pavgb _mm_avg_pu8
+#define _m_pavgw _mm_avg_pu16
+#define _m_psadbw _mm_sad_pu8
+#define _m_ _mm_
+#define _m_ _mm_
+
+#undef __DEFAULT_FN_ATTRS
+
+/* Ugly hack for backwards-compatibility (compatible with gcc) */
+#if defined(__SSE2__) && !__has_feature(modules)
+#include <emmintrin.h>
+#endif
+
+#endif /* __XMMINTRIN_H */
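
For reference, a minimal use of the SSE header above (a sketch; assumes an SSE-capable x86 target, which clang enables by default on x86-64 or with -msse):

    /* Illustrative only, not part of the patch. */
    #include <xmmintrin.h>

    /* Element-wise add of two 4-float vectors through unaligned loads/stores. */
    void add4(const float *a, const float *b, float *out) {
      __m128 va = _mm_loadu_ps(a);
      __m128 vb = _mm_loadu_ps(b);
      _mm_storeu_ps(out, _mm_add_ps(va, vb));
    }
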
diff --git a/24.0.3/clang-include/xopintrin.h b/24.0.3/clang-include/xopintrin.h
new file mode 100644
index 0000000..f07f51c
--- /dev/null
+++ b/24.0.3/clang-include/xopintrin.h
@@ -0,0 +1,782 @@
+/*===---- xopintrin.h - XOP intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __X86INTRIN_H
+#error "Never use <xopintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __XOPINTRIN_H
+#define __XOPINTRIN_H
+
+#include <fma4intrin.h>
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("xop")))
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maccs_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacssww((__v8hi)__A, (__v8hi)__B, (__v8hi)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_macc_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacsww((__v8hi)__A, (__v8hi)__B, (__v8hi)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maccsd_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacsswd((__v8hi)__A, (__v8hi)__B, (__v4si)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maccd_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacswd((__v8hi)__A, (__v8hi)__B, (__v4si)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maccs_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacssdd((__v4si)__A, (__v4si)__B, (__v4si)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_macc_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacsdd((__v4si)__A, (__v4si)__B, (__v4si)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maccslo_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacssdql((__v4si)__A, (__v4si)__B, (__v2di)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_macclo_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacsdql((__v4si)__A, (__v4si)__B, (__v2di)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maccshi_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacssdqh((__v4si)__A, (__v4si)__B, (__v2di)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_macchi_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacsdqh((__v4si)__A, (__v4si)__B, (__v2di)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maddsd_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmadcsswd((__v8hi)__A, (__v8hi)__B, (__v4si)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maddd_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmadcswd((__v8hi)__A, (__v8hi)__B, (__v4si)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_haddw_epi8(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphaddbw((__v16qi)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_haddd_epi8(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphaddbd((__v16qi)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_haddq_epi8(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphaddbq((__v16qi)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_haddd_epi16(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphaddwd((__v8hi)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_haddq_epi16(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphaddwq((__v8hi)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_haddq_epi32(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphadddq((__v4si)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_haddw_epu8(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphaddubw((__v16qi)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_haddd_epu8(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphaddubd((__v16qi)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_haddq_epu8(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphaddubq((__v16qi)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_haddd_epu16(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphadduwd((__v8hi)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_haddq_epu16(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphadduwq((__v8hi)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_haddq_epu32(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphaddudq((__v4si)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_hsubw_epi8(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphsubbw((__v16qi)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_hsubd_epi16(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphsubwd((__v8hi)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_hsubq_epi32(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphsubdq((__v4si)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cmov_si128(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpcmov(__A, __B, __C);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cmov_si256(__m256i __A, __m256i __B, __m256i __C)
+{
+  return (__m256i)__builtin_ia32_vpcmov_256(__A, __B, __C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_perm_epi8(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpperm((__v16qi)__A, (__v16qi)__B, (__v16qi)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_rot_epi8(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vprotb((__v16qi)__A, (__v16qi)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_rot_epi16(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vprotw((__v8hi)__A, (__v8hi)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_rot_epi32(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vprotd((__v4si)__A, (__v4si)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_rot_epi64(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vprotq((__v2di)__A, (__v2di)__B);
+}
+
+#define _mm_roti_epi8(A, N) __extension__ ({ \
+  (__m128i)__builtin_ia32_vprotbi((__v16qi)(__m128i)(A), (N)); })
+
+#define _mm_roti_epi16(A, N) __extension__ ({ \
+  (__m128i)__builtin_ia32_vprotwi((__v8hi)(__m128i)(A), (N)); })
+
+#define _mm_roti_epi32(A, N) __extension__ ({ \
+  (__m128i)__builtin_ia32_vprotdi((__v4si)(__m128i)(A), (N)); })
+
+#define _mm_roti_epi64(A, N) __extension__ ({ \
+  (__m128i)__builtin_ia32_vprotqi((__v2di)(__m128i)(A), (N)); })
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_shl_epi8(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpshlb((__v16qi)__A, (__v16qi)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_shl_epi16(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpshlw((__v8hi)__A, (__v8hi)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_shl_epi32(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpshld((__v4si)__A, (__v4si)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_shl_epi64(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpshlq((__v2di)__A, (__v2di)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sha_epi8(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpshab((__v16qi)__A, (__v16qi)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sha_epi16(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpshaw((__v8hi)__A, (__v8hi)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sha_epi32(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpshad((__v4si)__A, (__v4si)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sha_epi64(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpshaq((__v2di)__A, (__v2di)__B);
+}
+
+#define _mm_com_epu8(A, B, N) __extension__ ({ \
+  (__m128i)__builtin_ia32_vpcomub((__v16qi)(__m128i)(A), \
+                                  (__v16qi)(__m128i)(B), (N)); })
+
+#define _mm_com_epu16(A, B, N) __extension__ ({ \
+  (__m128i)__builtin_ia32_vpcomuw((__v8hi)(__m128i)(A), \
+                                  (__v8hi)(__m128i)(B), (N)); })
+
+#define _mm_com_epu32(A, B, N) __extension__ ({ \
+  (__m128i)__builtin_ia32_vpcomud((__v4si)(__m128i)(A), \
+                                  (__v4si)(__m128i)(B), (N)); })
+
+#define _mm_com_epu64(A, B, N) __extension__ ({ \
+  (__m128i)__builtin_ia32_vpcomuq((__v2di)(__m128i)(A), \
+                                  (__v2di)(__m128i)(B), (N)); })
+
+#define _mm_com_epi8(A, B, N) __extension__ ({ \
+  (__m128i)__builtin_ia32_vpcomb((__v16qi)(__m128i)(A), \
+                                 (__v16qi)(__m128i)(B), (N)); })
+
+#define _mm_com_epi16(A, B, N) __extension__ ({ \
+  (__m128i)__builtin_ia32_vpcomw((__v8hi)(__m128i)(A), \
+                                 (__v8hi)(__m128i)(B), (N)); })
+
+#define _mm_com_epi32(A, B, N) __extension__ ({ \
+  (__m128i)__builtin_ia32_vpcomd((__v4si)(__m128i)(A), \
+                                 (__v4si)(__m128i)(B), (N)); })
+
+#define _mm_com_epi64(A, B, N) __extension__ ({ \
+  (__m128i)__builtin_ia32_vpcomq((__v2di)(__m128i)(A), \
+                                 (__v2di)(__m128i)(B), (N)); })
+
+#define _MM_PCOMCTRL_LT    0
+#define _MM_PCOMCTRL_LE    1
+#define _MM_PCOMCTRL_GT    2
+#define _MM_PCOMCTRL_GE    3
+#define _MM_PCOMCTRL_EQ    4
+#define _MM_PCOMCTRL_NEQ   5
+#define _MM_PCOMCTRL_FALSE 6
+#define _MM_PCOMCTRL_TRUE  7
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comlt_epu8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_LT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comle_epu8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_LE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comgt_epu8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_GT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comge_epu8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_GE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comeq_epu8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_EQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comneq_epu8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_NEQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comfalse_epu8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_FALSE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comtrue_epu8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_TRUE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comlt_epu16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_LT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comle_epu16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_LE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comgt_epu16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_GT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comge_epu16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_GE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comeq_epu16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_EQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comneq_epu16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_NEQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comfalse_epu16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_FALSE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comtrue_epu16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_TRUE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comlt_epu32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_LT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comle_epu32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_LE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comgt_epu32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_GT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comge_epu32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_GE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comeq_epu32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_EQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comneq_epu32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_NEQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comfalse_epu32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_FALSE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comtrue_epu32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_TRUE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comlt_epu64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_LT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comle_epu64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_LE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comgt_epu64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_GT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comge_epu64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_GE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comeq_epu64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_EQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comneq_epu64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_NEQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comfalse_epu64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_FALSE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comtrue_epu64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_TRUE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comlt_epi8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_LT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comle_epi8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_LE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comgt_epi8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_GT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comge_epi8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_GE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comeq_epi8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_EQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comneq_epi8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_NEQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comfalse_epi8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_FALSE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comtrue_epi8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_TRUE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comlt_epi16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_LT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comle_epi16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_LE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comgt_epi16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_GT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comge_epi16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_GE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comeq_epi16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_EQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comneq_epi16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_NEQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comfalse_epi16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_FALSE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comtrue_epi16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_TRUE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comlt_epi32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_LT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comle_epi32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_LE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comgt_epi32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_GT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comge_epi32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_GE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comeq_epi32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_EQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comneq_epi32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_NEQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comfalse_epi32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_FALSE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comtrue_epi32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_TRUE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comlt_epi64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_LT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comle_epi64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_LE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comgt_epi64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_GT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comge_epi64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_GE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comeq_epi64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_EQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comneq_epi64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_NEQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comfalse_epi64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_FALSE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comtrue_epi64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_TRUE);
+}
+
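+/* Two-source permutes (XOP VPERMIL2PS/VPERMIL2PD): each result element is
+ * selected from X or Y according to the control vector C, with the immediate I
+ * controlling optional zeroing of elements. */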
+#define _mm_permute2_pd(X, Y, C, I) __extension__ ({ \
+  (__m128d)__builtin_ia32_vpermil2pd((__v2df)(__m128d)(X), \
+                                     (__v2df)(__m128d)(Y), \
+                                     (__v2di)(__m128i)(C), (I)); })
+
+#define _mm256_permute2_pd(X, Y, C, I) __extension__ ({ \
+  (__m256d)__builtin_ia32_vpermil2pd256((__v4df)(__m256d)(X), \
+                                        (__v4df)(__m256d)(Y), \
+                                        (__v4di)(__m256i)(C), (I)); })
+
+#define _mm_permute2_ps(X, Y, C, I) __extension__ ({ \
+  (__m128)__builtin_ia32_vpermil2ps((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), \
+                                    (__v4si)(__m128i)(C), (I)); })
+
+#define _mm256_permute2_ps(X, Y, C, I) __extension__ ({ \
+  (__m256)__builtin_ia32_vpermil2ps256((__v8sf)(__m256)(X), \
+                                       (__v8sf)(__m256)(Y), \
+                                       (__v8si)(__m256i)(C), (I)); })
+
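+/* VFRCZ* intrinsics: each function below returns the fractional part of the
+ * floating-point element(s) of its argument. */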
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_frcz_ss(__m128 __A)
+{
+  return (__m128)__builtin_ia32_vfrczss((__v4sf)__A);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_frcz_sd(__m128d __A)
+{
+  return (__m128d)__builtin_ia32_vfrczsd((__v2df)__A);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_frcz_ps(__m128 __A)
+{
+  return (__m128)__builtin_ia32_vfrczps((__v4sf)__A);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_frcz_pd(__m128d __A)
+{
+  return (__m128d)__builtin_ia32_vfrczpd((__v2df)__A);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_frcz_ps(__m256 __A)
+{
+  return (__m256)__builtin_ia32_vfrczps256((__v8sf)__A);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_frcz_pd(__m256d __A)
+{
+  return (__m256d)__builtin_ia32_vfrczpd256((__v4df)__A);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __XOPINTRIN_H */
diff --git a/24.0.3/clang-include/xsavecintrin.h b/24.0.3/clang-include/xsavecintrin.h
new file mode 100644
index 0000000..598470a
--- /dev/null
+++ b/24.0.3/clang-include/xsavecintrin.h
@@ -0,0 +1,48 @@
+/*===---- xsavecintrin.h - XSAVEC intrinsic ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <xsavecintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __XSAVECINTRIN_H
+#define __XSAVECINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__,  __target__("xsavec")))
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_xsavec(void *__p, unsigned long long __m) {
+  __builtin_ia32_xsavec(__p, __m);
+}
+
+#ifdef __x86_64__
+static __inline__ void __DEFAULT_FN_ATTRS
+_xsavec64(void *__p, unsigned long long __m) {
+  __builtin_ia32_xsavec64(__p, __m);
+}
+#endif
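+
+/* Illustrative usage sketch (editorial addition, not part of the original
+ * header): _xsavec saves the state components selected by the mask __m into
+ * the XSAVE area at __p using the compacted format.  The area must be 64-byte
+ * aligned; the size used below is only an assumption.
+ *
+ *   _Alignas(64) char area[4096] = {0};
+ *   _xsavec(area, 0x7ULL);   // x87, SSE and AVX state components
+ */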
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
diff --git a/24.0.3/clang-include/xsaveintrin.h b/24.0.3/clang-include/xsaveintrin.h
new file mode 100644
index 0000000..a2e6b2e
--- /dev/null
+++ b/24.0.3/clang-include/xsaveintrin.h
@@ -0,0 +1,58 @@
+/*===---- xsaveintrin.h - XSAVE intrinsic ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <xsaveintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __XSAVEINTRIN_H
+#define __XSAVEINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__,  __target__("xsave")))
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_xsave(void *__p, unsigned long long __m) {
+  __builtin_ia32_xsave(__p, __m);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_xrstor(void *__p, unsigned long long __m) {
+  __builtin_ia32_xrstor(__p, __m);
+}
+
+#ifdef __x86_64__
+static __inline__ void __DEFAULT_FN_ATTRS
+_xsave64(void *__p, unsigned long long __m) {
+  __builtin_ia32_xsave64(__p, __m);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_xrstor64(void *__p, unsigned long long __m) {
+  __builtin_ia32_xrstor64(__p, __m);
+}
+#endif
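+
+/* Illustrative usage sketch (editorial addition, not part of the original
+ * header): _xsave stores the state components selected by __m into the XSAVE
+ * area at __p, and _xrstor loads them back.  The area must be 64-byte aligned
+ * and zero-initialized before its first use; the size is only an assumption.
+ *
+ *   _Alignas(64) char area[4096] = {0};
+ *   _xsave(area, 0x3ULL);    // save x87 and SSE state
+ *   _xrstor(area, 0x3ULL);   // restore the same components
+ */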
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
diff --git a/24.0.3/clang-include/xsaveoptintrin.h b/24.0.3/clang-include/xsaveoptintrin.h
new file mode 100644
index 0000000..d3faae7
--- /dev/null
+++ b/24.0.3/clang-include/xsaveoptintrin.h
@@ -0,0 +1,48 @@
+/*===---- xsaveoptintrin.h - XSAVEOPT intrinsic ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <xsaveoptintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __XSAVEOPTINTRIN_H
+#define __XSAVEOPTINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__,  __target__("xsaveopt")))
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_xsaveopt(void *__p, unsigned long long __m) {
+  __builtin_ia32_xsaveopt(__p, __m);
+}
+
+#ifdef __x86_64__
+static __inline__ void __DEFAULT_FN_ATTRS
+_xsaveopt64(void *__p, unsigned long long __m) {
+  __builtin_ia32_xsaveopt64(__p, __m);
+}
+#endif
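+
+/* Illustrative usage sketch (editorial addition, not part of the original
+ * header): _xsaveopt behaves like _xsave but may skip state components that
+ * the processor knows are unmodified.  Alignment and initialization caveats
+ * match _xsave; the buffer size is only an assumption.
+ *
+ *   _Alignas(64) char area[4096] = {0};
+ *   _xsaveopt(area, 0x7ULL);   // x87, SSE and AVX state components
+ */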
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
diff --git a/24.0.3/clang-include/xsavesintrin.h b/24.0.3/clang-include/xsavesintrin.h
new file mode 100644
index 0000000..c5e540a
--- /dev/null
+++ b/24.0.3/clang-include/xsavesintrin.h
@@ -0,0 +1,58 @@
+/*===---- xsavesintrin.h - XSAVES intrinsic ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <xsavesintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __XSAVESINTRIN_H
+#define __XSAVESINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__,  __target__("xsaves")))
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_xsaves(void *__p, unsigned long long __m) {
+  __builtin_ia32_xsaves(__p, __m);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_xrstors(void *__p, unsigned long long __m) {
+  __builtin_ia32_xrstors(__p, __m);
+}
+
+#ifdef __x86_64__
+static __inline__ void __DEFAULT_FN_ATTRS
+_xrstors64(void *__p, unsigned long long __m) {
+  __builtin_ia32_xrstors64(__p, __m);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_xsaves64(void *__p, unsigned long long __m) {
+  __builtin_ia32_xsaves64(__p, __m);
+}
+#endif
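+
+/* Editorial note: XSAVES/XRSTORS are privileged instructions, so _xsaves and
+ * _xrstors can only execute at CPL 0 (e.g. in kernel code).  A minimal sketch,
+ * with the buffer size only an assumption:
+ *
+ *   _Alignas(64) char area[4096] = {0};
+ *   _xsaves(area, 0x7ULL);    // save the components selected by the mask
+ *   _xrstors(area, 0x7ULL);
+ */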
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
diff --git a/24.0.3/clang-include/xtestintrin.h b/24.0.3/clang-include/xtestintrin.h
new file mode 100644
index 0000000..9d3378f
--- /dev/null
+++ b/24.0.3/clang-include/xtestintrin.h
@@ -0,0 +1,41 @@
+/*===---- xtestintrin.h - XTEST intrinsic ---------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <xtestintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __XTESTINTRIN_H
+#define __XTESTINTRIN_H
+
+/* xtest returns non-zero if it is executed within an RTM transaction or an
+ * active HLE region. */
+/* FIXME: This can apply to either RTM or HLE; deal with this once HLE is
+ * supported. */
+static __inline__ int
+    __attribute__((__always_inline__, __nodebug__, __target__("rtm")))
+    _xtest(void) {
+  return __builtin_ia32_xtest();
+}
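+
+/* Illustrative usage sketch (editorial addition, not part of the original
+ * header): _xtest is typically paired with the RTM intrinsics from
+ * <rtmintrin.h>.
+ *
+ *   if (_xbegin() == _XBEGIN_STARTED) {
+ *     int in_txn = _xtest();   // non-zero inside the transaction
+ *     _xend();
+ *   }
+ */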
+
+#endif
diff --git a/24.0.3/include/rs_allocation_create.rsh b/24.0.3/include/rs_allocation_create.rsh
new file mode 100644
index 0000000..d7f9fd6
--- /dev/null
+++ b/24.0.3/include/rs_allocation_create.rsh
@@ -0,0 +1,1345 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Don't edit this file!  It is auto-generated by frameworks/rs/api/generate.sh.
+
+/*
+ * rs_allocation_create.rsh: Allocation Creation Functions
+ *
+ * The functions below can be used to create Allocations from a Script.
+ *
+ * These functions can be called directly or indirectly from an invokable
+ * function.  If some control-flow path can result in a call to these functions
+ * from a RenderScript kernel function, a compiler error will be generated.
+ */
+
+#ifndef RENDERSCRIPT_RS_ALLOCATION_CREATE_RSH
+#define RENDERSCRIPT_RS_ALLOCATION_CREATE_RSH
+
+/*
+ * rsCreateElement: Creates an rs_element object of the specified data type
+ *
+ *  Creates an rs_element object of the specified data type.  The data kind of
+ *  the Element will be set to RS_KIND_USER and vector_width will be set to 1,
+ *  indicating non-vector.
+ *
+ * Parameters:
+ *   data_type: Data type of the Element
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern rs_element __attribute__((overloadable))
+    rsCreateElement(rs_data_type data_type);
+#endif
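+
+/*
+ * Editorial example (illustrative sketch, not part of the generated reference):
+ * creating a scalar float Element.
+ *
+ *   rs_element e = rsCreateElement(RS_TYPE_FLOAT_32);
+ */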
+
+/*
+ * rsCreateVectorElement: Creates an rs_element object of the specified data type and vector width
+ *
+ *  Creates an rs_element object of the specified data type and vector width.
+ *  Value of vector_width must be 2, 3 or 4.  The data kind of the Element will
+ *  be set to RS_KIND_USER.
+ *
+ * Parameters:
+ *   data_type: Data type of the Element
+ *   vector_width: Vector width (either 2, 3, or 4)
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern rs_element __attribute__((overloadable))
+    rsCreateVectorElement(rs_data_type data_type, uint32_t vector_width);
+#endif
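+
+/*
+ * Editorial example (illustrative sketch): a float4 Element.
+ *
+ *   rs_element e4 = rsCreateVectorElement(RS_TYPE_FLOAT_32, 4);
+ */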
+
+/*
+ * rsCreatePixelElement: Creates an rs_element object of the specified data type and data kind
+ *
+ *  Creates an rs_element object of the specified data type and data kind.  The
+ *  vector_width of the Element will be set to 1, indicating non-vector.
+ *
+ * Parameters:
+ *   data_type: Data type of the Element
+ *   data_kind: Data kind of the Element
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern rs_element __attribute__((overloadable))
+    rsCreatePixelElement(rs_data_type data_type, rs_data_kind data_kind);
+#endif
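+
+/*
+ * Editorial example (illustrative sketch, assuming the RS_KIND_PIXEL_RGBA data
+ * kind): an RGBA_8888-style pixel Element.
+ *
+ *   rs_element px = rsCreatePixelElement(RS_TYPE_UNSIGNED_8, RS_KIND_PIXEL_RGBA);
+ */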
+
+/*
+ * rsCreateType: Creates an rs_type object with the specified Element and shape attributes
+ *
+ *  Creates an rs_type object with the specified Element and shape attributes.
+ *
+ *  dimX specifies the size of the X dimension.
+ *
+ *  dimY, if present and non-zero, indicates that the Y dimension is present and
+ *  gives its size.
+ *
+ *  dimZ, if present and non-zero, indicates that the Z dimension is present and
+ *  gives its size.
+ *
+ *  mipmaps indicates the presence of level of detail (LOD).
+ *
+ *  faces indicates the presence of cubemap faces.
+ *
+ *  yuv_format indicates the associated YUV format (or RS_YUV_NONE).
+ *
+ * Parameters:
+ *   element: Element to be associated with the Type
+ *   dimX: Size along the X dimension
+ *   dimY: Size along the Y dimension
+ *   dimZ: Size along the Z dimension
+ *   mipmaps: Flag indicating if the Type has a mipmap chain
+ *   faces: Flag indicating if the Type is a cubemap
+ *   yuv_format: YUV layout for the Type
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern rs_type __attribute__((overloadable))
+    rsCreateType(rs_element element, uint32_t dimX, uint32_t dimY, uint32_t dimZ, bool mipmaps,
+                 bool faces, rs_yuv_format yuv_format);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern rs_type __attribute__((overloadable))
+    rsCreateType(rs_element element, uint32_t dimX, uint32_t dimY, uint32_t dimZ);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern rs_type __attribute__((overloadable))
+    rsCreateType(rs_element element, uint32_t dimX, uint32_t dimY);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern rs_type __attribute__((overloadable))
+    rsCreateType(rs_element element, uint32_t dimX);
+#endif
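+
+/*
+ * Editorial example (illustrative sketch): a 2D Type for a 640x480 grid of
+ * uchar4 cells, with no Z dimension, mipmaps, cubemap faces, or YUV format.
+ *
+ *   rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_8, 4);
+ *   rs_type t = rsCreateType(e, 640, 480, 0, false, false, RS_YUV_NONE);
+ */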
+
+/*
+ * rsCreateAllocation: Creates an rs_allocation object of the given Type.
+ *
+ *  Creates an rs_allocation object of the given Type and usage.
+ *
+ *  RS_ALLOCATION_USAGE_SCRIPT and RS_ALLOCATION_USAGE_GRAPHICS_TEXTURE are the
+ *  only supported usage flags for Allocations created from within a RenderScript
+ *  Script.
+ *
+ *  You can also use rsCreateAllocation_ wrapper functions to directly
+ *  create Allocations of scalar and vector numerical types without creating
+ *  intermediate rs_element or rs_type objects.
+ *
+ *  E.g., rsCreateAllocation_int4() returns an Allocation of the int4 data type
+ *  with the specified dimensions.
+ *
+ * Parameters:
+ *   type: Type of the Allocation
+ *   usage: Usage flag for the allocation
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern rs_allocation __attribute__((overloadable))
+    rsCreateAllocation(rs_type type, uint32_t usage);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern rs_allocation __attribute__((overloadable))
+    rsCreateAllocation(rs_type type);
+#endif
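+
+/*
+ * Editorial example (illustrative sketch): creating a 1024-element float4
+ * Allocation, first explicitly and then via the rsCreateAllocation_float4()
+ * wrapper defined below.
+ *
+ *   rs_element e = rsCreateVectorElement(RS_TYPE_FLOAT_32, 4);
+ *   rs_type t = rsCreateType(e, 1024);
+ *   rs_allocation a = rsCreateAllocation(t, RS_ALLOCATION_USAGE_SCRIPT);
+ *
+ *   rs_allocation b = rsCreateAllocation_float4(1024);
+ */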
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_half(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateElement(RS_TYPE_FLOAT_16);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_float(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateElement(RS_TYPE_FLOAT_32);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_double(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateElement(RS_TYPE_FLOAT_64);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_char(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateElement(RS_TYPE_SIGNED_8);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_uchar(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateElement(RS_TYPE_UNSIGNED_8);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_short(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateElement(RS_TYPE_SIGNED_16);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_ushort(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateElement(RS_TYPE_UNSIGNED_16);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_int(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateElement(RS_TYPE_SIGNED_32);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_uint(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateElement(RS_TYPE_UNSIGNED_32);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_long(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateElement(RS_TYPE_SIGNED_64);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_ulong(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateElement(RS_TYPE_UNSIGNED_64);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_half2(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_FLOAT_16, 2);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_half3(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_FLOAT_16, 3);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_half4(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_FLOAT_16, 4);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_float2(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_FLOAT_32, 2);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_float3(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_FLOAT_32, 3);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_float4(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_FLOAT_32, 4);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_double2(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_FLOAT_64, 2);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_double3(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_FLOAT_64, 3);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_double4(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_FLOAT_64, 4);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_char2(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_8, 2);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_char3(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_8, 3);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_char4(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_8, 4);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_uchar2(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_8, 2);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_uchar3(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_8, 3);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_uchar4(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_8, 4);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_short2(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_16, 2);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_short3(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_16, 3);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_short4(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_16, 4);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_ushort2(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_16, 2);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_ushort3(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_16, 3);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_ushort4(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_16, 4);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_int2(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_32, 2);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_int3(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_32, 3);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_int4(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_32, 4);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_uint2(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_32, 2);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_uint3(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_32, 3);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_uint4(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_32, 4);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_long2(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_64, 2);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_long3(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_64, 3);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_long4(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_64, 4);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_ulong2(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_64, 2);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_ulong3(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_64, 3);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_ulong4(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_64, 4);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_half(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateElement(RS_TYPE_FLOAT_16);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_float(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateElement(RS_TYPE_FLOAT_32);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_double(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateElement(RS_TYPE_FLOAT_64);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_char(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateElement(RS_TYPE_SIGNED_8);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_uchar(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateElement(RS_TYPE_UNSIGNED_8);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_short(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateElement(RS_TYPE_SIGNED_16);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_ushort(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateElement(RS_TYPE_UNSIGNED_16);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_int(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateElement(RS_TYPE_SIGNED_32);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_uint(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateElement(RS_TYPE_UNSIGNED_32);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_long(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateElement(RS_TYPE_SIGNED_64);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_ulong(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateElement(RS_TYPE_UNSIGNED_64);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_half2(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_FLOAT_16, 2);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_half3(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_FLOAT_16, 3);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_half4(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_FLOAT_16, 4);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_float2(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_FLOAT_32, 2);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_float3(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_FLOAT_32, 3);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_float4(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_FLOAT_32, 4);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_double2(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_FLOAT_64, 2);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_double3(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_FLOAT_64, 3);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_double4(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_FLOAT_64, 4);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_char2(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_8, 2);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_char3(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_8, 3);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_char4(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_8, 4);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_uchar2(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_8, 2);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_uchar3(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_8, 3);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_uchar4(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_8, 4);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_short2(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_16, 2);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_short3(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_16, 3);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_short4(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_16, 4);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_ushort2(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_16, 2);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_ushort3(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_16, 3);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_ushort4(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_16, 4);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_int2(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_32, 2);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_int3(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_32, 3);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_int4(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_32, 4);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_uint2(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_32, 2);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_uint3(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_32, 3);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_uint4(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_32, 4);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_long2(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_64, 2);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_long3(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_64, 3);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_long4(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_64, 4);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_ulong2(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_64, 2);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_ulong3(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_64, 3);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_ulong4(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_64, 4);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_half(uint32_t dimX) {
+     rs_element e = rsCreateElement(RS_TYPE_FLOAT_16);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_float(uint32_t dimX) {
+     rs_element e = rsCreateElement(RS_TYPE_FLOAT_32);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_double(uint32_t dimX) {
+     rs_element e = rsCreateElement(RS_TYPE_FLOAT_64);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_char(uint32_t dimX) {
+     rs_element e = rsCreateElement(RS_TYPE_SIGNED_8);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_uchar(uint32_t dimX) {
+     rs_element e = rsCreateElement(RS_TYPE_UNSIGNED_8);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_short(uint32_t dimX) {
+     rs_element e = rsCreateElement(RS_TYPE_SIGNED_16);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_ushort(uint32_t dimX) {
+     rs_element e = rsCreateElement(RS_TYPE_UNSIGNED_16);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_int(uint32_t dimX) {
+     rs_element e = rsCreateElement(RS_TYPE_SIGNED_32);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_uint(uint32_t dimX) {
+     rs_element e = rsCreateElement(RS_TYPE_UNSIGNED_32);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_long(uint32_t dimX) {
+     rs_element e = rsCreateElement(RS_TYPE_SIGNED_64);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_ulong(uint32_t dimX) {
+     rs_element e = rsCreateElement(RS_TYPE_UNSIGNED_64);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_half2(uint32_t dimX) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_FLOAT_16, 2);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_half3(uint32_t dimX) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_FLOAT_16, 3);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_half4(uint32_t dimX) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_FLOAT_16, 4);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_float2(uint32_t dimX) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_FLOAT_32, 2);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_float3(uint32_t dimX) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_FLOAT_32, 3);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_float4(uint32_t dimX) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_FLOAT_32, 4);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_double2(uint32_t dimX) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_FLOAT_64, 2);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_double3(uint32_t dimX) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_FLOAT_64, 3);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_double4(uint32_t dimX) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_FLOAT_64, 4);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_char2(uint32_t dimX) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_8, 2);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_char3(uint32_t dimX) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_8, 3);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_char4(uint32_t dimX) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_8, 4);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_uchar2(uint32_t dimX) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_8, 2);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_uchar3(uint32_t dimX) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_8, 3);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_uchar4(uint32_t dimX) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_8, 4);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_short2(uint32_t dimX) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_16, 2);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_short3(uint32_t dimX) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_16, 3);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_short4(uint32_t dimX) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_16, 4);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_ushort2(uint32_t dimX) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_16, 2);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_ushort3(uint32_t dimX) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_16, 3);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_ushort4(uint32_t dimX) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_16, 4);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_int2(uint32_t dimX) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_32, 2);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_int3(uint32_t dimX) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_32, 3);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_int4(uint32_t dimX) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_32, 4);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_uint2(uint32_t dimX) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_32, 2);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_uint3(uint32_t dimX) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_32, 3);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_uint4(uint32_t dimX) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_32, 4);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_long2(uint32_t dimX) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_64, 2);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_long3(uint32_t dimX) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_64, 3);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_long4(uint32_t dimX) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_64, 4);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_ulong2(uint32_t dimX) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_64, 2);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_ulong3(uint32_t dimX) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_64, 3);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_ulong4(uint32_t dimX) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_64, 4);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#endif // RENDERSCRIPT_RS_ALLOCATION_CREATE_RSH
diff --git a/24.0.3/include/rs_allocation_data.rsh b/24.0.3/include/rs_allocation_data.rsh
new file mode 100644
index 0000000..ea16767
--- /dev/null
+++ b/24.0.3/include/rs_allocation_data.rsh
@@ -0,0 +1,3365 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Don't edit this file!  It is auto-generated by frameworks/rs/api/generate.sh.
+
+/*
+ * rs_allocation_data.rsh: Allocation Data Access Functions
+ *
+ * The functions below can be used to get and set the cells that comprise
+ * an allocation.
+ *
+ * - Individual cells are accessed using the rsGetElementAt* and
+ *   rsSetElementAt* functions.
+ * - Multiple cells can be copied using the rsAllocationCopy* and
+ *   rsAllocationV* functions.
+ * - For getting values through a sampler, use rsSample.
+ *
+ * The rsGetElementAt and rsSetElement* functions are somewhat misnamed.
+ * They don't get or set elements, which are akin to data types; they get
+ * or set cells.  Think of them as rsGetCellAt and rsSetCellAt.
+ */
+
+#ifndef RENDERSCRIPT_RS_ALLOCATION_DATA_RSH
+#define RENDERSCRIPT_RS_ALLOCATION_DATA_RSH
+
+/*
+ * rsAllocationCopy1DRange: Copy consecutive cells between allocations
+ *
+ * Copies the specified number of cells from one allocation to another.
+ *
+ * The two allocations must be different.  Using this function to copy within
+ * the same allocation yields undefined results.
+ *
+ * The function does not validate whether the offset plus count exceeds the size
+ * of either allocation.  Be careful!
+ *
+ * This function should only be called between 1D allocations.  Calling it
+ * on other allocations is undefined.
+ *
+ * This function should not be called from inside a kernel, or from any function
+ * that may be called directly or indirectly from a kernel. Doing so would cause a
+ * runtime error.
+ *
+ * Parameters:
+ *   dstAlloc: Allocation to copy cells into.
+ *   dstOff: Offset in the destination of the first cell to be copied into.
+ *   dstMip: Mip level in the destination allocation.  0 if mip mapping is not used.
+ *   count: Number of cells to be copied.
+ *   srcAlloc: Source allocation.
+ *   srcOff: Offset in the source of the first cell to be copied.
+ *   srcMip: Mip level in the source allocation.  0 if mip mapping is not used.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+extern void __attribute__((overloadable))
+    rsAllocationCopy1DRange(rs_allocation dstAlloc, uint32_t dstOff, uint32_t dstMip, uint32_t count,
+                            rs_allocation srcAlloc, uint32_t srcOff, uint32_t srcMip);
+#endif
+
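+/*
+ * Illustrative usage sketch (not part of the generated header): copying the
+ * first 64 cells of one 1D allocation into another from invokable code.  The
+ * handles "srcAlloc64" and "dstAlloc64" are assumed names bound from the
+ * host side.
+ *
+ *   rs_allocation srcAlloc64;  // 1D allocation with at least 64 cells
+ *   rs_allocation dstAlloc64;  // 1D allocation with at least 64 cells
+ *
+ *   void copyFirst64Cells() {
+ *       // Copy cells [0, 63] of srcAlloc64 into cells [0, 63] of dstAlloc64,
+ *       // both at mip level 0.
+ *       rsAllocationCopy1DRange(dstAlloc64, 0, 0, 64, srcAlloc64, 0, 0);
+ *   }
+ */
+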
+/*
+ * rsAllocationCopy2DRange: Copy a rectangular region of cells between allocations
+ *
+ * Copies a rectangular region of cells from one allocation to another.
+ * (width * height) cells are copied.
+ *
+ * The two allocations must be different.  Using this function to copy within
+ * the same allocation yields undefined results.
+ *
+ * The function does not validate whether the source or destination region
+ * exceeds the size of its respective allocation.  Be careful!
+ *
+ * This function should only be called between 2D allocations.  Calling it
+ * on other allocations is undefined.
+ *
+ * This function should not be called from inside a kernel, or from any function
+ * that may be called directly or indirectly from a kernel. Doing so would cause a
+ * runtime error.
+ *
+ * Parameters:
+ *   dstAlloc: Allocation to copy cells into.
+ *   dstXoff: X offset in the destination of the region to be set.
+ *   dstYoff: Y offset in the destination of the region to be set.
+ *   dstMip: Mip level in the destination allocation.  0 if mip mapping is not used.
+ *   dstFace: Cubemap face of the destination allocation.  Ignored for allocations that aren't cubemaps.
+ *   width: Width of the incoming region to update.
+ *   height: Height of the incoming region to update.
+ *   srcAlloc: Source allocation.
+ *   srcXoff: X offset in the source.
+ *   srcYoff: Y offset in the source.
+ *   srcMip: Mip level in the source allocation.  0 if mip mapping is not used.
+ *   srcFace: Cubemap face of the source allocation.  Ignored for allocations that aren't cubemaps.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+extern void __attribute__((overloadable))
+    rsAllocationCopy2DRange(rs_allocation dstAlloc, uint32_t dstXoff, uint32_t dstYoff,
+                            uint32_t dstMip, rs_allocation_cubemap_face dstFace, uint32_t width,
+                            uint32_t height, rs_allocation srcAlloc, uint32_t srcXoff,
+                            uint32_t srcYoff, uint32_t srcMip, rs_allocation_cubemap_face srcFace);
+#endif
+
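+/*
+ * Illustrative usage sketch (not part of the generated header): copying a
+ * 16x16 tile between two 2D allocations from invokable code.  The handles
+ * "srcAlloc2D" and "dstAlloc2D" and the offsets are assumed for illustration;
+ * the cubemap face arguments are ignored for non-cubemap allocations.
+ *
+ *   rs_allocation srcAlloc2D;  // 2D source, at least 48x48 cells
+ *   rs_allocation dstAlloc2D;  // 2D destination, at least 16x16 cells
+ *
+ *   void copyTile() {
+ *       // Copy the 16x16 region starting at (32, 32) in the source to
+ *       // (0, 0) in the destination, both at mip level 0.
+ *       rsAllocationCopy2DRange(dstAlloc2D, 0, 0, 0,
+ *                               RS_ALLOCATION_CUBEMAP_FACE_POSITIVE_X, 16, 16,
+ *                               srcAlloc2D, 32, 32, 0,
+ *                               RS_ALLOCATION_CUBEMAP_FACE_POSITIVE_X);
+ *   }
+ */
+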
+/*
+ * rsAllocationVLoadX: Get a vector from an allocation of scalars
+ *
+ * This function returns a vector composed of successive cells of the allocation.
+ * It assumes that the allocation contains scalars.
+ *
+ * The "X" in the name indicates that successive values are extracted by
+ * increasing the X index.  There are currently no functions to get successive
+ * values incrementing other dimensions.  Use multiple calls to rsGetElementAt()
+ * instead.
+ *
+ * For example, when calling rsAllocationVLoadX_int4(a, 20, 30), an int4 composed
+ * of a[20, 30], a[21, 30], a[22, 30], and a[23, 30] is returned.
+ *
+ * When retrieving from a three dimensional allocation, use the x, y, z variant.
+ * Similarly, use the x, y variant for two dimensional allocations and x for the
+ * mono dimensional allocations.
+ *
+ * For efficiency, this function does not validate the inputs.  Trying to wrap
+ * the X index, exceeding the size of the allocation, or using indices incompatible
+ * with the dimensionality of the allocation yields undefined results.
+ *
+ * See also rsAllocationVStoreX().
+ *
+ * Parameters:
+ *   a: Allocation to get the data from.
+ *   x: X offset in the allocation of the first cell to be copied from.
+ *   y: Y offset in the allocation of the first cell to be copied from.
+ *   z: Z offset in the allocation of the first cell to be copied from.
+ */
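+/*
+ * Illustrative sketch (not part of the generated header): the prose example
+ * above, written out for a 2D allocation of 32-bit ints named "intAlloc2D"
+ * (an assumed handle).
+ *
+ *   // Reads intAlloc2D[20, 30], [21, 30], [22, 30], and [23, 30] in one call,
+ *   // equivalent to four separate rsGetElementAt_int() calls.
+ *   int4 v = rsAllocationVLoadX_int4(intAlloc2D, 20, 30);
+ */
+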
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern float2 __attribute__((overloadable))
+    rsAllocationVLoadX_float2(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern float3 __attribute__((overloadable))
+    rsAllocationVLoadX_float3(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern float4 __attribute__((overloadable))
+    rsAllocationVLoadX_float4(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern double2 __attribute__((overloadable))
+    rsAllocationVLoadX_double2(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern double3 __attribute__((overloadable))
+    rsAllocationVLoadX_double3(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern double4 __attribute__((overloadable))
+    rsAllocationVLoadX_double4(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern char2 __attribute__((overloadable))
+    rsAllocationVLoadX_char2(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern char3 __attribute__((overloadable))
+    rsAllocationVLoadX_char3(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern char4 __attribute__((overloadable))
+    rsAllocationVLoadX_char4(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern uchar2 __attribute__((overloadable))
+    rsAllocationVLoadX_uchar2(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern uchar3 __attribute__((overloadable))
+    rsAllocationVLoadX_uchar3(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern uchar4 __attribute__((overloadable))
+    rsAllocationVLoadX_uchar4(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern short2 __attribute__((overloadable))
+    rsAllocationVLoadX_short2(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern short3 __attribute__((overloadable))
+    rsAllocationVLoadX_short3(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern short4 __attribute__((overloadable))
+    rsAllocationVLoadX_short4(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern ushort2 __attribute__((overloadable))
+    rsAllocationVLoadX_ushort2(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern ushort3 __attribute__((overloadable))
+    rsAllocationVLoadX_ushort3(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern ushort4 __attribute__((overloadable))
+    rsAllocationVLoadX_ushort4(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern int2 __attribute__((overloadable))
+    rsAllocationVLoadX_int2(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern int3 __attribute__((overloadable))
+    rsAllocationVLoadX_int3(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern int4 __attribute__((overloadable))
+    rsAllocationVLoadX_int4(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern uint2 __attribute__((overloadable))
+    rsAllocationVLoadX_uint2(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern uint3 __attribute__((overloadable))
+    rsAllocationVLoadX_uint3(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern uint4 __attribute__((overloadable))
+    rsAllocationVLoadX_uint4(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern long2 __attribute__((overloadable))
+    rsAllocationVLoadX_long2(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern long3 __attribute__((overloadable))
+    rsAllocationVLoadX_long3(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern long4 __attribute__((overloadable))
+    rsAllocationVLoadX_long4(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern ulong2 __attribute__((overloadable))
+    rsAllocationVLoadX_ulong2(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern ulong3 __attribute__((overloadable))
+    rsAllocationVLoadX_ulong3(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern ulong4 __attribute__((overloadable))
+    rsAllocationVLoadX_ulong4(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern float2 __attribute__((overloadable))
+    rsAllocationVLoadX_float2(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern float3 __attribute__((overloadable))
+    rsAllocationVLoadX_float3(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern float4 __attribute__((overloadable))
+    rsAllocationVLoadX_float4(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern double2 __attribute__((overloadable))
+    rsAllocationVLoadX_double2(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern double3 __attribute__((overloadable))
+    rsAllocationVLoadX_double3(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern double4 __attribute__((overloadable))
+    rsAllocationVLoadX_double4(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern char2 __attribute__((overloadable))
+    rsAllocationVLoadX_char2(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern char3 __attribute__((overloadable))
+    rsAllocationVLoadX_char3(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern char4 __attribute__((overloadable))
+    rsAllocationVLoadX_char4(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern uchar2 __attribute__((overloadable))
+    rsAllocationVLoadX_uchar2(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern uchar3 __attribute__((overloadable))
+    rsAllocationVLoadX_uchar3(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern uchar4 __attribute__((overloadable))
+    rsAllocationVLoadX_uchar4(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern short2 __attribute__((overloadable))
+    rsAllocationVLoadX_short2(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern short3 __attribute__((overloadable))
+    rsAllocationVLoadX_short3(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern short4 __attribute__((overloadable))
+    rsAllocationVLoadX_short4(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern ushort2 __attribute__((overloadable))
+    rsAllocationVLoadX_ushort2(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern ushort3 __attribute__((overloadable))
+    rsAllocationVLoadX_ushort3(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern ushort4 __attribute__((overloadable))
+    rsAllocationVLoadX_ushort4(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern int2 __attribute__((overloadable))
+    rsAllocationVLoadX_int2(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern int3 __attribute__((overloadable))
+    rsAllocationVLoadX_int3(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern int4 __attribute__((overloadable))
+    rsAllocationVLoadX_int4(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern uint2 __attribute__((overloadable))
+    rsAllocationVLoadX_uint2(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern uint3 __attribute__((overloadable))
+    rsAllocationVLoadX_uint3(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern uint4 __attribute__((overloadable))
+    rsAllocationVLoadX_uint4(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern long2 __attribute__((overloadable))
+    rsAllocationVLoadX_long2(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern long3 __attribute__((overloadable))
+    rsAllocationVLoadX_long3(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern long4 __attribute__((overloadable))
+    rsAllocationVLoadX_long4(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern ulong2 __attribute__((overloadable))
+    rsAllocationVLoadX_ulong2(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern ulong3 __attribute__((overloadable))
+    rsAllocationVLoadX_ulong3(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern ulong4 __attribute__((overloadable))
+    rsAllocationVLoadX_ulong4(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern float2 __attribute__((overloadable))
+    rsAllocationVLoadX_float2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern float3 __attribute__((overloadable))
+    rsAllocationVLoadX_float3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern float4 __attribute__((overloadable))
+    rsAllocationVLoadX_float4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern double2 __attribute__((overloadable))
+    rsAllocationVLoadX_double2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern double3 __attribute__((overloadable))
+    rsAllocationVLoadX_double3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern double4 __attribute__((overloadable))
+    rsAllocationVLoadX_double4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern char2 __attribute__((overloadable))
+    rsAllocationVLoadX_char2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern char3 __attribute__((overloadable))
+    rsAllocationVLoadX_char3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern char4 __attribute__((overloadable))
+    rsAllocationVLoadX_char4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern uchar2 __attribute__((overloadable))
+    rsAllocationVLoadX_uchar2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern uchar3 __attribute__((overloadable))
+    rsAllocationVLoadX_uchar3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern uchar4 __attribute__((overloadable))
+    rsAllocationVLoadX_uchar4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern short2 __attribute__((overloadable))
+    rsAllocationVLoadX_short2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern short3 __attribute__((overloadable))
+    rsAllocationVLoadX_short3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern short4 __attribute__((overloadable))
+    rsAllocationVLoadX_short4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern ushort2 __attribute__((overloadable))
+    rsAllocationVLoadX_ushort2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern ushort3 __attribute__((overloadable))
+    rsAllocationVLoadX_ushort3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern ushort4 __attribute__((overloadable))
+    rsAllocationVLoadX_ushort4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern int2 __attribute__((overloadable))
+    rsAllocationVLoadX_int2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern int3 __attribute__((overloadable))
+    rsAllocationVLoadX_int3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern int4 __attribute__((overloadable))
+    rsAllocationVLoadX_int4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern uint2 __attribute__((overloadable))
+    rsAllocationVLoadX_uint2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern uint3 __attribute__((overloadable))
+    rsAllocationVLoadX_uint3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern uint4 __attribute__((overloadable))
+    rsAllocationVLoadX_uint4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern long2 __attribute__((overloadable))
+    rsAllocationVLoadX_long2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern long3 __attribute__((overloadable))
+    rsAllocationVLoadX_long3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern long4 __attribute__((overloadable))
+    rsAllocationVLoadX_long4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern ulong2 __attribute__((overloadable))
+    rsAllocationVLoadX_ulong2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern ulong3 __attribute__((overloadable))
+    rsAllocationVLoadX_ulong3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern ulong4 __attribute__((overloadable))
+    rsAllocationVLoadX_ulong4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+/*
+ * rsAllocationVStoreX: Store a vector into an allocation of scalars
+ *
+ * This function stores the entries of a vector into successive cells of an allocation.
+ * It assumes that the allocation contains scalars.
+ *
+ * The "X" in the name indicates that successive values are stored by increasing
+ * the X index.  There are currently no functions to store successive values
+ * incrementing other dimensions.  Use multiple calls to rsSetElementAt() instead.
+ *
+ * For example, when calling rsAllocationVStoreX_int3(a, v, 20, 30), v.x is stored
+ * at a[20, 30], v.y at a[21, 30], and v.z at a[22, 30].
+ *
+ * When storing into a three dimensional allocation, use the x, y, z variant.
+ * Similarly, use the x, y variant for two dimensional allocations and x for the
+ * mono dimensional allocations.
+ *
+ * For efficiency, this function does not validate the inputs.  Trying to wrap the
+ * X index, exceeding the size of the allocation, or using indices incompatible
+ * with the dimensionality of the allocation yields undefined results.
+ *
+ * See also rsAllocationVLoadX().
+ *
+ * Parameters:
+ *   a: Allocation to store the data into.
+ *   val: Value to be stored.
+ *   x: X offset in the allocation of the first cell to be copied into.
+ *   y: Y offset in the allocation of the first cell to be copied into.
+ *   z: Z offset in the allocation of the first cell to be copied into.
+ */
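+/*
+ * Illustrative sketch (not part of the generated header): the prose example
+ * above, written out for a 2D allocation of 32-bit ints named "intAlloc2D"
+ * (an assumed handle).
+ *
+ *   int3 v = {1, 2, 3};
+ *   // Writes v.x to intAlloc2D[20, 30], v.y to [21, 30], and v.z to [22, 30].
+ *   rsAllocationVStoreX_int3(intAlloc2D, v, 20, 30);
+ */
+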
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_float2(rs_allocation a, float2 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_float3(rs_allocation a, float3 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_float4(rs_allocation a, float4 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_double2(rs_allocation a, double2 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_double3(rs_allocation a, double3 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_double4(rs_allocation a, double4 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_char2(rs_allocation a, char2 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_char3(rs_allocation a, char3 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_char4(rs_allocation a, char4 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_uchar2(rs_allocation a, uchar2 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_uchar3(rs_allocation a, uchar3 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_uchar4(rs_allocation a, uchar4 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_short2(rs_allocation a, short2 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_short3(rs_allocation a, short3 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_short4(rs_allocation a, short4 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_ushort2(rs_allocation a, ushort2 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_ushort3(rs_allocation a, ushort3 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_ushort4(rs_allocation a, ushort4 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_int2(rs_allocation a, int2 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_int3(rs_allocation a, int3 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_int4(rs_allocation a, int4 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_uint2(rs_allocation a, uint2 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_uint3(rs_allocation a, uint3 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_uint4(rs_allocation a, uint4 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_long2(rs_allocation a, long2 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_long3(rs_allocation a, long3 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_long4(rs_allocation a, long4 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_ulong2(rs_allocation a, ulong2 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_ulong3(rs_allocation a, ulong3 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_ulong4(rs_allocation a, ulong4 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_float2(rs_allocation a, float2 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_float3(rs_allocation a, float3 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_float4(rs_allocation a, float4 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_double2(rs_allocation a, double2 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_double3(rs_allocation a, double3 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_double4(rs_allocation a, double4 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_char2(rs_allocation a, char2 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_char3(rs_allocation a, char3 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_char4(rs_allocation a, char4 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_uchar2(rs_allocation a, uchar2 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_uchar3(rs_allocation a, uchar3 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_uchar4(rs_allocation a, uchar4 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_short2(rs_allocation a, short2 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_short3(rs_allocation a, short3 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_short4(rs_allocation a, short4 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_ushort2(rs_allocation a, ushort2 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_ushort3(rs_allocation a, ushort3 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_ushort4(rs_allocation a, ushort4 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_int2(rs_allocation a, int2 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_int3(rs_allocation a, int3 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_int4(rs_allocation a, int4 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_uint2(rs_allocation a, uint2 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_uint3(rs_allocation a, uint3 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_uint4(rs_allocation a, uint4 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_long2(rs_allocation a, long2 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_long3(rs_allocation a, long3 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_long4(rs_allocation a, long4 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_ulong2(rs_allocation a, ulong2 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_ulong3(rs_allocation a, ulong3 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_ulong4(rs_allocation a, ulong4 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_float2(rs_allocation a, float2 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_float3(rs_allocation a, float3 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_float4(rs_allocation a, float4 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_double2(rs_allocation a, double2 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_double3(rs_allocation a, double3 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_double4(rs_allocation a, double4 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_char2(rs_allocation a, char2 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_char3(rs_allocation a, char3 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_char4(rs_allocation a, char4 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_uchar2(rs_allocation a, uchar2 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_uchar3(rs_allocation a, uchar3 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_uchar4(rs_allocation a, uchar4 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_short2(rs_allocation a, short2 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_short3(rs_allocation a, short3 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_short4(rs_allocation a, short4 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_ushort2(rs_allocation a, ushort2 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_ushort3(rs_allocation a, ushort3 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_ushort4(rs_allocation a, ushort4 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_int2(rs_allocation a, int2 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_int3(rs_allocation a, int3 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_int4(rs_allocation a, int4 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_uint2(rs_allocation a, uint2 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_uint3(rs_allocation a, uint3 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_uint4(rs_allocation a, uint4 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_long2(rs_allocation a, long2 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_long3(rs_allocation a, long3 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_long4(rs_allocation a, long4 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_ulong2(rs_allocation a, ulong2 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_ulong3(rs_allocation a, ulong3 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_ulong4(rs_allocation a, ulong4 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+/*
+ * rsGetElementAt: Return a cell from an allocation
+ *
+ * This function extracts a single cell from an allocation.
+ *
+ * When retrieving from a three dimensional allocation, use the x, y, z variant.
+ * Similarly, use the x, y variant for two dimensional allocations and x for the
+ * mono dimensional allocations.
+ *
+ * This function has two styles.  One returns the address of the value using a void*,
+ * the other returns the actual value, e.g. rsGetElementAt() vs. rsGetElementAt_int4().
+ * For primitive types, always use the latter as it is more efficient.
+ */
+extern const void* __attribute__((overloadable))
+    rsGetElementAt(rs_allocation a, uint32_t x);
+
+extern const void* __attribute__((overloadable))
+    rsGetElementAt(rs_allocation a, uint32_t x, uint32_t y);
+
+extern const void* __attribute__((overloadable))
+    rsGetElementAt(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+
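+/*
+ * Illustrative sketch (not part of the generated header): the two access
+ * styles for a 1D allocation of floats named "floatAlloc" (an assumed handle).
+ *
+ *   // Typed style: preferred for primitive types.
+ *   float f = rsGetElementAt_float(floatAlloc, 5);
+ *
+ *   // Pointer style: returns the address of the cell, which is useful for
+ *   // user-defined element types.
+ *   const float* p = (const float*)rsGetElementAt(floatAlloc, 5);
+ *   float g = *p;
+ */
+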
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline float __attribute__((overloadable))
+    rsGetElementAt_float(rs_allocation a, uint32_t x) {
+    return ((float *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline float2 __attribute__((overloadable))
+    rsGetElementAt_float2(rs_allocation a, uint32_t x) {
+    return ((float2 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline float3 __attribute__((overloadable))
+    rsGetElementAt_float3(rs_allocation a, uint32_t x) {
+    return ((float3 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline float4 __attribute__((overloadable))
+    rsGetElementAt_float4(rs_allocation a, uint32_t x) {
+    return ((float4 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline double __attribute__((overloadable))
+    rsGetElementAt_double(rs_allocation a, uint32_t x) {
+    return ((double *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline double2 __attribute__((overloadable))
+    rsGetElementAt_double2(rs_allocation a, uint32_t x) {
+    return ((double2 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline double3 __attribute__((overloadable))
+    rsGetElementAt_double3(rs_allocation a, uint32_t x) {
+    return ((double3 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline double4 __attribute__((overloadable))
+    rsGetElementAt_double4(rs_allocation a, uint32_t x) {
+    return ((double4 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline char __attribute__((overloadable))
+    rsGetElementAt_char(rs_allocation a, uint32_t x) {
+    return ((char *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline char2 __attribute__((overloadable))
+    rsGetElementAt_char2(rs_allocation a, uint32_t x) {
+    return ((char2 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline char3 __attribute__((overloadable))
+    rsGetElementAt_char3(rs_allocation a, uint32_t x) {
+    return ((char3 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline char4 __attribute__((overloadable))
+    rsGetElementAt_char4(rs_allocation a, uint32_t x) {
+    return ((char4 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uchar __attribute__((overloadable))
+    rsGetElementAt_uchar(rs_allocation a, uint32_t x) {
+    return ((uchar *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uchar2 __attribute__((overloadable))
+    rsGetElementAt_uchar2(rs_allocation a, uint32_t x) {
+    return ((uchar2 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uchar3 __attribute__((overloadable))
+    rsGetElementAt_uchar3(rs_allocation a, uint32_t x) {
+    return ((uchar3 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uchar4 __attribute__((overloadable))
+    rsGetElementAt_uchar4(rs_allocation a, uint32_t x) {
+    return ((uchar4 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline short __attribute__((overloadable))
+    rsGetElementAt_short(rs_allocation a, uint32_t x) {
+    return ((short *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline short2 __attribute__((overloadable))
+    rsGetElementAt_short2(rs_allocation a, uint32_t x) {
+    return ((short2 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline short3 __attribute__((overloadable))
+    rsGetElementAt_short3(rs_allocation a, uint32_t x) {
+    return ((short3 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline short4 __attribute__((overloadable))
+    rsGetElementAt_short4(rs_allocation a, uint32_t x) {
+    return ((short4 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ushort __attribute__((overloadable))
+    rsGetElementAt_ushort(rs_allocation a, uint32_t x) {
+    return ((ushort *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ushort2 __attribute__((overloadable))
+    rsGetElementAt_ushort2(rs_allocation a, uint32_t x) {
+    return ((ushort2 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ushort3 __attribute__((overloadable))
+    rsGetElementAt_ushort3(rs_allocation a, uint32_t x) {
+    return ((ushort3 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ushort4 __attribute__((overloadable))
+    rsGetElementAt_ushort4(rs_allocation a, uint32_t x) {
+    return ((ushort4 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline int __attribute__((overloadable))
+    rsGetElementAt_int(rs_allocation a, uint32_t x) {
+    return ((int *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline int2 __attribute__((overloadable))
+    rsGetElementAt_int2(rs_allocation a, uint32_t x) {
+    return ((int2 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline int3 __attribute__((overloadable))
+    rsGetElementAt_int3(rs_allocation a, uint32_t x) {
+    return ((int3 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline int4 __attribute__((overloadable))
+    rsGetElementAt_int4(rs_allocation a, uint32_t x) {
+    return ((int4 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uint __attribute__((overloadable))
+    rsGetElementAt_uint(rs_allocation a, uint32_t x) {
+    return ((uint *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uint2 __attribute__((overloadable))
+    rsGetElementAt_uint2(rs_allocation a, uint32_t x) {
+    return ((uint2 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uint3 __attribute__((overloadable))
+    rsGetElementAt_uint3(rs_allocation a, uint32_t x) {
+    return ((uint3 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uint4 __attribute__((overloadable))
+    rsGetElementAt_uint4(rs_allocation a, uint32_t x) {
+    return ((uint4 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline long __attribute__((overloadable))
+    rsGetElementAt_long(rs_allocation a, uint32_t x) {
+    return ((long *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline long2 __attribute__((overloadable))
+    rsGetElementAt_long2(rs_allocation a, uint32_t x) {
+    return ((long2 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline long3 __attribute__((overloadable))
+    rsGetElementAt_long3(rs_allocation a, uint32_t x) {
+    return ((long3 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline long4 __attribute__((overloadable))
+    rsGetElementAt_long4(rs_allocation a, uint32_t x) {
+    return ((long4 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ulong __attribute__((overloadable))
+    rsGetElementAt_ulong(rs_allocation a, uint32_t x) {
+    return ((ulong *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ulong2 __attribute__((overloadable))
+    rsGetElementAt_ulong2(rs_allocation a, uint32_t x) {
+    return ((ulong2 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ulong3 __attribute__((overloadable))
+    rsGetElementAt_ulong3(rs_allocation a, uint32_t x) {
+    return ((ulong3 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ulong4 __attribute__((overloadable))
+    rsGetElementAt_ulong4(rs_allocation a, uint32_t x) {
+    return ((ulong4 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline float __attribute__((overloadable))
+    rsGetElementAt_float(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((float *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline float2 __attribute__((overloadable))
+    rsGetElementAt_float2(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((float2 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline float3 __attribute__((overloadable))
+    rsGetElementAt_float3(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((float3 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline float4 __attribute__((overloadable))
+    rsGetElementAt_float4(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((float4 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline double __attribute__((overloadable))
+    rsGetElementAt_double(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((double *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline double2 __attribute__((overloadable))
+    rsGetElementAt_double2(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((double2 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline double3 __attribute__((overloadable))
+    rsGetElementAt_double3(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((double3 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline double4 __attribute__((overloadable))
+    rsGetElementAt_double4(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((double4 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline char __attribute__((overloadable))
+    rsGetElementAt_char(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((char *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline char2 __attribute__((overloadable))
+    rsGetElementAt_char2(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((char2 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline char3 __attribute__((overloadable))
+    rsGetElementAt_char3(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((char3 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline char4 __attribute__((overloadable))
+    rsGetElementAt_char4(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((char4 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uchar __attribute__((overloadable))
+    rsGetElementAt_uchar(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((uchar *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uchar2 __attribute__((overloadable))
+    rsGetElementAt_uchar2(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((uchar2 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uchar3 __attribute__((overloadable))
+    rsGetElementAt_uchar3(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((uchar3 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uchar4 __attribute__((overloadable))
+    rsGetElementAt_uchar4(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((uchar4 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline short __attribute__((overloadable))
+    rsGetElementAt_short(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((short *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline short2 __attribute__((overloadable))
+    rsGetElementAt_short2(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((short2 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline short3 __attribute__((overloadable))
+    rsGetElementAt_short3(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((short3 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline short4 __attribute__((overloadable))
+    rsGetElementAt_short4(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((short4 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ushort __attribute__((overloadable))
+    rsGetElementAt_ushort(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((ushort *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ushort2 __attribute__((overloadable))
+    rsGetElementAt_ushort2(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((ushort2 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ushort3 __attribute__((overloadable))
+    rsGetElementAt_ushort3(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((ushort3 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ushort4 __attribute__((overloadable))
+    rsGetElementAt_ushort4(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((ushort4 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline int __attribute__((overloadable))
+    rsGetElementAt_int(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((int *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline int2 __attribute__((overloadable))
+    rsGetElementAt_int2(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((int2 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline int3 __attribute__((overloadable))
+    rsGetElementAt_int3(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((int3 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline int4 __attribute__((overloadable))
+    rsGetElementAt_int4(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((int4 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uint __attribute__((overloadable))
+    rsGetElementAt_uint(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((uint *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uint2 __attribute__((overloadable))
+    rsGetElementAt_uint2(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((uint2 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uint3 __attribute__((overloadable))
+    rsGetElementAt_uint3(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((uint3 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uint4 __attribute__((overloadable))
+    rsGetElementAt_uint4(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((uint4 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline long __attribute__((overloadable))
+    rsGetElementAt_long(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((long *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline long2 __attribute__((overloadable))
+    rsGetElementAt_long2(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((long2 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline long3 __attribute__((overloadable))
+    rsGetElementAt_long3(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((long3 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline long4 __attribute__((overloadable))
+    rsGetElementAt_long4(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((long4 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ulong __attribute__((overloadable))
+    rsGetElementAt_ulong(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((ulong *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ulong2 __attribute__((overloadable))
+    rsGetElementAt_ulong2(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((ulong2 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ulong3 __attribute__((overloadable))
+    rsGetElementAt_ulong3(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((ulong3 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ulong4 __attribute__((overloadable))
+    rsGetElementAt_ulong4(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((ulong4 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline float __attribute__((overloadable))
+    rsGetElementAt_float(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((float *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline float2 __attribute__((overloadable))
+    rsGetElementAt_float2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((float2 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline float3 __attribute__((overloadable))
+    rsGetElementAt_float3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((float3 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline float4 __attribute__((overloadable))
+    rsGetElementAt_float4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((float4 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline double __attribute__((overloadable))
+    rsGetElementAt_double(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((double *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline double2 __attribute__((overloadable))
+    rsGetElementAt_double2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((double2 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline double3 __attribute__((overloadable))
+    rsGetElementAt_double3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((double3 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline double4 __attribute__((overloadable))
+    rsGetElementAt_double4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((double4 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline char __attribute__((overloadable))
+    rsGetElementAt_char(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((char *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline char2 __attribute__((overloadable))
+    rsGetElementAt_char2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((char2 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline char3 __attribute__((overloadable))
+    rsGetElementAt_char3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((char3 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline char4 __attribute__((overloadable))
+    rsGetElementAt_char4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((char4 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uchar __attribute__((overloadable))
+    rsGetElementAt_uchar(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((uchar *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uchar2 __attribute__((overloadable))
+    rsGetElementAt_uchar2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((uchar2 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uchar3 __attribute__((overloadable))
+    rsGetElementAt_uchar3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((uchar3 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uchar4 __attribute__((overloadable))
+    rsGetElementAt_uchar4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((uchar4 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline short __attribute__((overloadable))
+    rsGetElementAt_short(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((short *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline short2 __attribute__((overloadable))
+    rsGetElementAt_short2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((short2 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline short3 __attribute__((overloadable))
+    rsGetElementAt_short3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((short3 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline short4 __attribute__((overloadable))
+    rsGetElementAt_short4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((short4 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ushort __attribute__((overloadable))
+    rsGetElementAt_ushort(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((ushort *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ushort2 __attribute__((overloadable))
+    rsGetElementAt_ushort2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((ushort2 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ushort3 __attribute__((overloadable))
+    rsGetElementAt_ushort3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((ushort3 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ushort4 __attribute__((overloadable))
+    rsGetElementAt_ushort4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((ushort4 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline int __attribute__((overloadable))
+    rsGetElementAt_int(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((int *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline int2 __attribute__((overloadable))
+    rsGetElementAt_int2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((int2 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline int3 __attribute__((overloadable))
+    rsGetElementAt_int3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((int3 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline int4 __attribute__((overloadable))
+    rsGetElementAt_int4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((int4 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uint __attribute__((overloadable))
+    rsGetElementAt_uint(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((uint *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uint2 __attribute__((overloadable))
+    rsGetElementAt_uint2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((uint2 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uint3 __attribute__((overloadable))
+    rsGetElementAt_uint3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((uint3 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uint4 __attribute__((overloadable))
+    rsGetElementAt_uint4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((uint4 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline long __attribute__((overloadable))
+    rsGetElementAt_long(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((long *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline long2 __attribute__((overloadable))
+    rsGetElementAt_long2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((long2 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline long3 __attribute__((overloadable))
+    rsGetElementAt_long3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((long3 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline long4 __attribute__((overloadable))
+    rsGetElementAt_long4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((long4 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ulong __attribute__((overloadable))
+    rsGetElementAt_ulong(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((ulong *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ulong2 __attribute__((overloadable))
+    rsGetElementAt_ulong2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((ulong2 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ulong3 __attribute__((overloadable))
+    rsGetElementAt_ulong3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((ulong3 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ulong4 __attribute__((overloadable))
+    rsGetElementAt_ulong4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((ulong4 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float __attribute__((overloadable))
+    rsGetElementAt_float(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float2 __attribute__((overloadable))
+    rsGetElementAt_float2(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float3 __attribute__((overloadable))
+    rsGetElementAt_float3(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float4 __attribute__((overloadable))
+    rsGetElementAt_float4(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern double __attribute__((overloadable))
+    rsGetElementAt_double(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern double2 __attribute__((overloadable))
+    rsGetElementAt_double2(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern double3 __attribute__((overloadable))
+    rsGetElementAt_double3(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern double4 __attribute__((overloadable))
+    rsGetElementAt_double4(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern char __attribute__((overloadable))
+    rsGetElementAt_char(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern char2 __attribute__((overloadable))
+    rsGetElementAt_char2(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern char3 __attribute__((overloadable))
+    rsGetElementAt_char3(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern char4 __attribute__((overloadable))
+    rsGetElementAt_char4(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uchar __attribute__((overloadable))
+    rsGetElementAt_uchar(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uchar2 __attribute__((overloadable))
+    rsGetElementAt_uchar2(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uchar3 __attribute__((overloadable))
+    rsGetElementAt_uchar3(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uchar4 __attribute__((overloadable))
+    rsGetElementAt_uchar4(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern short __attribute__((overloadable))
+    rsGetElementAt_short(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern short2 __attribute__((overloadable))
+    rsGetElementAt_short2(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern short3 __attribute__((overloadable))
+    rsGetElementAt_short3(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern short4 __attribute__((overloadable))
+    rsGetElementAt_short4(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ushort __attribute__((overloadable))
+    rsGetElementAt_ushort(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ushort2 __attribute__((overloadable))
+    rsGetElementAt_ushort2(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ushort3 __attribute__((overloadable))
+    rsGetElementAt_ushort3(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ushort4 __attribute__((overloadable))
+    rsGetElementAt_ushort4(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern int __attribute__((overloadable))
+    rsGetElementAt_int(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern int2 __attribute__((overloadable))
+    rsGetElementAt_int2(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern int3 __attribute__((overloadable))
+    rsGetElementAt_int3(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern int4 __attribute__((overloadable))
+    rsGetElementAt_int4(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uint __attribute__((overloadable))
+    rsGetElementAt_uint(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uint2 __attribute__((overloadable))
+    rsGetElementAt_uint2(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uint3 __attribute__((overloadable))
+    rsGetElementAt_uint3(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uint4 __attribute__((overloadable))
+    rsGetElementAt_uint4(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern long __attribute__((overloadable))
+    rsGetElementAt_long(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern long2 __attribute__((overloadable))
+    rsGetElementAt_long2(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern long3 __attribute__((overloadable))
+    rsGetElementAt_long3(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern long4 __attribute__((overloadable))
+    rsGetElementAt_long4(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ulong __attribute__((overloadable))
+    rsGetElementAt_ulong(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ulong2 __attribute__((overloadable))
+    rsGetElementAt_ulong2(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ulong3 __attribute__((overloadable))
+    rsGetElementAt_ulong3(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ulong4 __attribute__((overloadable))
+    rsGetElementAt_ulong4(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float __attribute__((overloadable))
+    rsGetElementAt_float(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float2 __attribute__((overloadable))
+    rsGetElementAt_float2(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float3 __attribute__((overloadable))
+    rsGetElementAt_float3(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float4 __attribute__((overloadable))
+    rsGetElementAt_float4(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern double __attribute__((overloadable))
+    rsGetElementAt_double(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern double2 __attribute__((overloadable))
+    rsGetElementAt_double2(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern double3 __attribute__((overloadable))
+    rsGetElementAt_double3(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern double4 __attribute__((overloadable))
+    rsGetElementAt_double4(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern char __attribute__((overloadable))
+    rsGetElementAt_char(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern char2 __attribute__((overloadable))
+    rsGetElementAt_char2(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern char3 __attribute__((overloadable))
+    rsGetElementAt_char3(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern char4 __attribute__((overloadable))
+    rsGetElementAt_char4(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uchar __attribute__((overloadable))
+    rsGetElementAt_uchar(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uchar2 __attribute__((overloadable))
+    rsGetElementAt_uchar2(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uchar3 __attribute__((overloadable))
+    rsGetElementAt_uchar3(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uchar4 __attribute__((overloadable))
+    rsGetElementAt_uchar4(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern short __attribute__((overloadable))
+    rsGetElementAt_short(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern short2 __attribute__((overloadable))
+    rsGetElementAt_short2(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern short3 __attribute__((overloadable))
+    rsGetElementAt_short3(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern short4 __attribute__((overloadable))
+    rsGetElementAt_short4(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ushort __attribute__((overloadable))
+    rsGetElementAt_ushort(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ushort2 __attribute__((overloadable))
+    rsGetElementAt_ushort2(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ushort3 __attribute__((overloadable))
+    rsGetElementAt_ushort3(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ushort4 __attribute__((overloadable))
+    rsGetElementAt_ushort4(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern int __attribute__((overloadable))
+    rsGetElementAt_int(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern int2 __attribute__((overloadable))
+    rsGetElementAt_int2(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern int3 __attribute__((overloadable))
+    rsGetElementAt_int3(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern int4 __attribute__((overloadable))
+    rsGetElementAt_int4(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uint __attribute__((overloadable))
+    rsGetElementAt_uint(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uint2 __attribute__((overloadable))
+    rsGetElementAt_uint2(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uint3 __attribute__((overloadable))
+    rsGetElementAt_uint3(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uint4 __attribute__((overloadable))
+    rsGetElementAt_uint4(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern long __attribute__((overloadable))
+    rsGetElementAt_long(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern long2 __attribute__((overloadable))
+    rsGetElementAt_long2(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern long3 __attribute__((overloadable))
+    rsGetElementAt_long3(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern long4 __attribute__((overloadable))
+    rsGetElementAt_long4(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ulong __attribute__((overloadable))
+    rsGetElementAt_ulong(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ulong2 __attribute__((overloadable))
+    rsGetElementAt_ulong2(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ulong3 __attribute__((overloadable))
+    rsGetElementAt_ulong3(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ulong4 __attribute__((overloadable))
+    rsGetElementAt_ulong4(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float __attribute__((overloadable))
+    rsGetElementAt_float(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float2 __attribute__((overloadable))
+    rsGetElementAt_float2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float3 __attribute__((overloadable))
+    rsGetElementAt_float3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float4 __attribute__((overloadable))
+    rsGetElementAt_float4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern double __attribute__((overloadable))
+    rsGetElementAt_double(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern double2 __attribute__((overloadable))
+    rsGetElementAt_double2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern double3 __attribute__((overloadable))
+    rsGetElementAt_double3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern double4 __attribute__((overloadable))
+    rsGetElementAt_double4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern char __attribute__((overloadable))
+    rsGetElementAt_char(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern char2 __attribute__((overloadable))
+    rsGetElementAt_char2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern char3 __attribute__((overloadable))
+    rsGetElementAt_char3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern char4 __attribute__((overloadable))
+    rsGetElementAt_char4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uchar __attribute__((overloadable))
+    rsGetElementAt_uchar(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uchar2 __attribute__((overloadable))
+    rsGetElementAt_uchar2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uchar3 __attribute__((overloadable))
+    rsGetElementAt_uchar3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uchar4 __attribute__((overloadable))
+    rsGetElementAt_uchar4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern short __attribute__((overloadable))
+    rsGetElementAt_short(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern short2 __attribute__((overloadable))
+    rsGetElementAt_short2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern short3 __attribute__((overloadable))
+    rsGetElementAt_short3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern short4 __attribute__((overloadable))
+    rsGetElementAt_short4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ushort __attribute__((overloadable))
+    rsGetElementAt_ushort(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ushort2 __attribute__((overloadable))
+    rsGetElementAt_ushort2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ushort3 __attribute__((overloadable))
+    rsGetElementAt_ushort3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ushort4 __attribute__((overloadable))
+    rsGetElementAt_ushort4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern int __attribute__((overloadable))
+    rsGetElementAt_int(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern int2 __attribute__((overloadable))
+    rsGetElementAt_int2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern int3 __attribute__((overloadable))
+    rsGetElementAt_int3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern int4 __attribute__((overloadable))
+    rsGetElementAt_int4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uint __attribute__((overloadable))
+    rsGetElementAt_uint(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uint2 __attribute__((overloadable))
+    rsGetElementAt_uint2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uint3 __attribute__((overloadable))
+    rsGetElementAt_uint3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uint4 __attribute__((overloadable))
+    rsGetElementAt_uint4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern long __attribute__((overloadable))
+    rsGetElementAt_long(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern long2 __attribute__((overloadable))
+    rsGetElementAt_long2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern long3 __attribute__((overloadable))
+    rsGetElementAt_long3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern long4 __attribute__((overloadable))
+    rsGetElementAt_long4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ulong __attribute__((overloadable))
+    rsGetElementAt_ulong(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ulong2 __attribute__((overloadable))
+    rsGetElementAt_ulong2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ulong3 __attribute__((overloadable))
+    rsGetElementAt_ulong3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ulong4 __attribute__((overloadable))
+    rsGetElementAt_ulong4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern half __attribute__((overloadable))
+    rsGetElementAt_half(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern half2 __attribute__((overloadable))
+    rsGetElementAt_half2(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern half3 __attribute__((overloadable))
+    rsGetElementAt_half3(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern half4 __attribute__((overloadable))
+    rsGetElementAt_half4(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern half __attribute__((overloadable))
+    rsGetElementAt_half(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern half2 __attribute__((overloadable))
+    rsGetElementAt_half2(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern half3 __attribute__((overloadable))
+    rsGetElementAt_half3(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern half4 __attribute__((overloadable))
+    rsGetElementAt_half4(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern half __attribute__((overloadable))
+    rsGetElementAt_half(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern half2 __attribute__((overloadable))
+    rsGetElementAt_half2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern half3 __attribute__((overloadable))
+    rsGetElementAt_half3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern half4 __attribute__((overloadable))
+    rsGetElementAt_half4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+/*
+ * rsGetElementAtYuv_uchar_U: Get the U component of an allocation of YUVs
+ *
+ * Extracts the U component of a single YUV value from a 2D allocation of YUVs.
+ *
+ * Inside an allocation, Y, U, and V components may be stored in different planes
+ * and at different resolutions.  The x, y coordinates provided here are in the
+ * dimensions of the Y plane.
+ *
+ * See rsGetElementAtYuv_uchar_Y().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uchar __attribute__((overloadable))
+    rsGetElementAtYuv_uchar_U(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+/*
+ * rsGetElementAtYuv_uchar_V: Get the V component of an allocation of YUVs
+ *
+ * Extracts the V component of a single YUV value from a 2D allocation of YUVs.
+ *
+ * Inside an allocation, Y, U, and V components may be stored in different planes
+ * and at different resolutions.  The x, y coordinates provided here are in the
+ * dimensions of the Y plane.
+ *
+ * See rsGetElementAtYuv_uchar_Y().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uchar __attribute__((overloadable))
+    rsGetElementAtYuv_uchar_V(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+/*
+ * rsGetElementAtYuv_uchar_Y: Get the Y component of an allocation of YUVs
+ *
+ * Extracts the Y component of a single YUV value from a 2D allocation of YUVs.
+ *
+ * Inside an allocation, Y, U, and V components may be stored in different planes
+ * and at different resolutions.  The x, y coordinates provided here are in the
+ * dimensions of the Y plane.
+ *
+ * See rsGetElementAtYuv_uchar_U() and rsGetElementAtYuv_uchar_V().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uchar __attribute__((overloadable))
+    rsGetElementAtYuv_uchar_Y(rs_allocation a, uint32_t x, uint32_t y);
+#endif
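+
+/*
+ * Illustrative sketch (not part of the original header): reading the Y, U, and
+ * V components of a YUV allocation from inside a kernel.  The global gYuv and
+ * the kernel name are hypothetical; the U and V reads use Y-plane coordinates,
+ * as described above.
+ *
+ *   rs_allocation gYuv;
+ *
+ *   uchar __attribute__((kernel)) extractLuma(uint32_t x, uint32_t y) {
+ *       uchar u_val = rsGetElementAtYuv_uchar_U(gYuv, x, y);
+ *       uchar v_val = rsGetElementAtYuv_uchar_V(gYuv, x, y);
+ *       uchar y_val = rsGetElementAtYuv_uchar_Y(gYuv, x, y);
+ *       return y_val;  // u_val / v_val would feed a colorspace conversion
+ *   }
+ */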
+
+/*
+ * rsSample: Sample a value from a texture allocation
+ *
+ * Fetches a value from a texture allocation in a way described by the sampler.
+ *
+ * If your allocation is 1D, use the variant with float for location.  For 2D,
+ * use the float2 variant.
+ *
+ * See android.renderscript.Sampler for more details.
+ *
+ * Parameters:
+ *   a: Allocation to sample from.
+ *   s: Sampler state.
+ *   location: Location to sample from.
+ *   lod: Mip level to sample from.  For fractional values, mip levels will be interpolated if RS_SAMPLER_LINEAR_MIP_LINEAR is used.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+extern float4 __attribute__((overloadable))
+    rsSample(rs_allocation a, rs_sampler s, float location);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+extern float4 __attribute__((overloadable))
+    rsSample(rs_allocation a, rs_sampler s, float location, float lod);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+extern float4 __attribute__((overloadable))
+    rsSample(rs_allocation a, rs_sampler s, float2 location);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+extern float4 __attribute__((overloadable))
+    rsSample(rs_allocation a, rs_sampler s, float2 location, float lod);
+#endif
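+
+/*
+ * Illustrative sketch (not part of the original header): sampling a 2D texture
+ * allocation through a sampler.  The globals gTex and gLinear, the kernel name,
+ * and the assumption of a 256x256 launch grid with normalized [0, 1] sample
+ * coordinates are all hypothetical.
+ *
+ *   rs_allocation gTex;
+ *   rs_sampler gLinear;
+ *
+ *   float4 __attribute__((kernel)) sampleAt(uint32_t x, uint32_t y) {
+ *       float2 uv = {x / 256.0f, y / 256.0f};
+ *       return rsSample(gTex, gLinear, uv);
+ *   }
+ */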
+
+/*
+ * rsSetElementAt: Set a cell of an allocation
+ *
+ * This function stores a value into a single cell of an allocation.
+ *
+ * When storing into a three-dimensional allocation, use the x, y, z variant.
+ * Similarly, use the x, y variant for two-dimensional allocations and x for
+ * one-dimensional allocations.
+ *
+ * This function has two styles.  One passes the value to be stored using a void*;
+ * the other takes the actual value as an argument, e.g. rsSetElementAt() vs.
+ * rsSetElementAt_int4().  For primitive types, always use the latter as it is
+ * more efficient.
+ *
+ * See also rsGetElementAt().
+ */
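+
+/*
+ * Illustrative sketch (not part of the original header): the two storage styles
+ * side by side.  The allocation gOut, the indices, and the helper name are
+ * hypothetical; the typed form is preferred for primitives, as noted above.
+ *
+ *   rs_allocation gOut;
+ *
+ *   void storeExamples() {
+ *       int4 v = {1, 2, 3, 4};
+ *       rsSetElementAt_int4(gOut, v, 5);   // typed form: preferred for primitives
+ *       rsSetElementAt(gOut, &v, 6);       // void* form: works for any element type
+ *   }
+ */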
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt(rs_allocation a, void* ptr, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt(rs_allocation a, void* ptr, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_float(rs_allocation a, float val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_float2(rs_allocation a, float2 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_float3(rs_allocation a, float3 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_float4(rs_allocation a, float4 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_double(rs_allocation a, double val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_double2(rs_allocation a, double2 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_double3(rs_allocation a, double3 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_double4(rs_allocation a, double4 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_char(rs_allocation a, char val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_char2(rs_allocation a, char2 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_char3(rs_allocation a, char3 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_char4(rs_allocation a, char4 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uchar(rs_allocation a, uchar val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uchar2(rs_allocation a, uchar2 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uchar3(rs_allocation a, uchar3 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uchar4(rs_allocation a, uchar4 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_short(rs_allocation a, short val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_short2(rs_allocation a, short2 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_short3(rs_allocation a, short3 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_short4(rs_allocation a, short4 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ushort(rs_allocation a, ushort val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ushort2(rs_allocation a, ushort2 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ushort3(rs_allocation a, ushort3 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ushort4(rs_allocation a, ushort4 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_int(rs_allocation a, int val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_int2(rs_allocation a, int2 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_int3(rs_allocation a, int3 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_int4(rs_allocation a, int4 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uint(rs_allocation a, uint val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uint2(rs_allocation a, uint2 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uint3(rs_allocation a, uint3 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uint4(rs_allocation a, uint4 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_long(rs_allocation a, long val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_long2(rs_allocation a, long2 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_long3(rs_allocation a, long3 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_long4(rs_allocation a, long4 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ulong(rs_allocation a, ulong val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ulong2(rs_allocation a, ulong2 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ulong3(rs_allocation a, ulong3 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ulong4(rs_allocation a, ulong4 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_float(rs_allocation a, float val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_float2(rs_allocation a, float2 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_float3(rs_allocation a, float3 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_float4(rs_allocation a, float4 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_double(rs_allocation a, double val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_double2(rs_allocation a, double2 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_double3(rs_allocation a, double3 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_double4(rs_allocation a, double4 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_char(rs_allocation a, char val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_char2(rs_allocation a, char2 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_char3(rs_allocation a, char3 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_char4(rs_allocation a, char4 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uchar(rs_allocation a, uchar val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uchar2(rs_allocation a, uchar2 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uchar3(rs_allocation a, uchar3 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uchar4(rs_allocation a, uchar4 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_short(rs_allocation a, short val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_short2(rs_allocation a, short2 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_short3(rs_allocation a, short3 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_short4(rs_allocation a, short4 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ushort(rs_allocation a, ushort val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ushort2(rs_allocation a, ushort2 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ushort3(rs_allocation a, ushort3 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ushort4(rs_allocation a, ushort4 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_int(rs_allocation a, int val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_int2(rs_allocation a, int2 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_int3(rs_allocation a, int3 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_int4(rs_allocation a, int4 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uint(rs_allocation a, uint val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uint2(rs_allocation a, uint2 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uint3(rs_allocation a, uint3 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uint4(rs_allocation a, uint4 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_long(rs_allocation a, long val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_long2(rs_allocation a, long2 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_long3(rs_allocation a, long3 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_long4(rs_allocation a, long4 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ulong(rs_allocation a, ulong val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ulong2(rs_allocation a, ulong2 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ulong3(rs_allocation a, ulong3 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ulong4(rs_allocation a, ulong4 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_float(rs_allocation a, float val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_float2(rs_allocation a, float2 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_float3(rs_allocation a, float3 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_float4(rs_allocation a, float4 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_double(rs_allocation a, double val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_double2(rs_allocation a, double2 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_double3(rs_allocation a, double3 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_double4(rs_allocation a, double4 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_char(rs_allocation a, char val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_char2(rs_allocation a, char2 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_char3(rs_allocation a, char3 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_char4(rs_allocation a, char4 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uchar(rs_allocation a, uchar val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uchar2(rs_allocation a, uchar2 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uchar3(rs_allocation a, uchar3 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uchar4(rs_allocation a, uchar4 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_short(rs_allocation a, short val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_short2(rs_allocation a, short2 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_short3(rs_allocation a, short3 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_short4(rs_allocation a, short4 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ushort(rs_allocation a, ushort val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ushort2(rs_allocation a, ushort2 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ushort3(rs_allocation a, ushort3 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ushort4(rs_allocation a, ushort4 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_int(rs_allocation a, int val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_int2(rs_allocation a, int2 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_int3(rs_allocation a, int3 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_int4(rs_allocation a, int4 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uint(rs_allocation a, uint val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uint2(rs_allocation a, uint2 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uint3(rs_allocation a, uint3 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uint4(rs_allocation a, uint4 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_long(rs_allocation a, long val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_long2(rs_allocation a, long2 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_long3(rs_allocation a, long3 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_long4(rs_allocation a, long4 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ulong(rs_allocation a, ulong val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ulong2(rs_allocation a, ulong2 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ulong3(rs_allocation a, ulong3 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ulong4(rs_allocation a, ulong4 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern void __attribute__((overloadable))
+    rsSetElementAt_half(rs_allocation a, half val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern void __attribute__((overloadable))
+    rsSetElementAt_half2(rs_allocation a, half2 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern void __attribute__((overloadable))
+    rsSetElementAt_half3(rs_allocation a, half3 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern void __attribute__((overloadable))
+    rsSetElementAt_half4(rs_allocation a, half4 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern void __attribute__((overloadable))
+    rsSetElementAt_half(rs_allocation a, half val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern void __attribute__((overloadable))
+    rsSetElementAt_half2(rs_allocation a, half2 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern void __attribute__((overloadable))
+    rsSetElementAt_half3(rs_allocation a, half3 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern void __attribute__((overloadable))
+    rsSetElementAt_half4(rs_allocation a, half4 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern void __attribute__((overloadable))
+    rsSetElementAt_half(rs_allocation a, half val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern void __attribute__((overloadable))
+    rsSetElementAt_half2(rs_allocation a, half2 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern void __attribute__((overloadable))
+    rsSetElementAt_half3(rs_allocation a, half3 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern void __attribute__((overloadable))
+    rsSetElementAt_half4(rs_allocation a, half4 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#endif // RENDERSCRIPT_RS_ALLOCATION_DATA_RSH
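For context on the rsSetElementAt_* setters declared above, here is a minimal usage sketch. It is illustrative only: the script, global, and package names are hypothetical and not part of this prebuilt; the 2D overload used here requires RS_VERSION >= 18 (always true for these 24.0.3 headers).

    #pragma version(1)
    #pragma rs java_package_name(com.example.rstestforward)  // hypothetical package

    rs_allocation gOut;  // assumed: a 2D float allocation bound from Java

    // Writes one float element at (x, y) using the rsSetElementAt_float
    // overload declared above.
    void setOne(uint32_t x, uint32_t y, float v) {
        rsSetElementAt_float(gOut, v, x, y);
    }
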
diff --git a/24.0.3/include/rs_atomic.rsh b/24.0.3/include/rs_atomic.rsh
new file mode 100644
index 0000000..98a8784
--- /dev/null
+++ b/24.0.3/include/rs_atomic.rsh
@@ -0,0 +1,257 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Don't edit this file!  It is auto-generated by frameworks/rs/api/generate.sh.
+
+/*
+ * rs_atomic.rsh: Atomic Update Functions
+ *
+ * To update values shared between multiple threads, use the functions below.
+ * They ensure that the values are atomically updated, i.e. that the memory
+ * reads, the updates, and the memory writes are done in the right order.
+ *
+ * These functions are slower than their non-atomic equivalents, so use
+ * them only when synchronization is needed.
+ *
+ * Note that in RenderScript, your code is likely to be running in separate
+ * threads even though you did not explicitly create them.  The RenderScript
+ * runtime will very often split the execution of one kernel across multiple
+ * threads.  Updating globals should be done with atomic functions.  If possible,
+ * modify your algorithm to avoid them altogether.
+ */
+
+#ifndef RENDERSCRIPT_RS_ATOMIC_RSH
+#define RENDERSCRIPT_RS_ATOMIC_RSH
+
+/*
+ * rsAtomicAdd: Thread-safe addition
+ *
+ * Atomically adds a value to the value at addr, i.e. *addr += value.
+ *
+ * Parameters:
+ *   addr: Address of the value to modify.
+ *   value: Amount to add.
+ *
+ * Returns: Value of *addr prior to the operation.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+extern int32_t __attribute__((overloadable))
+    rsAtomicAdd(volatile int32_t* addr, int32_t value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+extern int32_t __attribute__((overloadable))
+    rsAtomicAdd(volatile uint32_t* addr, uint32_t value);
+#endif
+
+/*
+ * rsAtomicAnd: Thread-safe bitwise and
+ *
+ * Atomically performs a bitwise and of two values, storing the result back at addr,
+ * i.e. *addr &= value.
+ *
+ * Parameters:
+ *   addr: Address of the value to modify.
+ *   value: Value to and with.
+ *
+ * Returns: Value of *addr prior to the operation.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+extern int32_t __attribute__((overloadable))
+    rsAtomicAnd(volatile int32_t* addr, int32_t value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+extern int32_t __attribute__((overloadable))
+    rsAtomicAnd(volatile uint32_t* addr, uint32_t value);
+#endif
+
+/*
+ * rsAtomicCas: Thread-safe compare and set
+ *
+ * If the value at addr matches compareValue then the newValue is written at addr,
+ * i.e. if (*addr == compareValue) { *addr = newValue; }.
+ *
+ * You can check that the value was written by checking that the value returned
+ * by rsAtomicCas() is compareValue.
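+ *
+ * A minimal retry-loop sketch (illustrative only; incrementShared is a
+ * hypothetical helper, not part of this header):
+ *
+ *   static void incrementShared(volatile int32_t* counter) {
+ *       int32_t old;
+ *       do {
+ *           old = *counter;
+ *       } while (rsAtomicCas(counter, old, old + 1) != old);
+ *   }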
+ *
+ * Parameters:
+ *   addr: Address of the value to compare and replace if the test passes.
+ *   compareValue: Value to test *addr against.
+ *   newValue: Value to write if the test passes.
+ *
+ * Returns: Value of *addr prior to the operation.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+extern int32_t __attribute__((overloadable))
+    rsAtomicCas(volatile int32_t* addr, int32_t compareValue, int32_t newValue);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+extern uint32_t __attribute__((overloadable))
+    rsAtomicCas(volatile uint32_t* addr, uint32_t compareValue, uint32_t newValue);
+#endif
+
+/*
+ * rsAtomicDec: Thread-safe decrement
+ *
+ * Atomically subtracts one from the value at addr.  This is equivalent to rsAtomicSub(addr, 1).
+ *
+ * Parameters:
+ *   addr: Address of the value to decrement.
+ *
+ * Returns: Value of *addr prior to the operation.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+extern int32_t __attribute__((overloadable))
+    rsAtomicDec(volatile int32_t* addr);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+extern int32_t __attribute__((overloadable))
+    rsAtomicDec(volatile uint32_t* addr);
+#endif
+
+/*
+ * rsAtomicInc: Thread-safe increment
+ *
+ * Atomically adds one to the value at addr.  This is equivalent to rsAtomicAdd(addr, 1).
+ *
+ * Parameters:
+ *   addr: Address of the value to increment.
+ *
+ * Returns: Value of *addr prior to the operation.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+extern int32_t __attribute__((overloadable))
+    rsAtomicInc(volatile int32_t* addr);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+extern int32_t __attribute__((overloadable))
+    rsAtomicInc(volatile uint32_t* addr);
+#endif
+
+/*
+ * rsAtomicMax: Thread-safe maximum
+ *
+ * Atomically sets the value at addr to the maximum of *addr and value, i.e.
+ * *addr = max(*addr, value).
+ *
+ * Parameters:
+ *   addr: Address of the value to modify.
+ *   value: Comparison value.
+ *
+ * Returns: Value of *addr prior to the operation.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+extern uint32_t __attribute__((overloadable))
+    rsAtomicMax(volatile uint32_t* addr, uint32_t value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+extern int32_t __attribute__((overloadable))
+    rsAtomicMax(volatile int32_t* addr, int32_t value);
+#endif
+
+/*
+ * rsAtomicMin: Thread-safe minimum
+ *
+ * Atomically sets the value at addr to the minimum of *addr and value, i.e.
+ * *addr = min(*addr, value).
+ *
+ * Parameters:
+ *   addr: Address of the value to modify.
+ *   value: Comparison value.
+ *
+ * Returns: Value of *addr prior to the operation.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+extern uint32_t __attribute__((overloadable))
+    rsAtomicMin(volatile uint32_t* addr, uint32_t value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+extern int32_t __attribute__((overloadable))
+    rsAtomicMin(volatile int32_t* addr, int32_t value);
+#endif
+
+/*
+ * rsAtomicOr: Thread-safe bitwise or
+ *
+ * Atomically performs a bitwise or of two values, storing the result at addr,
+ * i.e. *addr |= value.
+ *
+ * Parameters:
+ *   addr: Address of the value to modify.
+ *   value: Value to or with.
+ *
+ * Returns: Value of *addr prior to the operation.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+extern int32_t __attribute__((overloadable))
+    rsAtomicOr(volatile int32_t* addr, int32_t value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+extern int32_t __attribute__((overloadable))
+    rsAtomicOr(volatile uint32_t* addr, uint32_t value);
+#endif
+
+/*
+ * rsAtomicSub: Thread-safe subtraction
+ *
+ * Atomically subtracts a value from the value at addr, i.e. *addr -= value.
+ *
+ * Parameters:
+ *   addr: Address of the value to modify.
+ *   value: Amount to subtract.
+ *
+ * Returns: Value of *addr prior to the operation.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+extern int32_t __attribute__((overloadable))
+    rsAtomicSub(volatile int32_t* addr, int32_t value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+extern int32_t __attribute__((overloadable))
+    rsAtomicSub(volatile uint32_t* addr, uint32_t value);
+#endif
+
+/*
+ * rsAtomicXor: Thread-safe bitwise exclusive or
+ *
+ * Atomically performs a bitwise xor of two values, storing the result at addr,
+ * i.e. *addr ^= value.
+ *
+ * Parameters:
+ *   addr: Address of the value to modify.
+ *   value: Value to xor with.
+ *
+ * Returns: Value of *addr prior to the operation.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+extern int32_t __attribute__((overloadable))
+    rsAtomicXor(volatile int32_t* addr, int32_t value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+extern int32_t __attribute__((overloadable))
+    rsAtomicXor(volatile uint32_t* addr, uint32_t value);
+#endif
+
+#endif // RENDERSCRIPT_RS_ATOMIC_RSH
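As a usage note for rs_atomic.rsh above: because the runtime may split one kernel across many threads, a shared script global must be updated with these functions. The sketch below is illustrative only; the kernel, global, and package names are hypothetical, not part of the prebuilt.

    #pragma version(1)
    #pragma rs java_package_name(com.example.rstestforward)  // hypothetical package

    volatile int32_t gMatches;  // global shared by every thread running the kernel

    // Counts input cells above a threshold; rsAtomicInc keeps the shared
    // update safe even when the kernel runs on several threads at once.
    void __attribute__((kernel)) count(uchar in) {
        if (in > 128) {
            rsAtomicInc(&gMatches);
        }
    }
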
diff --git a/24.0.3/include/rs_convert.rsh b/24.0.3/include/rs_convert.rsh
new file mode 100644
index 0000000..4c318d4
--- /dev/null
+++ b/24.0.3/include/rs_convert.rsh
@@ -0,0 +1,1623 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Don't edit this file!  It is auto-generated by frameworks/rs/api/generate.sh.
+
+/*
+ * rs_convert.rsh: Conversion Functions
+ *
+ * The functions below convert from a numerical vector type to another, or from one color
+ * representation to another.
+ */
+
+#ifndef RENDERSCRIPT_RS_CONVERT_RSH
+#define RENDERSCRIPT_RS_CONVERT_RSH
+
+/*
+ * convert: Convert numerical vectors
+ *
+ * Converts a vector from one numerical type to another.  The conversions are done entry by entry.
+ *
+ * E.g., calling a = convert_short3(b); is equivalent to doing
+ * a.x = (short)b.x; a.y = (short)b.y; a.z = (short)b.z;.
+ *
+ * Converting floating point values to integer types truncates.
+ *
+ * Converting numbers too large to fit the destination type yields undefined results.
+ * For example, converting a float that contains 1.0e18 to a short is undefined.
+ * Use clamp() to avoid this.
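+ *
+ * A short clamp-then-convert sketch (illustrative only; toPixel is a
+ * hypothetical helper, not part of this header):
+ *
+ *   static uchar4 toPixel(float4 color) {
+ *       // Clamp to the representable range first, then truncate per entry.
+ *       return convert_uchar4(clamp(color * 255.f, 0.f, 255.f));
+ *   }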
+ */
+extern float2 __attribute__((const, overloadable))
+    convert_float2(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    convert_float3(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    convert_float4(float4 v);
+
+extern float2 __attribute__((const, overloadable))
+    convert_float2(char2 v);
+
+extern float3 __attribute__((const, overloadable))
+    convert_float3(char3 v);
+
+extern float4 __attribute__((const, overloadable))
+    convert_float4(char4 v);
+
+extern float2 __attribute__((const, overloadable))
+    convert_float2(uchar2 v);
+
+extern float3 __attribute__((const, overloadable))
+    convert_float3(uchar3 v);
+
+extern float4 __attribute__((const, overloadable))
+    convert_float4(uchar4 v);
+
+extern float2 __attribute__((const, overloadable))
+    convert_float2(short2 v);
+
+extern float3 __attribute__((const, overloadable))
+    convert_float3(short3 v);
+
+extern float4 __attribute__((const, overloadable))
+    convert_float4(short4 v);
+
+extern float2 __attribute__((const, overloadable))
+    convert_float2(ushort2 v);
+
+extern float3 __attribute__((const, overloadable))
+    convert_float3(ushort3 v);
+
+extern float4 __attribute__((const, overloadable))
+    convert_float4(ushort4 v);
+
+extern float2 __attribute__((const, overloadable))
+    convert_float2(int2 v);
+
+extern float3 __attribute__((const, overloadable))
+    convert_float3(int3 v);
+
+extern float4 __attribute__((const, overloadable))
+    convert_float4(int4 v);
+
+extern float2 __attribute__((const, overloadable))
+    convert_float2(uint2 v);
+
+extern float3 __attribute__((const, overloadable))
+    convert_float3(uint3 v);
+
+extern float4 __attribute__((const, overloadable))
+    convert_float4(uint4 v);
+
+extern char2 __attribute__((const, overloadable))
+    convert_char2(float2 v);
+
+extern char3 __attribute__((const, overloadable))
+    convert_char3(float3 v);
+
+extern char4 __attribute__((const, overloadable))
+    convert_char4(float4 v);
+
+extern char2 __attribute__((const, overloadable))
+    convert_char2(char2 v);
+
+extern char3 __attribute__((const, overloadable))
+    convert_char3(char3 v);
+
+extern char4 __attribute__((const, overloadable))
+    convert_char4(char4 v);
+
+extern char2 __attribute__((const, overloadable))
+    convert_char2(uchar2 v);
+
+extern char3 __attribute__((const, overloadable))
+    convert_char3(uchar3 v);
+
+extern char4 __attribute__((const, overloadable))
+    convert_char4(uchar4 v);
+
+extern char2 __attribute__((const, overloadable))
+    convert_char2(short2 v);
+
+extern char3 __attribute__((const, overloadable))
+    convert_char3(short3 v);
+
+extern char4 __attribute__((const, overloadable))
+    convert_char4(short4 v);
+
+extern char2 __attribute__((const, overloadable))
+    convert_char2(ushort2 v);
+
+extern char3 __attribute__((const, overloadable))
+    convert_char3(ushort3 v);
+
+extern char4 __attribute__((const, overloadable))
+    convert_char4(ushort4 v);
+
+extern char2 __attribute__((const, overloadable))
+    convert_char2(int2 v);
+
+extern char3 __attribute__((const, overloadable))
+    convert_char3(int3 v);
+
+extern char4 __attribute__((const, overloadable))
+    convert_char4(int4 v);
+
+extern char2 __attribute__((const, overloadable))
+    convert_char2(uint2 v);
+
+extern char3 __attribute__((const, overloadable))
+    convert_char3(uint3 v);
+
+extern char4 __attribute__((const, overloadable))
+    convert_char4(uint4 v);
+
+extern uchar2 __attribute__((const, overloadable))
+    convert_uchar2(float2 v);
+
+extern uchar3 __attribute__((const, overloadable))
+    convert_uchar3(float3 v);
+
+extern uchar4 __attribute__((const, overloadable))
+    convert_uchar4(float4 v);
+
+extern uchar2 __attribute__((const, overloadable))
+    convert_uchar2(char2 v);
+
+extern uchar3 __attribute__((const, overloadable))
+    convert_uchar3(char3 v);
+
+extern uchar4 __attribute__((const, overloadable))
+    convert_uchar4(char4 v);
+
+extern uchar2 __attribute__((const, overloadable))
+    convert_uchar2(uchar2 v);
+
+extern uchar3 __attribute__((const, overloadable))
+    convert_uchar3(uchar3 v);
+
+extern uchar4 __attribute__((const, overloadable))
+    convert_uchar4(uchar4 v);
+
+extern uchar2 __attribute__((const, overloadable))
+    convert_uchar2(short2 v);
+
+extern uchar3 __attribute__((const, overloadable))
+    convert_uchar3(short3 v);
+
+extern uchar4 __attribute__((const, overloadable))
+    convert_uchar4(short4 v);
+
+extern uchar2 __attribute__((const, overloadable))
+    convert_uchar2(ushort2 v);
+
+extern uchar3 __attribute__((const, overloadable))
+    convert_uchar3(ushort3 v);
+
+extern uchar4 __attribute__((const, overloadable))
+    convert_uchar4(ushort4 v);
+
+extern uchar2 __attribute__((const, overloadable))
+    convert_uchar2(int2 v);
+
+extern uchar3 __attribute__((const, overloadable))
+    convert_uchar3(int3 v);
+
+extern uchar4 __attribute__((const, overloadable))
+    convert_uchar4(int4 v);
+
+extern uchar2 __attribute__((const, overloadable))
+    convert_uchar2(uint2 v);
+
+extern uchar3 __attribute__((const, overloadable))
+    convert_uchar3(uint3 v);
+
+extern uchar4 __attribute__((const, overloadable))
+    convert_uchar4(uint4 v);
+
+extern short2 __attribute__((const, overloadable))
+    convert_short2(float2 v);
+
+extern short3 __attribute__((const, overloadable))
+    convert_short3(float3 v);
+
+extern short4 __attribute__((const, overloadable))
+    convert_short4(float4 v);
+
+extern short2 __attribute__((const, overloadable))
+    convert_short2(char2 v);
+
+extern short3 __attribute__((const, overloadable))
+    convert_short3(char3 v);
+
+extern short4 __attribute__((const, overloadable))
+    convert_short4(char4 v);
+
+extern short2 __attribute__((const, overloadable))
+    convert_short2(uchar2 v);
+
+extern short3 __attribute__((const, overloadable))
+    convert_short3(uchar3 v);
+
+extern short4 __attribute__((const, overloadable))
+    convert_short4(uchar4 v);
+
+extern short2 __attribute__((const, overloadable))
+    convert_short2(short2 v);
+
+extern short3 __attribute__((const, overloadable))
+    convert_short3(short3 v);
+
+extern short4 __attribute__((const, overloadable))
+    convert_short4(short4 v);
+
+extern short2 __attribute__((const, overloadable))
+    convert_short2(ushort2 v);
+
+extern short3 __attribute__((const, overloadable))
+    convert_short3(ushort3 v);
+
+extern short4 __attribute__((const, overloadable))
+    convert_short4(ushort4 v);
+
+extern short2 __attribute__((const, overloadable))
+    convert_short2(int2 v);
+
+extern short3 __attribute__((const, overloadable))
+    convert_short3(int3 v);
+
+extern short4 __attribute__((const, overloadable))
+    convert_short4(int4 v);
+
+extern short2 __attribute__((const, overloadable))
+    convert_short2(uint2 v);
+
+extern short3 __attribute__((const, overloadable))
+    convert_short3(uint3 v);
+
+extern short4 __attribute__((const, overloadable))
+    convert_short4(uint4 v);
+
+extern ushort2 __attribute__((const, overloadable))
+    convert_ushort2(float2 v);
+
+extern ushort3 __attribute__((const, overloadable))
+    convert_ushort3(float3 v);
+
+extern ushort4 __attribute__((const, overloadable))
+    convert_ushort4(float4 v);
+
+extern ushort2 __attribute__((const, overloadable))
+    convert_ushort2(char2 v);
+
+extern ushort3 __attribute__((const, overloadable))
+    convert_ushort3(char3 v);
+
+extern ushort4 __attribute__((const, overloadable))
+    convert_ushort4(char4 v);
+
+extern ushort2 __attribute__((const, overloadable))
+    convert_ushort2(uchar2 v);
+
+extern ushort3 __attribute__((const, overloadable))
+    convert_ushort3(uchar3 v);
+
+extern ushort4 __attribute__((const, overloadable))
+    convert_ushort4(uchar4 v);
+
+extern ushort2 __attribute__((const, overloadable))
+    convert_ushort2(short2 v);
+
+extern ushort3 __attribute__((const, overloadable))
+    convert_ushort3(short3 v);
+
+extern ushort4 __attribute__((const, overloadable))
+    convert_ushort4(short4 v);
+
+extern ushort2 __attribute__((const, overloadable))
+    convert_ushort2(ushort2 v);
+
+extern ushort3 __attribute__((const, overloadable))
+    convert_ushort3(ushort3 v);
+
+extern ushort4 __attribute__((const, overloadable))
+    convert_ushort4(ushort4 v);
+
+extern ushort2 __attribute__((const, overloadable))
+    convert_ushort2(int2 v);
+
+extern ushort3 __attribute__((const, overloadable))
+    convert_ushort3(int3 v);
+
+extern ushort4 __attribute__((const, overloadable))
+    convert_ushort4(int4 v);
+
+extern ushort2 __attribute__((const, overloadable))
+    convert_ushort2(uint2 v);
+
+extern ushort3 __attribute__((const, overloadable))
+    convert_ushort3(uint3 v);
+
+extern ushort4 __attribute__((const, overloadable))
+    convert_ushort4(uint4 v);
+
+extern int2 __attribute__((const, overloadable))
+    convert_int2(float2 v);
+
+extern int3 __attribute__((const, overloadable))
+    convert_int3(float3 v);
+
+extern int4 __attribute__((const, overloadable))
+    convert_int4(float4 v);
+
+extern int2 __attribute__((const, overloadable))
+    convert_int2(char2 v);
+
+extern int3 __attribute__((const, overloadable))
+    convert_int3(char3 v);
+
+extern int4 __attribute__((const, overloadable))
+    convert_int4(char4 v);
+
+extern int2 __attribute__((const, overloadable))
+    convert_int2(uchar2 v);
+
+extern int3 __attribute__((const, overloadable))
+    convert_int3(uchar3 v);
+
+extern int4 __attribute__((const, overloadable))
+    convert_int4(uchar4 v);
+
+extern int2 __attribute__((const, overloadable))
+    convert_int2(short2 v);
+
+extern int3 __attribute__((const, overloadable))
+    convert_int3(short3 v);
+
+extern int4 __attribute__((const, overloadable))
+    convert_int4(short4 v);
+
+extern int2 __attribute__((const, overloadable))
+    convert_int2(ushort2 v);
+
+extern int3 __attribute__((const, overloadable))
+    convert_int3(ushort3 v);
+
+extern int4 __attribute__((const, overloadable))
+    convert_int4(ushort4 v);
+
+extern int2 __attribute__((const, overloadable))
+    convert_int2(int2 v);
+
+extern int3 __attribute__((const, overloadable))
+    convert_int3(int3 v);
+
+extern int4 __attribute__((const, overloadable))
+    convert_int4(int4 v);
+
+extern int2 __attribute__((const, overloadable))
+    convert_int2(uint2 v);
+
+extern int3 __attribute__((const, overloadable))
+    convert_int3(uint3 v);
+
+extern int4 __attribute__((const, overloadable))
+    convert_int4(uint4 v);
+
+extern uint2 __attribute__((const, overloadable))
+    convert_uint2(float2 v);
+
+extern uint3 __attribute__((const, overloadable))
+    convert_uint3(float3 v);
+
+extern uint4 __attribute__((const, overloadable))
+    convert_uint4(float4 v);
+
+extern uint2 __attribute__((const, overloadable))
+    convert_uint2(char2 v);
+
+extern uint3 __attribute__((const, overloadable))
+    convert_uint3(char3 v);
+
+extern uint4 __attribute__((const, overloadable))
+    convert_uint4(char4 v);
+
+extern uint2 __attribute__((const, overloadable))
+    convert_uint2(uchar2 v);
+
+extern uint3 __attribute__((const, overloadable))
+    convert_uint3(uchar3 v);
+
+extern uint4 __attribute__((const, overloadable))
+    convert_uint4(uchar4 v);
+
+extern uint2 __attribute__((const, overloadable))
+    convert_uint2(short2 v);
+
+extern uint3 __attribute__((const, overloadable))
+    convert_uint3(short3 v);
+
+extern uint4 __attribute__((const, overloadable))
+    convert_uint4(short4 v);
+
+extern uint2 __attribute__((const, overloadable))
+    convert_uint2(ushort2 v);
+
+extern uint3 __attribute__((const, overloadable))
+    convert_uint3(ushort3 v);
+
+extern uint4 __attribute__((const, overloadable))
+    convert_uint4(ushort4 v);
+
+extern uint2 __attribute__((const, overloadable))
+    convert_uint2(int2 v);
+
+extern uint3 __attribute__((const, overloadable))
+    convert_uint3(int3 v);
+
+extern uint4 __attribute__((const, overloadable))
+    convert_uint4(int4 v);
+
+extern uint2 __attribute__((const, overloadable))
+    convert_uint2(uint2 v);
+
+extern uint3 __attribute__((const, overloadable))
+    convert_uint3(uint3 v);
+
+extern uint4 __attribute__((const, overloadable))
+    convert_uint4(uint4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double2 __attribute__((const, overloadable))
+    convert_double2(double2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double3 __attribute__((const, overloadable))
+    convert_double3(double3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double4 __attribute__((const, overloadable))
+    convert_double4(double4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double2 __attribute__((const, overloadable))
+    convert_double2(long2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double3 __attribute__((const, overloadable))
+    convert_double3(long3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double4 __attribute__((const, overloadable))
+    convert_double4(long4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double2 __attribute__((const, overloadable))
+    convert_double2(ulong2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double3 __attribute__((const, overloadable))
+    convert_double3(ulong3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double4 __attribute__((const, overloadable))
+    convert_double4(ulong4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long2 __attribute__((const, overloadable))
+    convert_long2(double2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long3 __attribute__((const, overloadable))
+    convert_long3(double3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long4 __attribute__((const, overloadable))
+    convert_long4(double4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long2 __attribute__((const, overloadable))
+    convert_long2(long2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long3 __attribute__((const, overloadable))
+    convert_long3(long3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long4 __attribute__((const, overloadable))
+    convert_long4(long4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long2 __attribute__((const, overloadable))
+    convert_long2(ulong2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long3 __attribute__((const, overloadable))
+    convert_long3(ulong3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long4 __attribute__((const, overloadable))
+    convert_long4(ulong4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong2 __attribute__((const, overloadable))
+    convert_ulong2(double2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong3 __attribute__((const, overloadable))
+    convert_ulong3(double3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong4 __attribute__((const, overloadable))
+    convert_ulong4(double4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong2 __attribute__((const, overloadable))
+    convert_ulong2(long2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong3 __attribute__((const, overloadable))
+    convert_ulong3(long3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong4 __attribute__((const, overloadable))
+    convert_ulong4(long4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong2 __attribute__((const, overloadable))
+    convert_ulong2(ulong2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong3 __attribute__((const, overloadable))
+    convert_ulong3(ulong3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong4 __attribute__((const, overloadable))
+    convert_ulong4(ulong4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    convert_float2(double2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    convert_float3(double3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    convert_float4(double4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    convert_float2(long2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    convert_float3(long3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    convert_float4(long4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    convert_float2(ulong2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    convert_float3(ulong3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    convert_float4(ulong4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern char2 __attribute__((const, overloadable))
+    convert_char2(double2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern char3 __attribute__((const, overloadable))
+    convert_char3(double3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern char4 __attribute__((const, overloadable))
+    convert_char4(double4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern char2 __attribute__((const, overloadable))
+    convert_char2(long2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern char3 __attribute__((const, overloadable))
+    convert_char3(long3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern char4 __attribute__((const, overloadable))
+    convert_char4(long4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern char2 __attribute__((const, overloadable))
+    convert_char2(ulong2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern char3 __attribute__((const, overloadable))
+    convert_char3(ulong3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern char4 __attribute__((const, overloadable))
+    convert_char4(ulong4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uchar2 __attribute__((const, overloadable))
+    convert_uchar2(double2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uchar3 __attribute__((const, overloadable))
+    convert_uchar3(double3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uchar4 __attribute__((const, overloadable))
+    convert_uchar4(double4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uchar2 __attribute__((const, overloadable))
+    convert_uchar2(long2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uchar3 __attribute__((const, overloadable))
+    convert_uchar3(long3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uchar4 __attribute__((const, overloadable))
+    convert_uchar4(long4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uchar2 __attribute__((const, overloadable))
+    convert_uchar2(ulong2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uchar3 __attribute__((const, overloadable))
+    convert_uchar3(ulong3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uchar4 __attribute__((const, overloadable))
+    convert_uchar4(ulong4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern short2 __attribute__((const, overloadable))
+    convert_short2(double2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern short3 __attribute__((const, overloadable))
+    convert_short3(double3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern short4 __attribute__((const, overloadable))
+    convert_short4(double4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern short2 __attribute__((const, overloadable))
+    convert_short2(long2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern short3 __attribute__((const, overloadable))
+    convert_short3(long3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern short4 __attribute__((const, overloadable))
+    convert_short4(long4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern short2 __attribute__((const, overloadable))
+    convert_short2(ulong2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern short3 __attribute__((const, overloadable))
+    convert_short3(ulong3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern short4 __attribute__((const, overloadable))
+    convert_short4(ulong4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ushort2 __attribute__((const, overloadable))
+    convert_ushort2(double2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ushort3 __attribute__((const, overloadable))
+    convert_ushort3(double3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ushort4 __attribute__((const, overloadable))
+    convert_ushort4(double4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ushort2 __attribute__((const, overloadable))
+    convert_ushort2(long2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ushort3 __attribute__((const, overloadable))
+    convert_ushort3(long3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ushort4 __attribute__((const, overloadable))
+    convert_ushort4(long4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ushort2 __attribute__((const, overloadable))
+    convert_ushort2(ulong2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ushort3 __attribute__((const, overloadable))
+    convert_ushort3(ulong3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ushort4 __attribute__((const, overloadable))
+    convert_ushort4(ulong4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern int2 __attribute__((const, overloadable))
+    convert_int2(double2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern int3 __attribute__((const, overloadable))
+    convert_int3(double3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern int4 __attribute__((const, overloadable))
+    convert_int4(double4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern int2 __attribute__((const, overloadable))
+    convert_int2(long2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern int3 __attribute__((const, overloadable))
+    convert_int3(long3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern int4 __attribute__((const, overloadable))
+    convert_int4(long4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern int2 __attribute__((const, overloadable))
+    convert_int2(ulong2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern int3 __attribute__((const, overloadable))
+    convert_int3(ulong3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern int4 __attribute__((const, overloadable))
+    convert_int4(ulong4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uint2 __attribute__((const, overloadable))
+    convert_uint2(double2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uint3 __attribute__((const, overloadable))
+    convert_uint3(double3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uint4 __attribute__((const, overloadable))
+    convert_uint4(double4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uint2 __attribute__((const, overloadable))
+    convert_uint2(long2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uint3 __attribute__((const, overloadable))
+    convert_uint3(long3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uint4 __attribute__((const, overloadable))
+    convert_uint4(long4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uint2 __attribute__((const, overloadable))
+    convert_uint2(ulong2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uint3 __attribute__((const, overloadable))
+    convert_uint3(ulong3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uint4 __attribute__((const, overloadable))
+    convert_uint4(ulong4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double2 __attribute__((const, overloadable))
+    convert_double2(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double3 __attribute__((const, overloadable))
+    convert_double3(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double4 __attribute__((const, overloadable))
+    convert_double4(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double2 __attribute__((const, overloadable))
+    convert_double2(char2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double3 __attribute__((const, overloadable))
+    convert_double3(char3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double4 __attribute__((const, overloadable))
+    convert_double4(char4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double2 __attribute__((const, overloadable))
+    convert_double2(uchar2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double3 __attribute__((const, overloadable))
+    convert_double3(uchar3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double4 __attribute__((const, overloadable))
+    convert_double4(uchar4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double2 __attribute__((const, overloadable))
+    convert_double2(short2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double3 __attribute__((const, overloadable))
+    convert_double3(short3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double4 __attribute__((const, overloadable))
+    convert_double4(short4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double2 __attribute__((const, overloadable))
+    convert_double2(ushort2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double3 __attribute__((const, overloadable))
+    convert_double3(ushort3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double4 __attribute__((const, overloadable))
+    convert_double4(ushort4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double2 __attribute__((const, overloadable))
+    convert_double2(int2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double3 __attribute__((const, overloadable))
+    convert_double3(int3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double4 __attribute__((const, overloadable))
+    convert_double4(int4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double2 __attribute__((const, overloadable))
+    convert_double2(uint2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double3 __attribute__((const, overloadable))
+    convert_double3(uint3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double4 __attribute__((const, overloadable))
+    convert_double4(uint4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long2 __attribute__((const, overloadable))
+    convert_long2(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long3 __attribute__((const, overloadable))
+    convert_long3(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long4 __attribute__((const, overloadable))
+    convert_long4(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long2 __attribute__((const, overloadable))
+    convert_long2(char2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long3 __attribute__((const, overloadable))
+    convert_long3(char3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long4 __attribute__((const, overloadable))
+    convert_long4(char4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long2 __attribute__((const, overloadable))
+    convert_long2(uchar2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long3 __attribute__((const, overloadable))
+    convert_long3(uchar3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long4 __attribute__((const, overloadable))
+    convert_long4(uchar4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long2 __attribute__((const, overloadable))
+    convert_long2(short2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long3 __attribute__((const, overloadable))
+    convert_long3(short3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long4 __attribute__((const, overloadable))
+    convert_long4(short4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long2 __attribute__((const, overloadable))
+    convert_long2(ushort2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long3 __attribute__((const, overloadable))
+    convert_long3(ushort3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long4 __attribute__((const, overloadable))
+    convert_long4(ushort4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long2 __attribute__((const, overloadable))
+    convert_long2(int2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long3 __attribute__((const, overloadable))
+    convert_long3(int3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long4 __attribute__((const, overloadable))
+    convert_long4(int4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long2 __attribute__((const, overloadable))
+    convert_long2(uint2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long3 __attribute__((const, overloadable))
+    convert_long3(uint3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long4 __attribute__((const, overloadable))
+    convert_long4(uint4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong2 __attribute__((const, overloadable))
+    convert_ulong2(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong3 __attribute__((const, overloadable))
+    convert_ulong3(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong4 __attribute__((const, overloadable))
+    convert_ulong4(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong2 __attribute__((const, overloadable))
+    convert_ulong2(char2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong3 __attribute__((const, overloadable))
+    convert_ulong3(char3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong4 __attribute__((const, overloadable))
+    convert_ulong4(char4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong2 __attribute__((const, overloadable))
+    convert_ulong2(uchar2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong3 __attribute__((const, overloadable))
+    convert_ulong3(uchar3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong4 __attribute__((const, overloadable))
+    convert_ulong4(uchar4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong2 __attribute__((const, overloadable))
+    convert_ulong2(short2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong3 __attribute__((const, overloadable))
+    convert_ulong3(short3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong4 __attribute__((const, overloadable))
+    convert_ulong4(short4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong2 __attribute__((const, overloadable))
+    convert_ulong2(ushort2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong3 __attribute__((const, overloadable))
+    convert_ulong3(ushort3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong4 __attribute__((const, overloadable))
+    convert_ulong4(ushort4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong2 __attribute__((const, overloadable))
+    convert_ulong2(int2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong3 __attribute__((const, overloadable))
+    convert_ulong3(int3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong4 __attribute__((const, overloadable))
+    convert_ulong4(int4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong2 __attribute__((const, overloadable))
+    convert_ulong2(uint2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong3 __attribute__((const, overloadable))
+    convert_ulong3(uint3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong4 __attribute__((const, overloadable))
+    convert_ulong4(uint4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    convert_half2(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    convert_half3(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    convert_half4(half4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern float2 __attribute__((const, overloadable))
+    convert_float2(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern float3 __attribute__((const, overloadable))
+    convert_float3(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern float4 __attribute__((const, overloadable))
+    convert_float4(half4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern double2 __attribute__((const, overloadable))
+    convert_double2(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern double3 __attribute__((const, overloadable))
+    convert_double3(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern double4 __attribute__((const, overloadable))
+    convert_double4(half4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern char2 __attribute__((const, overloadable))
+    convert_char2(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern char3 __attribute__((const, overloadable))
+    convert_char3(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern char4 __attribute__((const, overloadable))
+    convert_char4(half4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern uchar2 __attribute__((const, overloadable))
+    convert_uchar2(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern uchar3 __attribute__((const, overloadable))
+    convert_uchar3(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern uchar4 __attribute__((const, overloadable))
+    convert_uchar4(half4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern short2 __attribute__((const, overloadable))
+    convert_short2(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern short3 __attribute__((const, overloadable))
+    convert_short3(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern short4 __attribute__((const, overloadable))
+    convert_short4(half4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern ushort2 __attribute__((const, overloadable))
+    convert_ushort2(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern ushort3 __attribute__((const, overloadable))
+    convert_ushort3(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern ushort4 __attribute__((const, overloadable))
+    convert_ushort4(half4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern int2 __attribute__((const, overloadable))
+    convert_int2(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern int3 __attribute__((const, overloadable))
+    convert_int3(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern int4 __attribute__((const, overloadable))
+    convert_int4(half4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern uint2 __attribute__((const, overloadable))
+    convert_uint2(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern uint3 __attribute__((const, overloadable))
+    convert_uint3(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern uint4 __attribute__((const, overloadable))
+    convert_uint4(half4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern long2 __attribute__((const, overloadable))
+    convert_long2(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern long3 __attribute__((const, overloadable))
+    convert_long3(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern long4 __attribute__((const, overloadable))
+    convert_long4(half4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern ulong2 __attribute__((const, overloadable))
+    convert_ulong2(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern ulong3 __attribute__((const, overloadable))
+    convert_ulong3(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern ulong4 __attribute__((const, overloadable))
+    convert_ulong4(half4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    convert_half2(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    convert_half3(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    convert_half4(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    convert_half2(double2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    convert_half3(double3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    convert_half4(double4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    convert_half2(char2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    convert_half3(char3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    convert_half4(char4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    convert_half2(uchar2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    convert_half3(uchar3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    convert_half4(uchar4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    convert_half2(short2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    convert_half3(short3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    convert_half4(short4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    convert_half2(ushort2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    convert_half3(ushort3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    convert_half4(ushort4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    convert_half2(int2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    convert_half3(int3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    convert_half4(int4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    convert_half2(uint2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    convert_half3(uint3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    convert_half4(uint4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    convert_half2(long2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    convert_half3(long3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    convert_half4(long4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    convert_half2(ulong2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    convert_half3(ulong3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    convert_half4(ulong4 v);
+#endif
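+
+/*
+ * A minimal usage sketch of the convert_* functions above (the kernel name
+ * "toBytes" is a placeholder): they change the element type of a vector while
+ * keeping its size, without saturating out-of-range values.
+ *
+ * uchar4 RS_KERNEL toBytes(float4 in) {
+ *   // Clamp first, since convert_uchar4() does not saturate.
+ *   return convert_uchar4(clamp(in, 0.f, 1.f) * 255.f);
+ * }
+ */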
+
+/*
+ * rsPackColorTo8888: Create a uchar4 RGBA from floats
+ *
+ * Packs three or four floating point RGBA values into a uchar4.
+ *
+ * The input values are typically between 0.0f and 1.0f inclusive.  For input values outside
+ * of this range, the resulting outputs will be clamped to be between 0 and 255.  As this
+ * clamping may be done after the input is multiplied by 255.f and converted to an integer,
+ * input numbers greater than INT_MAX/255.f or less than INT_MIN/255.f result in
+ * undefined behavior.
+ *
+ * If the alpha component is not specified, it is assumed to be 1.0, i.e. the result will
+ * have an alpha set to 255.
+ *
+ * Parameters:
+ *   r: Red component.
+ *   g: Green component.
+ *   b: Blue component.
+ *   a: Alpha component.
+ *   color: Vector of 3 or 4 floats containing the R, G, B, and A values.
+ */
+extern uchar4 __attribute__((const, overloadable))
+    rsPackColorTo8888(float r, float g, float b);
+
+extern uchar4 __attribute__((const, overloadable))
+    rsPackColorTo8888(float r, float g, float b, float a);
+
+extern uchar4 __attribute__((const, overloadable))
+    rsPackColorTo8888(float3 color);
+
+extern uchar4 __attribute__((const, overloadable))
+    rsPackColorTo8888(float4 color);
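+
+/*
+ * A minimal usage sketch: the three-float overload produces a fully opaque
+ * color, since the missing alpha defaults to 1.0f.
+ *
+ * uchar4 opaqueRed = rsPackColorTo8888(1.f, 0.f, 0.f);  // (255, 0, 0, 255)
+ */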
+
+/*
+ * rsUnpackColor8888: Create a float4 RGBA from uchar4
+ *
+ * Unpacks a uchar4 color to float4.  The resulting floats will be between 0.0 and 1.0 inclusive.
+ */
+extern float4 __attribute__((const))
+    rsUnpackColor8888(uchar4 c);
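+
+/*
+ * A minimal usage sketch of a pack/unpack round trip (the kernel name
+ * "halveRed" is a placeholder):
+ *
+ * uchar4 RS_KERNEL halveRed(uchar4 in) {
+ *   float4 c = rsUnpackColor8888(in);
+ *   c.r *= 0.5f;
+ *   return rsPackColorTo8888(c);
+ * }
+ */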
+
+/*
+ * rsYuvToRGBA: Convert a YUV value to RGBA
+ *
+ * Converts a color from a YUV representation to RGBA.
+ *
+ * We currently don't provide a function to do the reverse conversion.
+ *
+ * Parameters:
+ *   y: Luminance component.
+ *   u: U chrominance component.
+ *   v: V chrominance component.
+ */
+extern float4 __attribute__((const, overloadable))
+    rsYuvToRGBA_float4(uchar y, uchar u, uchar v);
+
+extern uchar4 __attribute__((const, overloadable))
+    rsYuvToRGBA_uchar4(uchar y, uchar u, uchar v);
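+
+/*
+ * A minimal usage sketch, assuming a YUV allocation aYuv and cell coordinates
+ * x and y (all placeholders); the rsGetElementAtYuv_uchar_* accessors are
+ * declared in rs_allocation_data.rsh:
+ *
+ * uchar yIn = rsGetElementAtYuv_uchar_Y(aYuv, x, y);
+ * uchar uIn = rsGetElementAtYuv_uchar_U(aYuv, x, y);
+ * uchar vIn = rsGetElementAtYuv_uchar_V(aYuv, x, y);
+ * uchar4 rgba = rsYuvToRGBA_uchar4(yIn, uIn, vIn);
+ */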
+
+#endif // RENDERSCRIPT_RS_CONVERT_RSH
diff --git a/24.0.3/include/rs_core.rsh b/24.0.3/include/rs_core.rsh
new file mode 100644
index 0000000..ae93d60
--- /dev/null
+++ b/24.0.3/include/rs_core.rsh
@@ -0,0 +1,61 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Don't edit this file!  It is auto-generated by frameworks/rs/api/generate.sh.
+
+/*
+ * rs_core.rsh: Overview
+ *
+ * RenderScript is a high-performance runtime that provides compute operations at the native level.
+ * RenderScript code is compiled on devices at runtime to allow platform-independence as well.
+ *
+ * This reference documentation describes the RenderScript runtime APIs, which you can utilize
+ * to write RenderScript code in C99. The RenderScript compute header files are automatically
+ * included for you.
+ *
+ * To use RenderScript, you need to utilize the RenderScript runtime APIs documented here as well
+ * as the Android framework APIs for RenderScript.  For documentation on the Android framework
+ * APIs, see the android.renderscript package reference.
+ *
+ * For more information on how to develop with RenderScript and how the runtime and Android
+ * framework APIs interact, see the RenderScript developer guide and the RenderScript samples.
+ */
+
+#ifndef RENDERSCRIPT_RS_CORE_RSH
+#define RENDERSCRIPT_RS_CORE_RSH
+
+#define RS_KERNEL __attribute__((kernel))
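+// For example, a function marked with RS_KERNEL runs once per cell of the
+// allocations it is launched over:
+//   float RS_KERNEL square(float in) { return in * in; }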
+
+#include "stdbool.h"
+
+#include "rs_value_types.rsh"
+#include "rs_object_types.rsh"
+
+#include "rs_allocation_create.rsh"
+#include "rs_allocation_data.rsh"
+#include "rs_atomic.rsh"
+#include "rs_convert.rsh"
+#include "rs_debug.rsh"
+#include "rs_for_each.rsh"
+#include "rs_io.rsh"
+#include "rs_math.rsh"
+#include "rs_matrix.rsh"
+#include "rs_object_info.rsh"
+#include "rs_quaternion.rsh"
+#include "rs_time.rsh"
+#include "rs_vector_math.rsh"
+
+#endif // RENDERSCRIPT_RS_CORE_RSH
diff --git a/24.0.3/include/rs_debug.rsh b/24.0.3/include/rs_debug.rsh
new file mode 100644
index 0000000..13c5faa
--- /dev/null
+++ b/24.0.3/include/rs_debug.rsh
@@ -0,0 +1,269 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Don't edit this file!  It is auto-generated by frameworks/rs/api/generate.sh.
+
+/*
+ * rs_debug.rsh: Debugging Functions
+ *
+ * The functions below are intended to be used during application development.
+ * They should not be used in shipping applications.
+ */
+
+#ifndef RENDERSCRIPT_RS_DEBUG_RSH
+#define RENDERSCRIPT_RS_DEBUG_RSH
+
+#define RS_DEBUG(a) rsDebug(#a, a)
+#define RS_DEBUG_MARKER rsDebug(__FILE__, __LINE__)
+
+/*
+ * rsDebug: Log a message and values
+ *
+ * This function prints a message to the standard log, followed by the provided values.
+ *
+ * This function is intended for debugging only and should not be used in shipping
+ * applications.
+ */
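+
+/*
+ * A minimal usage sketch, together with the RS_DEBUG helper macros defined
+ * above (the variable name "pos" is a placeholder):
+ *
+ * float4 pos = {1.f, 2.f, 3.f, 4.f};
+ * rsDebug("pos is", pos);  // message followed by the four components
+ * RS_DEBUG(pos);           // expands to rsDebug("pos", pos)
+ * RS_DEBUG_MARKER;         // logs the current file and line
+ */
+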
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, double a);
+
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, int a);
+
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, uint a);
+
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, long a);
+
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, ulong a);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, int2 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, int3 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, int4 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, uint2 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, uint3 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, uint4 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, long2 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, long3 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, long4 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, ulong2 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, ulong3 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, ulong4 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, double2 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, double3 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, double4 a);
+#endif
+
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, float a);
+
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, float2 a);
+
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, float3 a);
+
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, float4 a);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, half a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, half2 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, half3 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, half4 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, char a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, char2 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, char3 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, char4 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, uchar a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, uchar2 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, uchar3 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, uchar4 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, short a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, short2 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, short3 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, short4 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, ushort a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, ushort2 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, ushort3 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, ushort4 a);
+#endif
+
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, float a, float b);
+
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, float a, float b, float c);
+
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, float a, float b, float c, float d);
+
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, long long a);
+
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, unsigned long long a);
+
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, const void* a);
+
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, const rs_matrix4x4* a);
+
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, const rs_matrix3x3* a);
+
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, const rs_matrix2x2* a);
+
+#endif // RENDERSCRIPT_RS_DEBUG_RSH
diff --git a/24.0.3/include/rs_for_each.rsh b/24.0.3/include/rs_for_each.rsh
new file mode 100644
index 0000000..bcc5db9
--- /dev/null
+++ b/24.0.3/include/rs_for_each.rsh
@@ -0,0 +1,434 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Don't edit this file!  It is auto-generated by frameworks/rs/api/generate.sh.
+
+/*
+ * rs_for_each.rsh: Kernel Invocation Functions and Types
+ *
+ * The rsForEach() function can be used to invoke the root kernel of a script.
+ *
+ * The other functions are used to get the characteristics of the invocation of
+ * an executing kernel, like dimensions and current indices.  These functions take
+ * a rs_kernel_context as argument.
+ */
+
+#ifndef RENDERSCRIPT_RS_FOR_EACH_RSH
+#define RENDERSCRIPT_RS_FOR_EACH_RSH
+
+/*
+ * rs_for_each_strategy_t: Suggested cell processing order
+ *
+ * This type is used to suggest how the invoked kernel should iterate over the cells of the
+ * allocations.  This is a hint only.  Implementations may not follow the suggestion.
+ *
+ * This specification can help the caching behavior of the running kernel, e.g. the cache
+ * locality when the processing is distributed over multiple cores.
+ */
+typedef enum rs_for_each_strategy {
+    RS_FOR_EACH_STRATEGY_SERIAL = 0, // Prefer contiguous memory regions.
+    RS_FOR_EACH_STRATEGY_DONT_CARE = 1, // No preferences.
+    RS_FOR_EACH_STRATEGY_DST_LINEAR = 2, // Prefer DST.
+    RS_FOR_EACH_STRATEGY_TILE_SMALL = 3, // Prefer processing small rectangular regions.
+    RS_FOR_EACH_STRATEGY_TILE_MEDIUM = 4, // Prefer processing medium rectangular regions.
+    RS_FOR_EACH_STRATEGY_TILE_LARGE = 5 // Prefer processing large rectangular regions.
+} rs_for_each_strategy_t;
+
+/*
+ * rs_kernel_context: Handle to a kernel invocation context
+ *
+ * The kernel context contains common characteristics of the allocations being iterated
+ * over, like dimensions.  It also contains rarely used indices of the currently processed
+ * cell, like the Array0 index or the current level of detail.
+ *
+ * You can access the kernel context by adding a special parameter named "context" of type
+ * rs_kernel_context to your kernel function.  See rsGetDimX() and rsGetArray0() for examples.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+typedef const struct rs_kernel_context_t * rs_kernel_context;
+#endif
+
+/*
+ * rs_script_call_t: Cell iteration information
+ *
+ * This structure is used to provide iteration information to a rsForEach call.
+ * It is currently used to restrict processing to a subset of cells.  In future
+ * versions, it will also be used to provide hints on how to best iterate over
+ * the cells.
+ *
+ * The Start fields are inclusive and the End fields are exclusive.  E.g. to iterate
+ * over cells 4, 5, 6, and 7 in the X dimension, set xStart to 4 and xEnd to 8.
+ */
+typedef struct rs_script_call {
+    rs_for_each_strategy_t strategy; // Currently ignored.  In the future, will be suggested cell iteration strategy.
+    uint32_t xStart; // Starting index in the X dimension.
+    uint32_t xEnd; // Ending index (exclusive) in the X dimension.
+    uint32_t yStart; // Starting index in the Y dimension.
+    uint32_t yEnd; // Ending index (exclusive) in the Y dimension.
+    uint32_t zStart; // Starting index in the Z dimension.
+    uint32_t zEnd; // Ending index (exclusive) in the Z dimension.
+    uint32_t arrayStart; // Starting index in the Array0 dimension.
+    uint32_t arrayEnd; // Ending index (exclusive) in the Array0 dimension.
+    uint32_t array1Start; // Starting index in the Array1 dimension.
+    uint32_t array1End; // Ending index (exclusive) in the Array1 dimension.
+    uint32_t array2Start; // Starting index in the Array2 dimension.
+    uint32_t array2End; // Ending index (exclusive) in the Array2 dimension.
+    uint32_t array3Start; // Starting index in the Array3 dimension.
+    uint32_t array3End; // Ending index (exclusive) in the Array3 dimension.
+} rs_script_call_t;
+
+/*
+ * rs_kernel: Handle to a kernel function
+ *
+ *  An opaque type for a function that is defined with the kernel attribute.  A value
+ *  of this type can be used in a rsForEach call to launch a kernel.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+typedef void* rs_kernel;
+#endif
+
+/*
+ * rsForEach: Launches a kernel
+ *
+ * Runs the kernel over zero or more input allocations. They are passed after the
+ * rs_kernel argument. If the specified kernel returns a value, an output allocation
+ * must be specified as the last argument. All input allocations,
+ * and the output allocation if it exists, must have the same dimensions.
+ *
+ * This is a synchronous function. A call to this function only returns after all
+ * the work has completed for all cells of the input allocations. If the kernel
+ * function returns any value, the call waits until all results have been written
+ * to the output allocation.
+ *
+ * Up to API level 23, the kernel is implicitly specified as the kernel named
+ * "root" in the specified script, and only a single input allocation can be used.
+ * Starting in API level 24, an arbitrary kernel function can be used,
+ * as specified by the kernel argument. The script argument is removed.
+ * The kernel must be defined in the current script. In addition, more than one
+ * input can be used.
+ *
+ * E.g.
+ * float __attribute__((kernel)) square(float a) {
+ *   return a * a;
+ * }
+ *
+ * void compute(rs_allocation ain, rs_allocation aout) {
+ *   rsForEach(square, ain, aout);
+ * }
+ *
+ *
+ * Parameters:
+ *   script: Script to call.
+ *   input: Allocation to source data from.
+ *   output: Allocation to write data into.
+ *   usrData: User defined data to pass to the script.  May be NULL.
+ *   sc: Extra control information used to select a sub-region of the allocation to be processed or suggest a walking strategy.  May be NULL.
+ *   usrDataLen: Size of the userData structure.  This will be used to perform a shallow copy of the data if necessary.
+ *   kernel: Function designator to a function that is defined with the kernel attribute.
+ *   ...: Input and output allocations
+ */
+#if !defined(RS_VERSION) || (RS_VERSION <= 13)
+extern void __attribute__((overloadable))
+    rsForEach(rs_script script, rs_allocation input, rs_allocation output, const void* usrData,
+              const rs_script_call_t* sc);
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 13)
+extern void __attribute__((overloadable))
+    rsForEach(rs_script script, rs_allocation input, rs_allocation output, const void* usrData);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 14) && (RS_VERSION <= 20))
+extern void __attribute__((overloadable))
+    rsForEach(rs_script script, rs_allocation input, rs_allocation output, const void* usrData,
+              size_t usrDataLen, const rs_script_call_t* sc);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 14) && (RS_VERSION <= 20))
+extern void __attribute__((overloadable))
+    rsForEach(rs_script script, rs_allocation input, rs_allocation output, const void* usrData,
+              size_t usrDataLen);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 14) && (RS_VERSION <= 23))
+extern void __attribute__((overloadable))
+    rsForEach(rs_script script, rs_allocation input, rs_allocation output);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern void
+    rsForEach(rs_kernel kernel, ...);
+#endif
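+
+/*
+ * A minimal sketch of the legacy form used up to API level 23, which launches
+ * the kernel named "root" of another script (the global gScript and the
+ * allocations are placeholders set up by the host code):
+ *
+ * rs_script gScript;
+ *
+ * void launch(rs_allocation ain, rs_allocation aout) {
+ *   rsForEach(gScript, ain, aout);
+ * }
+ */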
+
+/*
+ * rsForEachWithOptions: Launches a kernel with options
+ *
+ * Launches a kernel in a way similar to rsForEach. However, instead of processing
+ * all cells in the input, this function only processes cells in the subspace of
+ * the index space specified in options. With the index space explicitly specified
+ * by options, no input or output allocation is required for a kernel launch using
+ * this API. If allocations are passed in, they must match the number of arguments
+ * and return value expected by the kernel function. The output allocation is
+ * present if and only if the kernel has a non-void return value.
+ *
+ * E.g.,
+ *    rs_script_call_t opts = {0};
+ *    opts.xStart = 0;
+ *    opts.xEnd = dimX;
+ *    opts.yStart = 0;
+ *    opts.yEnd = dimY / 2;
+ *    rsForEachWithOptions(foo, &opts, out, out);
+ *
+ *
+ * Parameters:
+ *   kernel: Function designator to a function that is defined with the kernel attribute.
+ *   options: Launch options
+ *   ...: Input and output allocations
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern void
+    rsForEachWithOptions(rs_kernel kernel, rs_script_call_t* options, ...);
+#endif
+
+/*
+ * rsGetArray0: Index in the Array0 dimension for the specified kernel context
+ *
+ * Returns the index in the Array0 dimension of the cell being processed, as specified
+ * by the supplied kernel context.
+ *
+ * The kernel context contains common characteristics of the allocations being iterated
+ * over and rarely used indices, like the Array0 index.
+ *
+ * You can access the kernel context by adding a special parameter named "context" of
+ * type rs_kernel_context to your kernel function.  E.g.
+ * short RS_KERNEL myKernel(short value, uint32_t x, rs_kernel_context context) {
+ *   // The current index in the common x, y, z dimensions are accessed by
+ *   // adding these variables as arguments.  For the more rarely used indices
+ *   // to the other dimensions, extract them from the kernel context:
+ *   uint32_t index_a0 = rsGetArray0(context);
+ *   //...
+ * }
+ *
+ * This function returns 0 if the Array0 dimension is not present.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern uint32_t __attribute__((overloadable))
+    rsGetArray0(rs_kernel_context context);
+#endif
+
+/*
+ * rsGetArray1: Index in the Array1 dimension for the specified kernel context
+ *
+ * Returns the index in the Array1 dimension of the cell being processed, as specified
+ * by the supplied kernel context.  See rsGetArray0() for an explanation of the context.
+ *
+ * Returns 0 if the Array1 dimension is not present.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern uint32_t __attribute__((overloadable))
+    rsGetArray1(rs_kernel_context context);
+#endif
+
+/*
+ * rsGetArray2: Index in the Array2 dimension for the specified kernel context
+ *
+ * Returns the index in the Array2 dimension of the cell being processed,
+ * as specified by the supplied kernel context.  See rsGetArray0() for an explanation
+ * of the context.
+ *
+ * Returns 0 if the Array2 dimension is not present.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern uint32_t __attribute__((overloadable))
+    rsGetArray2(rs_kernel_context context);
+#endif
+
+/*
+ * rsGetArray3: Index in the Array3 dimension for the specified kernel context
+ *
+ * Returns the index in the Array3 dimension of the cell being processed, as specified
+ * by the supplied kernel context.  See rsGetArray0() for an explanation of the context.
+ *
+ * Returns 0 if the Array3 dimension is not present.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern uint32_t __attribute__((overloadable))
+    rsGetArray3(rs_kernel_context context);
+#endif
+
+/*
+ * rsGetDimArray0: Size of the Array0 dimension for the specified kernel context
+ *
+ * Returns the size of the Array0 dimension for the specified kernel context.
+ * See rsGetDimX() for an explanation of the context.
+ *
+ * Returns 0 if the Array0 dimension is not present.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern uint32_t __attribute__((overloadable))
+    rsGetDimArray0(rs_kernel_context context);
+#endif
+
+/*
+ * rsGetDimArray1: Size of the Array1 dimension for the specified kernel context
+ *
+ * Returns the size of the Array1 dimension for the specified kernel context.
+ * See rsGetDimX() for an explanation of the context.
+ *
+ * Returns 0 if the Array1 dimension is not present.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern uint32_t __attribute__((overloadable))
+    rsGetDimArray1(rs_kernel_context context);
+#endif
+
+/*
+ * rsGetDimArray2: Size of the Array2 dimension for the specified kernel context
+ *
+ * Returns the size of the Array2 dimension for the specified kernel context.
+ * See rsGetDimX() for an explanation of the context.
+ *
+ * Returns 0 if the Array2 dimension is not present.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern uint32_t __attribute__((overloadable))
+    rsGetDimArray2(rs_kernel_context context);
+#endif
+
+/*
+ * rsGetDimArray3: Size of the Array3 dimension for the specified kernel context
+ *
+ * Returns the size of the Array3 dimension for the specified kernel context.
+ * See rsGetDimX() for an explanation of the context.
+ *
+ * Returns 0 if the Array3 dimension is not present.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern uint32_t __attribute__((overloadable))
+    rsGetDimArray3(rs_kernel_context context);
+#endif
+
+/*
+ * rsGetDimHasFaces: Presence of more than one face for the specified kernel context
+ *
+ * If the kernel is iterating over a cubemap, this function returns true if there's more
+ * than one face present.  In all other cases, it returns false.  See rsGetDimX() for an
+ * explanation of the context.
+ *
+ * rsAllocationGetDimFaces() is similar but returns 0 or 1 instead of a bool.
+ *
+ * Returns: Returns true if more than one face is present, false otherwise.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern bool __attribute__((overloadable))
+    rsGetDimHasFaces(rs_kernel_context context);
+#endif
+
+/*
+ * rsGetDimLod: Number of levels of detail for the specified kernel context
+ *
+ * Returns the number of levels of detail for the specified kernel context.  This is useful
+ * for mipmaps.  See rsGetDimX() for an explanation of the context.
+ *
+ * Returns 0 if Level of Detail is not used.
+ *
+ * rsAllocationGetDimLOD() is similar but returns 0 or 1 instead of the actual
+ * number of levels.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern uint32_t __attribute__((overloadable))
+    rsGetDimLod(rs_kernel_context context);
+#endif
+
+/*
+ * rsGetDimX: Size of the X dimension for the specified kernel context
+ *
+ * Returns the size of the X dimension for the specified kernel context.
+ *
+ * The kernel context contains common characteristics of the allocations being iterated
+ * over and rarely used indices, like the Array0 index.
+ *
+ * You can access it by adding a special parameter named "context" of
+ * type rs_kernel_context to your kernel function.  E.g.
+ * int4 RS_KERNEL myKernel(int4 value, rs_kernel_context context) {
+ *   uint32_t size = rsGetDimX(context); //...
+ *
+ * To get the dimension of a specific allocation, use rsAllocationGetDimX().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern uint32_t __attribute__((overloadable))
+    rsGetDimX(rs_kernel_context context);
+#endif
+
+/*
+ * rsGetDimY: Size of the Y dimension for the specified kernel context
+ *
+ * Returns the size of the Y dimension for the specified kernel context.
+ * See rsGetDimX() for an explanation of the context.
+ *
+ * Returns 0 if the Y dimension is not present.
+ *
+ * To get the dimension of a specific allocation, use rsAllocationGetDimY().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern uint32_t __attribute__((overloadable))
+    rsGetDimY(rs_kernel_context context);
+#endif
+
+/*
+ * rsGetDimZ: Size of the Z dimension for the specified kernel context
+ *
+ * Returns the size of the Z dimension for the specified kernel context.
+ * See rsGetDimX() for an explanation of the context.
+ *
+ * Returns 0 if the Z dimension is not present.
+ *
+ * To get the dimension of a specific allocation, use rsAllocationGetDimZ().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern uint32_t __attribute__((overloadable))
+    rsGetDimZ(rs_kernel_context context);
+#endif
+
+/*
+ * rsGetFace: Coordinate of the Face for the specified kernel context
+ *
+ * Returns the face on which the cell being processed is found, as specified by the
+ * supplied kernel context.  See rsGetArray0() for an explanation of the context.
+ *
+ * Returns RS_ALLOCATION_CUBEMAP_FACE_POSITIVE_X if the face dimension is not
+ * present.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern rs_allocation_cubemap_face __attribute__((overloadable))
+    rsGetFace(rs_kernel_context context);
+#endif
+
+/*
+ * rsGetLod: Index in the Levels of Detail dimension for the specified kernel context
+ *
+ * Returns the index in the Levels of Detail dimension of the cell being processed,
+ * as specified by the supplied kernel context.  See rsGetArray0() for an explanation of
+ * the context.
+ *
+ * Returns 0 if the Levels of Detail dimension is not present.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern uint32_t __attribute__((overloadable))
+    rsGetLod(rs_kernel_context context);
+#endif
+
+#endif // RENDERSCRIPT_RS_FOR_EACH_RSH
diff --git a/24.0.3/include/rs_graphics.rsh b/24.0.3/include/rs_graphics.rsh
new file mode 100644
index 0000000..10ec640
--- /dev/null
+++ b/24.0.3/include/rs_graphics.rsh
@@ -0,0 +1,1522 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Don't edit this file!  It is auto-generated by frameworks/rs/api/generate.sh.
+
+/*
+ * rs_graphics.rsh: Graphics Functions and Types
+ *
+ * The graphics subsystem of RenderScript was removed at API level 23.
+ */
+
+#ifndef RENDERSCRIPT_RS_GRAPHICS_RSH
+#define RENDERSCRIPT_RS_GRAPHICS_RSH
+
+#ifdef __LP64__
+// TODO We need to fix some of the builds before enabling this error:
+// #error "RenderScript graphics is deprecated and not supported in 64bit mode."
+#endif
+
+// TODO we seem to assume order for the other headers too.
+#include "rs_object_types.rsh"
+
+/*
+ * rs_blend_src_func: Blend source function
+ *
+ * DEPRECATED.  Do not use.
+ *
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 16) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+typedef enum __attribute__((
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+deprecated
+#endif
+)) {
+    RS_BLEND_SRC_ZERO = 0,
+    RS_BLEND_SRC_ONE = 1,
+    RS_BLEND_SRC_DST_COLOR = 2,
+    RS_BLEND_SRC_ONE_MINUS_DST_COLOR = 3,
+    RS_BLEND_SRC_SRC_ALPHA = 4,
+    RS_BLEND_SRC_ONE_MINUS_SRC_ALPHA = 5,
+    RS_BLEND_SRC_DST_ALPHA = 6,
+    RS_BLEND_SRC_ONE_MINUS_DST_ALPHA = 7,
+    RS_BLEND_SRC_SRC_ALPHA_SATURATE = 8,
+    RS_BLEND_SRC_INVALID = 100
+} rs_blend_src_func;
+#endif
+#endif
+
+/*
+ * rs_blend_dst_func: Blend destination function
+ *
+ * DEPRECATED.  Do not use.
+ *
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 16) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+typedef enum __attribute__((
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+deprecated
+#endif
+)) {
+    RS_BLEND_DST_ZERO = 0,
+    RS_BLEND_DST_ONE = 1,
+    RS_BLEND_DST_SRC_COLOR = 2,
+    RS_BLEND_DST_ONE_MINUS_SRC_COLOR = 3,
+    RS_BLEND_DST_SRC_ALPHA = 4,
+    RS_BLEND_DST_ONE_MINUS_SRC_ALPHA = 5,
+    RS_BLEND_DST_DST_ALPHA = 6,
+    RS_BLEND_DST_ONE_MINUS_DST_ALPHA = 7,
+    RS_BLEND_DST_INVALID = 100
+} rs_blend_dst_func;
+#endif
+#endif
+
+/*
+ * rs_cull_mode: Culling mode
+ *
+ * DEPRECATED.  Do not use.
+ *
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 16) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+typedef enum __attribute__((
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+deprecated
+#endif
+)) {
+    RS_CULL_BACK = 0,
+    RS_CULL_FRONT = 1,
+    RS_CULL_NONE = 2,
+    RS_CULL_INVALID = 100
+} rs_cull_mode;
+#endif
+#endif
+
+/*
+ * rs_depth_func: Depth function
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Specifies conditional drawing depending on the comparison of the incoming
+ * depth to that found in the depth buffer.
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 16) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+typedef enum __attribute__((
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+deprecated
+#endif
+)) {
+    RS_DEPTH_FUNC_ALWAYS = 0, // Always drawn
+    RS_DEPTH_FUNC_LESS = 1, // Drawn if the incoming depth value is less than that in the depth buffer
+    RS_DEPTH_FUNC_LEQUAL = 2, // Drawn if the incoming depth value is less or equal to that in the depth buffer
+    RS_DEPTH_FUNC_GREATER = 3, // Drawn if the incoming depth value is greater than that in the depth buffer
+    RS_DEPTH_FUNC_GEQUAL = 4, // Drawn if the incoming depth value is greater or equal to that in the depth buffer
+    RS_DEPTH_FUNC_EQUAL = 5, // Drawn if the incoming depth value is equal to that in the depth buffer
+    RS_DEPTH_FUNC_NOTEQUAL = 6, // Drawn if the incoming depth value is not equal to that in the depth buffer
+    RS_DEPTH_FUNC_INVALID = 100 // Invalid depth function
+} rs_depth_func;
+#endif
+#endif
+
+/*
+ * rs_primitive: How to interpret mesh vertex data
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Describes the way mesh vertex data is interpreted when rendering
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 16) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+typedef enum __attribute__((
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+deprecated
+#endif
+)) {
+    RS_PRIMITIVE_POINT = 0, // Vertex data will be rendered as a series of points
+    RS_PRIMITIVE_LINE = 1, // Vertex pairs will be rendered as lines
+    RS_PRIMITIVE_LINE_STRIP = 2, // Vertex data will be rendered as a connected line strip
+    RS_PRIMITIVE_TRIANGLE = 3, // Vertices will be rendered as individual triangles
+    RS_PRIMITIVE_TRIANGLE_STRIP = 4, // Vertices will be rendered as a connected triangle strip defined by the first three vertices with each additional triangle defined by a new vertex
+    RS_PRIMITIVE_TRIANGLE_FAN = 5, // Vertices will be rendered as a sequence of triangles that all share the first vertex as the origin
+    RS_PRIMITIVE_INVALID = 100 // Invalid primitive
+} rs_primitive;
+#endif
+#endif
+
+/*
+ * rs_font: Handle to a Font
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Opaque handle to a RenderScript font object.
+ * See: android.renderscript.Font
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+typedef struct rs_font _RS_OBJECT_DECL __attribute__((
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+deprecated
+#endif
+)) rs_font;
+#endif
+#endif
+
+/*
+ * rs_mesh: Handle to a Mesh
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Opaque handle to a RenderScript mesh object.
+ * See: android.renderscript.Mesh
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+typedef struct rs_mesh _RS_OBJECT_DECL __attribute__((
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+deprecated
+#endif
+)) rs_mesh;
+#endif
+#endif
+
+/*
+ * rs_program_fragment: Handle to a ProgramFragment
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Opaque handle to a RenderScript ProgramFragment object.
+ * See: android.renderscript.ProgramFragment
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+typedef struct rs_program_fragment _RS_OBJECT_DECL __attribute__((
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+deprecated
+#endif
+)) rs_program_fragment;
+#endif
+#endif
+
+/*
+ * rs_program_vertex: Handle to a ProgramVertex
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Opaque handle to a RenderScript ProgramVertex object.
+ * See: android.renderscript.ProgramVertex
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+typedef struct rs_program_vertex _RS_OBJECT_DECL __attribute__((
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+deprecated
+#endif
+)) rs_program_vertex;
+#endif
+#endif
+
+/*
+ * rs_program_raster: Handle to a ProgramRaster
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Opaque handle to a RenderScript ProgramRaster object.
+ * See: android.renderscript.ProgramRaster
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+typedef struct rs_program_raster _RS_OBJECT_DECL __attribute__((
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+deprecated
+#endif
+)) rs_program_raster;
+#endif
+#endif
+
+/*
+ * rs_program_store: Handle to a ProgramStore
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Opaque handle to a RenderScript ProgramStore object.
+ * See: android.renderscript.ProgramStore
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+typedef struct rs_program_store _RS_OBJECT_DECL __attribute__((
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+deprecated
+#endif
+)) rs_program_store;
+#endif
+#endif
+
+/*
+ * rsClearObject: Release an object
+ *
+ * Tells the run time that this handle will no longer be used to access the related
+ * object.  If this was the last handle to that object, resource recovery may happen.
+ *
+ * After calling this function, *dst will be set to an empty handle.  See rsIsObject().
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (RS_VERSION <= 22)
+extern void __attribute__((overloadable))
+    rsClearObject(rs_mesh* dst);
+#endif
+#endif
+
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (RS_VERSION <= 22)
+extern void __attribute__((overloadable))
+    rsClearObject(rs_program_fragment* dst);
+#endif
+#endif
+
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (RS_VERSION <= 22)
+extern void __attribute__((overloadable))
+    rsClearObject(rs_program_vertex* dst);
+#endif
+#endif
+
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (RS_VERSION <= 22)
+extern void __attribute__((overloadable))
+    rsClearObject(rs_program_raster* dst);
+#endif
+#endif
+
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (RS_VERSION <= 22)
+extern void __attribute__((overloadable))
+    rsClearObject(rs_program_store* dst);
+#endif
+#endif
+
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (RS_VERSION <= 22)
+extern void __attribute__((overloadable))
+    rsClearObject(rs_font* dst);
+#endif
+#endif
+
+/*
+ * rsIsObject: Check for an empty handle
+ *
+ * Returns true if the handle contains a non-null reference.
+ *
+ * This function does not validate that the internal pointer used in the handle
+ * points to an actual valid object; it only checks for null.
+ *
+ * This function can be used to check the Element returned by rsElementGetSubElement()
+ * or see if rsClearObject() has been called on a handle.
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (RS_VERSION <= 22)
+extern bool __attribute__((overloadable))
+    rsIsObject(rs_mesh v);
+#endif
+#endif
+
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (RS_VERSION <= 22)
+extern bool __attribute__((overloadable))
+    rsIsObject(rs_program_fragment v);
+#endif
+#endif
+
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (RS_VERSION <= 22)
+extern bool __attribute__((overloadable))
+    rsIsObject(rs_program_vertex v);
+#endif
+#endif
+
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (RS_VERSION <= 22)
+extern bool __attribute__((overloadable))
+    rsIsObject(rs_program_raster v);
+#endif
+#endif
+
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (RS_VERSION <= 22)
+extern bool __attribute__((overloadable))
+    rsIsObject(rs_program_store v);
+#endif
+#endif
+
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (RS_VERSION <= 22)
+extern bool __attribute__((overloadable))
+    rsIsObject(rs_font v);
+#endif
+#endif
+
+/*
+ * rsSetObject: For internal use.
+ *
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (RS_VERSION <= 22)
+extern void __attribute__((overloadable))
+    rsSetObject(rs_mesh* dst, rs_mesh src);
+#endif
+#endif
+
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (RS_VERSION <= 22)
+extern void __attribute__((overloadable))
+    rsSetObject(rs_program_fragment* dst, rs_program_fragment src);
+#endif
+#endif
+
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (RS_VERSION <= 22)
+extern void __attribute__((overloadable))
+    rsSetObject(rs_program_vertex* dst, rs_program_vertex src);
+#endif
+#endif
+
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (RS_VERSION <= 22)
+extern void __attribute__((overloadable))
+    rsSetObject(rs_program_raster* dst, rs_program_raster src);
+#endif
+#endif
+
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (RS_VERSION <= 22)
+extern void __attribute__((overloadable))
+    rsSetObject(rs_program_store* dst, rs_program_store src);
+#endif
+#endif
+
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (RS_VERSION <= 22)
+extern void __attribute__((overloadable))
+    rsSetObject(rs_font* dst, rs_font src);
+#endif
+#endif
+
+/*
+ * rsgAllocationSyncAll: Sync the contents of an allocation
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Sync the contents of an allocation.
+ *
+ * If the source is specified, sync from the memory space specified by source.
+ *
+ * If the source is not specified, sync from its SCRIPT memory space to its HW
+ * memory spaces.
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgAllocationSyncAll(rs_allocation alloc);
+#endif
+#endif
+
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 14) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgAllocationSyncAll(rs_allocation alloc, rs_allocation_usage_type source);
+#endif
+#endif
+
+/*
+ * rsgBindColorTarget: Set the color target
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Set the color target used for all subsequent rendering calls
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 14) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgBindColorTarget(rs_allocation colorTarget, uint slot);
+#endif
+#endif
+
+/*
+ * rsgBindConstant: Bind a constant allocation
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Bind a new Allocation object to a ProgramFragment or ProgramVertex.
+ * The Allocation must be a valid constant input for the Program.
+ *
+ * Parameters:
+ *   ps: program fragment object
+ *   slot: index of the constant buffer on the program
+ *   c: constants to bind
+ *   pv: program vertex object
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgBindConstant(rs_program_fragment ps, uint slot, rs_allocation c);
+#endif
+#endif
+
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgBindConstant(rs_program_vertex pv, uint slot, rs_allocation c);
+#endif
+#endif
+
+/*
+ * rsgBindDepthTarget: Set the depth target
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Set the depth target used for all subsequent rendering calls
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 14) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgBindDepthTarget(rs_allocation depthTarget);
+#endif
+#endif
+
+/*
+ * rsgBindFont: Bind a font object
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Binds the font object to be used for all subsequent font rendering calls
+ *
+ * Parameters:
+ *   font: object to bind
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgBindFont(rs_font font);
+#endif
+#endif
+
+/*
+ * rsgBindProgramFragment: Bind a ProgramFragment
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Bind a new ProgramFragment to the rendering context.
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgBindProgramFragment(rs_program_fragment pf);
+#endif
+#endif
+
+/*
+ * rsgBindProgramRaster: Bind a ProgramRaster
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Bind a new ProgramRaster to the rendering context.
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgBindProgramRaster(rs_program_raster pr);
+#endif
+#endif
+
+/*
+ * rsgBindProgramStore: Bind a ProgramStore
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Bind a new ProgramStore to the rendering context.
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgBindProgramStore(rs_program_store ps);
+#endif
+#endif
+
+/*
+ * rsgBindProgramVertex: Bind a ProgramVertex
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Bind a new ProgramVertex to the rendering context.
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgBindProgramVertex(rs_program_vertex pv);
+#endif
+#endif
+
+/*
+ * rsgBindSampler: Bind a sampler
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Bind a new Sampler object to a ProgramFragment.  The sampler will
+ * operate on the texture bound at the matching slot.
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgBindSampler(rs_program_fragment fragment, uint slot, rs_sampler sampler);
+#endif
+#endif
+
+/*
+ * rsgBindTexture: Bind a texture allocation
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Bind a new Allocation object to a ProgramFragment.  The
+ * Allocation must be a valid texture for the Program.  The sampling
+ * of the texture will be controlled by the Sampler bound at the
+ * matching slot.
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgBindTexture(rs_program_fragment v, uint slot, rs_allocation alloc);
+#endif
+#endif
+
+/*
+ * rsgClearAllRenderTargets: Clear all color and depth targets
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Clear all color and depth targets and resume rendering into
+ * the framebuffer
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 14) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgClearAllRenderTargets(void);
+#endif
+#endif
+
+/*
+ * rsgClearColor: Clear the specified color from the surface
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Clears the rendering surface to the specified color.
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgClearColor(float r, float g, float b, float a);
+#endif
+#endif
+
+/*
+ * rsgClearColorTarget: Clear the color target
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Clear the previously set color target
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 14) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgClearColorTarget(uint slot);
+#endif
+#endif
+
+/*
+ * rsgClearDepth: Clear the depth surface
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Clears the depth surface to the specified value.
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgClearDepth(float value);
+#endif
+#endif
+
+/*
+ * rsgClearDepthTarget: Clear the depth target
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Clear the previously set depth target
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 14) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgClearDepthTarget(void);
+#endif
+#endif
+
+/*
+ * rsgDrawMesh: Draw a mesh
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Draw a mesh using the current context state.
+ *
+ * If primitiveIndex is specified, draw part of a mesh using the current context state.
+ *
+ * If start and len are also specified, draw the specified index range of that part of the mesh using the current context state.
+ *
+ * Otherwise the whole mesh is rendered.
+ *
+ * Parameters:
+ *   ism: mesh object to render
+ *   primitiveIndex: for meshes that contain multiple primitive groups, this parameter specifies the index of the group to draw.
+ *   start: starting index in the range
+ *   len: number of indices to draw
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgDrawMesh(rs_mesh ism);
+#endif
+#endif
+
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgDrawMesh(rs_mesh ism, uint primitiveIndex);
+#endif
+#endif
+
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgDrawMesh(rs_mesh ism, uint primitiveIndex, uint start, uint len);
+#endif
+#endif
+
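+/*
+ * Illustrative sketch (legacy graphics script; gMesh and gPF are hypothetical
+ * globals set from Java):
+ *
+ *   rs_mesh gMesh;
+ *   rs_program_fragment gPF;
+ *
+ *   int root() {
+ *       rsgClearColor(0.f, 0.f, 0.f, 1.f);
+ *       rsgBindProgramFragment(gPF);
+ *       rsgDrawMesh(gMesh);   // render the whole mesh
+ *       return 20;            // request a redraw in roughly 20 ms
+ *   }
+ */
+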
+/*
+ * rsgDrawQuad: Draw a quad
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Low performance utility function for drawing a simple quad.  Not intended for
+ * drawing large quantities of geometry.
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgDrawQuad(float x1, float y1, float z1, float x2, float y2, float z2, float x3, float y3,
+                float z3, float x4, float y4, float z4);
+#endif
+#endif
+
+/*
+ * rsgDrawQuadTexCoords: Draw a textured quad
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Low performance utility function for drawing a textured quad.  Not intended
+ * for drawing large quantities of geometry.
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgDrawQuadTexCoords(float x1, float y1, float z1, float u1, float v1, float x2, float y2,
+                         float z2, float u2, float v2, float x3, float y3, float z3, float u3,
+                         float v3, float x4, float y4, float z4, float u4, float v4);
+#endif
+#endif
+
+/*
+ * rsgDrawRect: Draw a rectangle
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Low performance utility function for drawing a simple rectangle.  Not
+ * intended for drawing large quantities of geometry.
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgDrawRect(float x1, float y1, float x2, float y2, float z);
+#endif
+#endif
+
+/*
+ * rsgDrawSpriteScreenspace: Draw rectangles in screenspace
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Low performance function for drawing rectangles in screenspace.  This
+ * function uses the default passthrough ProgramVertex.  Any bound ProgramVertex
+ * is ignored.  This function has considerable overhead and should not be used
+ * for drawing in shipping applications.
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgDrawSpriteScreenspace(float x, float y, float z, float w, float h);
+#endif
+#endif
+
+/*
+ * rsgDrawText: Draw a text string
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Draws text given a string and location
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgDrawText(const char* text, int x, int y);
+#endif
+#endif
+
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgDrawText(rs_allocation alloc, int x, int y);
+#endif
+#endif
+
+/*
+ * rsgFinish: End rendering commands
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Force RenderScript to finish all rendering commands
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 14) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern uint __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgFinish(void);
+#endif
+#endif
+
+/*
+ * rsgFontColor: Set the font color
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Sets the font color for all subsequent rendering calls
+ *
+ * Parameters:
+ *   r: red component
+ *   g: green component
+ *   b: blue component
+ *   a: alpha component
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgFontColor(float r, float g, float b, float a);
+#endif
+#endif
+
+/*
+ * rsgGetHeight: Get the surface height
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Get the height of the current rendering surface.
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern uint __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgGetHeight(void);
+#endif
+#endif
+
+/*
+ * rsgGetWidth: Get the surface width
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Get the width of the current rendering surface.
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern uint __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgGetWidth(void);
+#endif
+#endif
+
+/*
+ * rsgMeasureText: Get the bounding box for a text string
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Returns the bounding box of the text relative to (0, 0).
+ * Any of left, right, top, or bottom may be NULL.
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgMeasureText(const char* text, int* left, int* right, int* top, int* bottom);
+#endif
+#endif
+
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgMeasureText(rs_allocation alloc, int* left, int* right, int* top, int* bottom);
+#endif
+#endif
+
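+/*
+ * Illustrative sketch (assumes a font has already been bound with
+ * rsgBindFont(); the label text and layout are hypothetical):
+ *
+ *   void drawCenteredLabel(const char* label, int y) {
+ *       int left, right, top, bottom;
+ *       rsgMeasureText(label, &left, &right, &top, &bottom);
+ *       int width = right - left;
+ *       rsgDrawText(label, ((int)rsgGetWidth() - width) / 2, y);
+ *   }
+ */
+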
+/*
+ * rsgMeshComputeBoundingBox: Compute a bounding box
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Computes an axis aligned bounding box of a mesh object
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgMeshComputeBoundingBox(rs_mesh mesh, float* minX, float* minY, float* minZ, float* maxX,
+                              float* maxY, float* maxZ);
+#endif
+#endif
+
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+static inline void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgMeshComputeBoundingBox(rs_mesh mesh, float3* bBoxMin, float3* bBoxMax) {
+    float x1, y1, z1, x2, y2, z2;
+    rsgMeshComputeBoundingBox(mesh, &x1, &y1, &z1, &x2, &y2, &z2);
+    bBoxMin->x = x1;
+    bBoxMin->y = y1;
+    bBoxMin->z = z1;
+    bBoxMax->x = x2;
+    bBoxMax->y = y2;
+    bBoxMax->z = z2;
+}
+#endif
+#endif
+
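+/*
+ * Illustrative sketch using the float3 overload above (gMesh is a
+ * hypothetical global):
+ *
+ *   rs_mesh gMesh;
+ *
+ *   float3 meshCenter() {
+ *       float3 bbMin, bbMax;
+ *       rsgMeshComputeBoundingBox(gMesh, &bbMin, &bbMax);
+ *       return (bbMin + bbMax) * 0.5f;
+ *   }
+ */
+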
+/*
+ * rsgMeshGetIndexAllocation: Return an allocation containing index data
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Returns an allocation containing index data or a null
+ * allocation if only the primitive is specified
+ *
+ * Parameters:
+ *   m: mesh to get data from
+ *   index: index of the index allocation
+ *
+ * Returns: allocation containing index data
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 16) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern rs_allocation __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgMeshGetIndexAllocation(rs_mesh m, uint32_t index);
+#endif
+#endif
+
+/*
+ * rsgMeshGetPrimitive: Return the primitive
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Returns the primitive describing how a part of the mesh is
+ * rendered
+ *
+ * Parameters:
+ *   m: mesh to get data from
+ *   index: index of the primitive
+ *
+ * Returns: primitive describing how the mesh is rendered
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 16) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern rs_primitive __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgMeshGetPrimitive(rs_mesh m, uint32_t index);
+#endif
+#endif
+
+/*
+ * rsgMeshGetPrimitiveCount: Return the number of index sets
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Meshes can have multiple index sets; this function returns
+ * how many the mesh contains.
+ *
+ * Parameters:
+ *   m: mesh to get data from
+ *
+ * Returns: number of primitive groups in the mesh. This would include simple primitives as well as allocations containing index data
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 16) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern uint32_t __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgMeshGetPrimitiveCount(rs_mesh m);
+#endif
+#endif
+
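+/*
+ * Illustrative sketch: drawing each primitive group of a mesh separately
+ * (gMesh is a hypothetical global):
+ *
+ *   uint32_t count = rsgMeshGetPrimitiveCount(gMesh);
+ *   for (uint32_t i = 0; i < count; i++) {
+ *       rsgDrawMesh(gMesh, i);
+ *   }
+ */
+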
+/*
+ * rsgMeshGetVertexAllocation: Return a vertex allocation
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Returns an allocation that is part of the mesh and contains
+ * vertex data, e.g. positions, normals, texcoords
+ *
+ * Parameters:
+ *   m: mesh to get data from
+ *   index: index of the vertex allocation
+ *
+ * Returns: allocation containing vertex data
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 16) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern rs_allocation __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgMeshGetVertexAllocation(rs_mesh m, uint32_t index);
+#endif
+#endif
+
+/*
+ * rsgMeshGetVertexAllocationCount: Return the number of vertex allocations
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Returns the number of allocations in the mesh that contain
+ * vertex data
+ *
+ * Parameters:
+ *   m: mesh to get data from
+ *
+ * Returns: number of allocations in the mesh that contain vertex data
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 16) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern uint32_t __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgMeshGetVertexAllocationCount(rs_mesh m);
+#endif
+#endif
+
+/*
+ * rsgProgramFragmentConstantColor: Set the constant color for a fixed function emulation program
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Set the constant color for a fixed function emulation program.
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgProgramFragmentConstantColor(rs_program_fragment pf, float r, float g, float b, float a);
+#endif
+#endif
+
+/*
+ * rsgProgramVertexGetProjectionMatrix: Get the projection matrix for a fixed function vertex program
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Get the projection matrix for a currently bound fixed function
+ * vertex program. Calling this function with a custom vertex shader
+ * would result in an error.
+ *
+ * Parameters:
+ *   proj: matrix to store the current projection matrix into
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgProgramVertexGetProjectionMatrix(rs_matrix4x4* proj);
+#endif
+#endif
+
+/*
+ * rsgProgramVertexLoadModelMatrix: Load the model matrix for a bound fixed function vertex program
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Load the model matrix for a currently bound fixed function
+ * vertex program. Calling this function with a custom vertex shader
+ * would result in an error.
+ *
+ * Parameters:
+ *   model: model matrix
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgProgramVertexLoadModelMatrix(const rs_matrix4x4* model);
+#endif
+#endif
+
+/*
+ * rsgProgramVertexLoadProjectionMatrix: Load the projection matrix for a bound fixed function vertex program
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Load the projection matrix for a currently bound fixed function
+ * vertex program. Calling this function with a custom vertex shader
+ * would result in an error.
+ *
+ * Parameters:
+ *   proj: projection matrix
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgProgramVertexLoadProjectionMatrix(const rs_matrix4x4* proj);
+#endif
+#endif
+
+/*
+ * rsgProgramVertexLoadTextureMatrix: Load the texture matrix for a bound fixed function vertex program
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Load the texture matrix for a currently bound fixed function
+ * vertex program. Calling this function with a custom vertex shader
+ * would result in an error.
+ *
+ * Parameters:
+ *   tex: texture matrix
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgProgramVertexLoadTextureMatrix(const rs_matrix4x4* tex);
+#endif
+#endif
+
+/*
+ * rsgProgramRasterGetCullMode: Get program raster cull mode
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Get program raster cull mode
+ *
+ * Parameters:
+ *   pr: program raster to query
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 16) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern rs_cull_mode __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgProgramRasterGetCullMode(rs_program_raster pr);
+#endif
+#endif
+
+/*
+ * rsgProgramRasterIsPointSpriteEnabled: Get program raster point sprite state
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Get program raster point sprite state
+ *
+ * Parameters:
+ *   pr: program raster to query
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 16) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern bool __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgProgramRasterIsPointSpriteEnabled(rs_program_raster pr);
+#endif
+#endif
+
+/*
+ * rsgProgramStoreGetBlendDstFunc: Get program store blend destination function
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Get program store blend destination function
+ *
+ * Parameters:
+ *   ps: program store to query
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 16) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern rs_blend_dst_func __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgProgramStoreGetBlendDstFunc(rs_program_store ps);
+#endif
+#endif
+
+/*
+ * rsgProgramStoreGetBlendSrcFunc: Get program store blend source function
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Get program store blend source function
+ *
+ * Parameters:
+ *   ps: program store to query
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 16) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern rs_blend_src_func __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgProgramStoreGetBlendSrcFunc(rs_program_store ps);
+#endif
+#endif
+
+/*
+ * rsgProgramStoreGetDepthFunc: Get program store depth function
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Get program store depth function
+ *
+ * Parameters:
+ *   ps: program store to query
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 16) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern rs_depth_func __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgProgramStoreGetDepthFunc(rs_program_store ps);
+#endif
+#endif
+
+/*
+ * rsgProgramStoreIsColorMaskAlphaEnabled: Get program store alpha component color mask
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Get program store alpha component color mask
+ *
+ * Parameters:
+ *   ps: program store to query
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 16) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern bool __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgProgramStoreIsColorMaskAlphaEnabled(rs_program_store ps);
+#endif
+#endif
+
+/*
+ * rsgProgramStoreIsColorMaskBlueEnabled: Get program store blue component color mask
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Get program store blue component color mask
+ *
+ * Parameters:
+ *   ps: program store to query
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 16) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern bool __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgProgramStoreIsColorMaskBlueEnabled(rs_program_store ps);
+#endif
+#endif
+
+/*
+ * rsgProgramStoreIsColorMaskGreenEnabled: Get program store green component color mask
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Get program store green component color mask
+ *
+ * Parameters:
+ *   ps: program store to query
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 16) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern bool __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgProgramStoreIsColorMaskGreenEnabled(rs_program_store ps);
+#endif
+#endif
+
+/*
+ * rsgProgramStoreIsColorMaskRedEnabled: Get program store red component color mask
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Get program store red component color mask
+ *
+ * Parameters:
+ *   ps: program store to query
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 16) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern bool __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgProgramStoreIsColorMaskRedEnabled(rs_program_store ps);
+#endif
+#endif
+
+/*
+ * rsgProgramStoreIsDepthMaskEnabled: Get program store depth mask
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Get program store depth mask
+ *
+ * Parameters:
+ *   ps: program store to query
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 16) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern bool __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgProgramStoreIsDepthMaskEnabled(rs_program_store ps);
+#endif
+#endif
+
+/*
+ * rsgProgramStoreIsDitherEnabled: Get program store dither state
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Get program store dither state
+ *
+ * Parameters:
+ *   ps: program store to query
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 16) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern bool __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgProgramStoreIsDitherEnabled(rs_program_store ps);
+#endif
+#endif
+
+#endif // RENDERSCRIPT_RS_GRAPHICS_RSH
diff --git a/24.0.3/include/rs_io.rsh b/24.0.3/include/rs_io.rsh
new file mode 100644
index 0000000..2ffbe4b
--- /dev/null
+++ b/24.0.3/include/rs_io.rsh
@@ -0,0 +1,107 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Don't edit this file!  It is auto-generated by frameworks/rs/api/generate.sh.
+
+/*
+ * rs_io.rsh: Input/Output Functions
+ *
+ * These functions are used to:
+ * - Send information to the Java client, and
+ * - Send the processed allocation or receive the next allocation to process.
+ */
+
+#ifndef RENDERSCRIPT_RS_IO_RSH
+#define RENDERSCRIPT_RS_IO_RSH
+
+/*
+ * rsAllocationIoReceive: Receive new content from the queue
+ *
+ * Receive a new set of contents from the queue.
+ *
+ * This function should not be called from inside a kernel, or from any function
+ * that may be called directly or indirectly from a kernel. Doing so would cause a
+ * runtime error.
+ *
+ * Parameters:
+ *   a: Allocation to work on.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+extern void __attribute__((overloadable))
+    rsAllocationIoReceive(rs_allocation a);
+#endif
+
+/*
+ * rsAllocationIoSend: Send new content to the queue
+ *
+ * Send the contents of the Allocation to the queue.
+ *
+ * This function should not be called from inside a kernel, or from any function
+ * that may be called directly or indirectly from a kernel. Doing so would cause a
+ * runtime error.
+ *
+ * Parameters:
+ *   a: Allocation to work on.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+extern void __attribute__((overloadable))
+    rsAllocationIoSend(rs_allocation a);
+#endif
+
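+/*
+ * Illustrative sketch: an invoked function (not a kernel) that pulls the next
+ * buffer from an input queue, processes it, and pushes the result to an
+ * output queue.  gIn, gOut, and the processing step are hypothetical.
+ *
+ *   rs_allocation gIn;    // e.g. fed by a Surface producer
+ *   rs_allocation gOut;   // e.g. consumed by a Surface
+ *
+ *   void processNextFrame() {
+ *       rsAllocationIoReceive(gIn);
+ *       // ... launch a kernel that reads gIn and writes gOut ...
+ *       rsAllocationIoSend(gOut);
+ *   }
+ */
+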
+/*
+ * rsSendToClient: Send a message to the client, non-blocking
+ *
+ * Sends a message back to the client.  This call does not block.
+ * It returns true if the message was sent and false if the
+ * message queue is full.
+ *
+ * A message ID is required.  The data payload is optional.
+ *
+ * See RenderScript.RSMessageHandler.
+ *
+ * Parameters:
+ *   data: Application specific data.
+ *   len: Length of the data, in bytes.
+ */
+extern bool __attribute__((overloadable))
+    rsSendToClient(int cmdID);
+
+extern bool __attribute__((overloadable))
+    rsSendToClient(int cmdID, const void* data, uint len);
+
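+/*
+ * Illustrative sketch: reporting progress to the Java client without
+ * blocking.  The message ID and payload layout are hypothetical and must
+ * match what the RSMessageHandler on the Java side expects.
+ *
+ *   #define MSG_PROGRESS 100
+ *
+ *   void reportProgress(int percent) {
+ *       if (!rsSendToClient(MSG_PROGRESS, &percent, sizeof(percent))) {
+ *           // message queue was full; the update was dropped
+ *       }
+ *   }
+ */
+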
+/*
+ * rsSendToClientBlocking: Send a message to the client, blocking
+ *
+ * Sends a message back to the client.  This function will block
+ * until there is room on the message queue for this message.
+ * This function may return before the message has been delivered and
+ * processed by the client.
+ *
+ * A message ID is required.  The data payload is optional.
+ *
+ * See RenderScript.RSMessageHandler.
+ *
+ * Parameters:
+ *   data: Application specific data.
+ *   len: Length of the data, in bytes.
+ */
+extern void __attribute__((overloadable))
+    rsSendToClientBlocking(int cmdID);
+
+extern void __attribute__((overloadable))
+    rsSendToClientBlocking(int cmdID, const void* data, uint len);
+
+#endif // RENDERSCRIPT_RS_IO_RSH
diff --git a/24.0.3/include/rs_math.rsh b/24.0.3/include/rs_math.rsh
new file mode 100644
index 0000000..3d034d0
--- /dev/null
+++ b/24.0.3/include/rs_math.rsh
@@ -0,0 +1,6550 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Don't edit this file!  It is auto-generated by frameworks/rs/api/generate.sh.
+
+/*
+ * rs_math.rsh: Mathematical Constants and Functions
+ *
+ * The mathematical functions below can be applied to scalars and vectors.   When applied
+ * to vectors, the returned value is a vector of the function applied to each entry of the input.
+ *
+ * For example:
+ * float3 a, b;
+ * // The following call sets
+ * //   a.x to sin(b.x),
+ * //   a.y to sin(b.y), and
+ * //   a.z to sin(b.z).
+ * a = sin(b);
+ *
+ *
+ * See Vector Math Functions for functions like distance() and length() that interpret
+ * instead the input as a single vector in n-dimensional space.
+ *
+ * The precision of the mathematical operations on 32 bit floats is affected by the pragmas
+ * rs_fp_relaxed and rs_fp_full.  Under rs_fp_relaxed, subnormal values may be flushed to zero and
+ * rounding may be done towards zero.  In comparison, rs_fp_full requires correct handling of
+ * subnormal values, i.e. smaller than 1.17549435e-38f.  rs_fp_full also requires round to nearest
+ * with ties to even.
+ *
+ * Different precision/speed tradeoffs can be achieved by using variants of the common math
+ * functions.  Functions with a name starting with
+ * - native_: May have custom hardware implementations with weaker precision.  Additionally,
+ *   subnormal values may be flushed to zero, rounding towards zero may be used, and NaN and
+ *   infinity input may not be handled correctly.
+ * - half_: May perform internal computations using 16 bit floats.  Additionally, subnormal
+ *   values may be flushed to zero, and rounding towards zero may be used.
+ *
+ */
+
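+/*
+ * Illustrative sketch of the precision/speed variants described above (actual
+ * precision is implementation dependent):
+ *
+ *   float x = 2.f;
+ *   float a = sqrt(x);         // full precision
+ *   float b = half_sqrt(x);    // may compute internally with 16 bit floats
+ *   float c = native_sqrt(x);  // may use a faster, less precise hardware path
+ */
+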
+#ifndef RENDERSCRIPT_RS_MATH_RSH
+#define RENDERSCRIPT_RS_MATH_RSH
+
+/*
+ * M_1_PI: 1 / pi, as a 32 bit float
+ *
+ * The inverse of pi, as a 32 bit float.
+ */
+#define M_1_PI 0.318309886183790671537767526745028724f
+
+/*
+ * M_2_PI: 2 / pi, as a 32 bit float
+ *
+ * 2 divided by pi, as a 32 bit float.
+ */
+#define M_2_PI 0.636619772367581343075535053490057448f
+
+/*
+ * M_2_PIl: 2 / pi, as a 32 bit float
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * 2 divided by pi, as a 32 bit float.
+ */
+#define M_2_PIl 0.636619772367581343075535053490057448f
+
+/*
+ * M_2_SQRTPI: 2 / sqrt(pi), as a 32 bit float
+ *
+ * 2 divided by the square root of pi, as a 32 bit float.
+ */
+#define M_2_SQRTPI 1.128379167095512573896158903121545172f
+
+/*
+ * M_E: e, as a 32 bit float
+ *
+ * The number e, the base of the natural logarithm, as a 32 bit float.
+ */
+#define M_E 2.718281828459045235360287471352662498f
+
+/*
+ * M_LN10: log_e(10), as a 32 bit float
+ *
+ * The natural logarithm of 10, as a 32 bit float.
+ */
+#define M_LN10 2.302585092994045684017991454684364208f
+
+/*
+ * M_LN2: log_e(2), as a 32 bit float
+ *
+ * The natural logarithm of 2, as a 32 bit float.
+ */
+#define M_LN2 0.693147180559945309417232121458176568f
+
+/*
+ * M_LOG10E: log_10(e), as a 32 bit float
+ *
+ * The logarithm base 10 of e, as a 32 bit float.
+ */
+#define M_LOG10E 0.434294481903251827651128918916605082f
+
+/*
+ * M_LOG2E: log_2(e), as a 32 bit float
+ *
+ * The logarithm base 2 of e, as a 32 bit float.
+ */
+#define M_LOG2E 1.442695040888963407359924681001892137f
+
+/*
+ * M_PI: pi, as a 32 bit float
+ *
+ * The constant pi, as a 32 bit float.
+ */
+#define M_PI 3.141592653589793238462643383279502884f
+
+/*
+ * M_PI_2: pi / 2, as a 32 bit float
+ *
+ * Pi divided by 2, as a 32 bit float.
+ */
+#define M_PI_2 1.570796326794896619231321691639751442f
+
+/*
+ * M_PI_4: pi / 4, as a 32 bit float
+ *
+ * Pi divided by 4, as a 32 bit float.
+ */
+#define M_PI_4 0.785398163397448309615660845819875721f
+
+/*
+ * M_SQRT1_2: 1 / sqrt(2), as a 32 bit float
+ *
+ * The inverse of the square root of 2, as a 32 bit float.
+ */
+#define M_SQRT1_2 0.707106781186547524400844362104849039f
+
+/*
+ * M_SQRT2: sqrt(2), as a 32 bit float
+ *
+ * The square root of 2, as a 32 bit float.
+ */
+#define M_SQRT2 1.414213562373095048801688724209698079f
+
+/*
+ * abs: Absolute value of an integer
+ *
+ * Returns the absolute value of an integer.
+ *
+ * For floats, use fabs().
+ */
+extern uchar __attribute__((const, overloadable))
+    abs(char v);
+
+extern uchar2 __attribute__((const, overloadable))
+    abs(char2 v);
+
+extern uchar3 __attribute__((const, overloadable))
+    abs(char3 v);
+
+extern uchar4 __attribute__((const, overloadable))
+    abs(char4 v);
+
+extern ushort __attribute__((const, overloadable))
+    abs(short v);
+
+extern ushort2 __attribute__((const, overloadable))
+    abs(short2 v);
+
+extern ushort3 __attribute__((const, overloadable))
+    abs(short3 v);
+
+extern ushort4 __attribute__((const, overloadable))
+    abs(short4 v);
+
+extern uint __attribute__((const, overloadable))
+    abs(int v);
+
+extern uint2 __attribute__((const, overloadable))
+    abs(int2 v);
+
+extern uint3 __attribute__((const, overloadable))
+    abs(int3 v);
+
+extern uint4 __attribute__((const, overloadable))
+    abs(int4 v);
+
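+/*
+ * Illustrative sketch: the return type is the unsigned counterpart of the
+ * argument type.
+ *
+ *   char  c = -5;
+ *   uchar u = abs(c);   // u == 5
+ */
+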
+/*
+ * acos: Inverse cosine
+ *
+ * Returns the inverse cosine, in radians.
+ *
+ * See also native_acos().
+ */
+extern float __attribute__((const, overloadable))
+    acos(float v);
+
+extern float2 __attribute__((const, overloadable))
+    acos(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    acos(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    acos(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    acos(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    acos(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    acos(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    acos(half4 v);
+#endif
+
+/*
+ * acosh: Inverse hyperbolic cosine
+ *
+ * Returns the inverse hyperbolic cosine, in radians.
+ *
+ * See also native_acosh().
+ */
+extern float __attribute__((const, overloadable))
+    acosh(float v);
+
+extern float2 __attribute__((const, overloadable))
+    acosh(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    acosh(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    acosh(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    acosh(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    acosh(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    acosh(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    acosh(half4 v);
+#endif
+
+/*
+ * acospi: Inverse cosine divided by pi
+ *
+ * Returns the inverse cosine in radians, divided by pi.
+ *
+ * To get an inverse cosine measured in degrees, use acospi(a) * 180.f.
+ *
+ * See also native_acospi().
+ */
+extern float __attribute__((const, overloadable))
+    acospi(float v);
+
+extern float2 __attribute__((const, overloadable))
+    acospi(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    acospi(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    acospi(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    acospi(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    acospi(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    acospi(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    acospi(half4 v);
+#endif
+
+/*
+ * asin: Inverse sine
+ *
+ * Returns the inverse sine, in radians.
+ *
+ * See also native_asin().
+ */
+extern float __attribute__((const, overloadable))
+    asin(float v);
+
+extern float2 __attribute__((const, overloadable))
+    asin(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    asin(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    asin(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    asin(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    asin(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    asin(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    asin(half4 v);
+#endif
+
+/*
+ * asinh: Inverse hyperbolic sine
+ *
+ * Returns the inverse hyperbolic sine, in radians.
+ *
+ * See also native_asinh().
+ */
+extern float __attribute__((const, overloadable))
+    asinh(float v);
+
+extern float2 __attribute__((const, overloadable))
+    asinh(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    asinh(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    asinh(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    asinh(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    asinh(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    asinh(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    asinh(half4 v);
+#endif
+
+/*
+ * asinpi: Inverse sine divided by pi
+ *
+ * Returns the inverse sine in radians, divided by pi.
+ *
+ * To get an inverse sine measured in degrees, use asinpi(a) * 180.f.
+ *
+ * See also native_asinpi().
+ */
+extern float __attribute__((const, overloadable))
+    asinpi(float v);
+
+extern float2 __attribute__((const, overloadable))
+    asinpi(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    asinpi(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    asinpi(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    asinpi(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    asinpi(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    asinpi(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    asinpi(half4 v);
+#endif
+
+/*
+ * atan: Inverse tangent
+ *
+ * Returns the inverse tangent, in radians.
+ *
+ * See also native_atan().
+ */
+extern float __attribute__((const, overloadable))
+    atan(float v);
+
+extern float2 __attribute__((const, overloadable))
+    atan(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    atan(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    atan(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    atan(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    atan(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    atan(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    atan(half4 v);
+#endif
+
+/*
+ * atan2: Inverse tangent of a ratio
+ *
+ * Returns the inverse tangent of (numerator / denominator), in radians.
+ *
+ * See also native_atan2().
+ *
+ * Parameters:
+ *   numerator: Numerator.
+ *   denominator: Denominator.  Can be 0.
+ */
+extern float __attribute__((const, overloadable))
+    atan2(float numerator, float denominator);
+
+extern float2 __attribute__((const, overloadable))
+    atan2(float2 numerator, float2 denominator);
+
+extern float3 __attribute__((const, overloadable))
+    atan2(float3 numerator, float3 denominator);
+
+extern float4 __attribute__((const, overloadable))
+    atan2(float4 numerator, float4 denominator);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    atan2(half numerator, half denominator);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    atan2(half2 numerator, half2 denominator);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    atan2(half3 numerator, half3 denominator);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    atan2(half4 numerator, half4 denominator);
+#endif
+
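+/*
+ * Illustrative sketch: computing the angle of a 2D point.  Passing the
+ * numerator and denominator separately lets atan2() handle a zero
+ * denominator (points on the y axis) without dividing by zero.
+ *
+ *   float angleOf(float2 p) {
+ *       return atan2(p.y, p.x);   // radians in [-pi, pi]
+ *   }
+ */
+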
+/*
+ * atan2pi: Inverse tangent of a ratio, divided by pi
+ *
+ * Returns the inverse tangent of (numerator / denominator), in radians, divided by pi.
+ *
+ * To get an inverse tangent measured in degrees, use atan2pi(n, d) * 180.f.
+ *
+ * See also native_atan2pi().
+ *
+ * Parameters:
+ *   numerator: Numerator.
+ *   denominator: Denominator.  Can be 0.
+ */
+extern float __attribute__((const, overloadable))
+    atan2pi(float numerator, float denominator);
+
+extern float2 __attribute__((const, overloadable))
+    atan2pi(float2 numerator, float2 denominator);
+
+extern float3 __attribute__((const, overloadable))
+    atan2pi(float3 numerator, float3 denominator);
+
+extern float4 __attribute__((const, overloadable))
+    atan2pi(float4 numerator, float4 denominator);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    atan2pi(half numerator, half denominator);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    atan2pi(half2 numerator, half2 denominator);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    atan2pi(half3 numerator, half3 denominator);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    atan2pi(half4 numerator, half4 denominator);
+#endif
+
+/*
+ * atanh: Inverse hyperbolic tangent
+ *
+ * Returns the inverse hyperbolic tangent, in radians.
+ *
+ * See also native_atanh().
+ */
+extern float __attribute__((const, overloadable))
+    atanh(float v);
+
+extern float2 __attribute__((const, overloadable))
+    atanh(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    atanh(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    atanh(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    atanh(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    atanh(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    atanh(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    atanh(half4 v);
+#endif
+
+/*
+ * atanpi: Inverse tangent divided by pi
+ *
+ * Returns the inverse tangent in radians, divided by pi.
+ *
+ * To get an inverse tangent measured in degrees, use atanpi(a) * 180.f.
+ *
+ * See also native_atanpi().
+ */
+extern float __attribute__((const, overloadable))
+    atanpi(float v);
+
+extern float2 __attribute__((const, overloadable))
+    atanpi(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    atanpi(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    atanpi(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    atanpi(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    atanpi(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    atanpi(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    atanpi(half4 v);
+#endif
+
+/*
+ * cbrt: Cube root
+ *
+ * Returns the cube root.
+ *
+ * See also native_cbrt().
+ */
+extern float __attribute__((const, overloadable))
+    cbrt(float v);
+
+extern float2 __attribute__((const, overloadable))
+    cbrt(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    cbrt(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    cbrt(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    cbrt(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    cbrt(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    cbrt(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    cbrt(half4 v);
+#endif
+
+/*
+ * ceil: Smallest integer not less than a value
+ *
+ * Returns the smallest integer not less than a value.
+ *
+ * For example, ceil(1.2f) returns 2.f, and ceil(-1.2f) returns -1.f.
+ *
+ * See also floor().
+ */
+extern float __attribute__((const, overloadable))
+    ceil(float v);
+
+extern float2 __attribute__((const, overloadable))
+    ceil(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    ceil(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    ceil(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    ceil(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    ceil(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    ceil(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    ceil(half4 v);
+#endif
+
+/*
+ * clamp: Restrain a value to a range
+ *
+ * Clamps a value to a specified high and low bound.  clamp() returns min_value
+ * if value < min_value, max_value if value > max_value, otherwise value.
+ *
+ * There are two variants of clamp: one where the min and max are scalars applied
+ * to all entries of the value, the other where the min and max are also vectors.
+ *
+ * If min_value is greater than max_value, the results are undefined.
+ *
+ * Parameters:
+ *   value: Value to be clamped.
+ *   min_value: Lower bound, a scalar or matching vector.
+ *   max_value: High bound, must match the type of min_value.
+ */
+extern float __attribute__((const, overloadable))
+    clamp(float value, float min_value, float max_value);
+
+extern float2 __attribute__((const, overloadable))
+    clamp(float2 value, float2 min_value, float2 max_value);
+
+extern float3 __attribute__((const, overloadable))
+    clamp(float3 value, float3 min_value, float3 max_value);
+
+extern float4 __attribute__((const, overloadable))
+    clamp(float4 value, float4 min_value, float4 max_value);
+
+extern float2 __attribute__((const, overloadable))
+    clamp(float2 value, float min_value, float max_value);
+
+extern float3 __attribute__((const, overloadable))
+    clamp(float3 value, float min_value, float max_value);
+
+extern float4 __attribute__((const, overloadable))
+    clamp(float4 value, float min_value, float max_value);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern char __attribute__((const, overloadable))
+    clamp(char value, char min_value, char max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern char2 __attribute__((const, overloadable))
+    clamp(char2 value, char2 min_value, char2 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern char3 __attribute__((const, overloadable))
+    clamp(char3 value, char3 min_value, char3 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern char4 __attribute__((const, overloadable))
+    clamp(char4 value, char4 min_value, char4 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern uchar __attribute__((const, overloadable))
+    clamp(uchar value, uchar min_value, uchar max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern uchar2 __attribute__((const, overloadable))
+    clamp(uchar2 value, uchar2 min_value, uchar2 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern uchar3 __attribute__((const, overloadable))
+    clamp(uchar3 value, uchar3 min_value, uchar3 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern uchar4 __attribute__((const, overloadable))
+    clamp(uchar4 value, uchar4 min_value, uchar4 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern short __attribute__((const, overloadable))
+    clamp(short value, short min_value, short max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern short2 __attribute__((const, overloadable))
+    clamp(short2 value, short2 min_value, short2 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern short3 __attribute__((const, overloadable))
+    clamp(short3 value, short3 min_value, short3 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern short4 __attribute__((const, overloadable))
+    clamp(short4 value, short4 min_value, short4 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern ushort __attribute__((const, overloadable))
+    clamp(ushort value, ushort min_value, ushort max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern ushort2 __attribute__((const, overloadable))
+    clamp(ushort2 value, ushort2 min_value, ushort2 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern ushort3 __attribute__((const, overloadable))
+    clamp(ushort3 value, ushort3 min_value, ushort3 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern ushort4 __attribute__((const, overloadable))
+    clamp(ushort4 value, ushort4 min_value, ushort4 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern int __attribute__((const, overloadable))
+    clamp(int value, int min_value, int max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern int2 __attribute__((const, overloadable))
+    clamp(int2 value, int2 min_value, int2 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern int3 __attribute__((const, overloadable))
+    clamp(int3 value, int3 min_value, int3 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern int4 __attribute__((const, overloadable))
+    clamp(int4 value, int4 min_value, int4 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern uint __attribute__((const, overloadable))
+    clamp(uint value, uint min_value, uint max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern uint2 __attribute__((const, overloadable))
+    clamp(uint2 value, uint2 min_value, uint2 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern uint3 __attribute__((const, overloadable))
+    clamp(uint3 value, uint3 min_value, uint3 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern uint4 __attribute__((const, overloadable))
+    clamp(uint4 value, uint4 min_value, uint4 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern long __attribute__((const, overloadable))
+    clamp(long value, long min_value, long max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern long2 __attribute__((const, overloadable))
+    clamp(long2 value, long2 min_value, long2 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern long3 __attribute__((const, overloadable))
+    clamp(long3 value, long3 min_value, long3 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern long4 __attribute__((const, overloadable))
+    clamp(long4 value, long4 min_value, long4 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern ulong __attribute__((const, overloadable))
+    clamp(ulong value, ulong min_value, ulong max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern ulong2 __attribute__((const, overloadable))
+    clamp(ulong2 value, ulong2 min_value, ulong2 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern ulong3 __attribute__((const, overloadable))
+    clamp(ulong3 value, ulong3 min_value, ulong3 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern ulong4 __attribute__((const, overloadable))
+    clamp(ulong4 value, ulong4 min_value, ulong4 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern char2 __attribute__((const, overloadable))
+    clamp(char2 value, char min_value, char max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern char3 __attribute__((const, overloadable))
+    clamp(char3 value, char min_value, char max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern char4 __attribute__((const, overloadable))
+    clamp(char4 value, char min_value, char max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern uchar2 __attribute__((const, overloadable))
+    clamp(uchar2 value, uchar min_value, uchar max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern uchar3 __attribute__((const, overloadable))
+    clamp(uchar3 value, uchar min_value, uchar max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern uchar4 __attribute__((const, overloadable))
+    clamp(uchar4 value, uchar min_value, uchar max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern short2 __attribute__((const, overloadable))
+    clamp(short2 value, short min_value, short max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern short3 __attribute__((const, overloadable))
+    clamp(short3 value, short min_value, short max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern short4 __attribute__((const, overloadable))
+    clamp(short4 value, short min_value, short max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern ushort2 __attribute__((const, overloadable))
+    clamp(ushort2 value, ushort min_value, ushort max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern ushort3 __attribute__((const, overloadable))
+    clamp(ushort3 value, ushort min_value, ushort max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern ushort4 __attribute__((const, overloadable))
+    clamp(ushort4 value, ushort min_value, ushort max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern int2 __attribute__((const, overloadable))
+    clamp(int2 value, int min_value, int max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern int3 __attribute__((const, overloadable))
+    clamp(int3 value, int min_value, int max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern int4 __attribute__((const, overloadable))
+    clamp(int4 value, int min_value, int max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern uint2 __attribute__((const, overloadable))
+    clamp(uint2 value, uint min_value, uint max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern uint3 __attribute__((const, overloadable))
+    clamp(uint3 value, uint min_value, uint max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern uint4 __attribute__((const, overloadable))
+    clamp(uint4 value, uint min_value, uint max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern long2 __attribute__((const, overloadable))
+    clamp(long2 value, long min_value, long max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern long3 __attribute__((const, overloadable))
+    clamp(long3 value, long min_value, long max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern long4 __attribute__((const, overloadable))
+    clamp(long4 value, long min_value, long max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern ulong2 __attribute__((const, overloadable))
+    clamp(ulong2 value, ulong min_value, ulong max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern ulong3 __attribute__((const, overloadable))
+    clamp(ulong3 value, ulong min_value, ulong max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern ulong4 __attribute__((const, overloadable))
+    clamp(ulong4 value, ulong min_value, ulong max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    clamp(half value, half min_value, half max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    clamp(half2 value, half2 min_value, half2 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    clamp(half3 value, half3 min_value, half3 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    clamp(half4 value, half4 min_value, half4 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    clamp(half2 value, half min_value, half max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    clamp(half3 value, half min_value, half max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    clamp(half4 value, half min_value, half max_value);
+#endif
+
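+/*
+ * Illustrative usage of clamp() (editor's sketch, not part of the shipped
+ * header): the scalar min/max overloads above apply the same bounds to every
+ * component of a vector, e.g. saturating a hypothetical color value to the
+ * [0.f, 1.f] range:
+ *
+ *   float4 saturated = clamp(unclamped_color, 0.f, 1.f);
+ */
+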
+/*
+ * clz: Number of leading 0 bits
+ *
+ * Returns the number of leading 0-bits in a value.
+ *
+ * For example, clz((char)0x03) returns 6.
+ */
+extern char __attribute__((const, overloadable))
+    clz(char value);
+
+extern char2 __attribute__((const, overloadable))
+    clz(char2 value);
+
+extern char3 __attribute__((const, overloadable))
+    clz(char3 value);
+
+extern char4 __attribute__((const, overloadable))
+    clz(char4 value);
+
+extern uchar __attribute__((const, overloadable))
+    clz(uchar value);
+
+extern uchar2 __attribute__((const, overloadable))
+    clz(uchar2 value);
+
+extern uchar3 __attribute__((const, overloadable))
+    clz(uchar3 value);
+
+extern uchar4 __attribute__((const, overloadable))
+    clz(uchar4 value);
+
+extern short __attribute__((const, overloadable))
+    clz(short value);
+
+extern short2 __attribute__((const, overloadable))
+    clz(short2 value);
+
+extern short3 __attribute__((const, overloadable))
+    clz(short3 value);
+
+extern short4 __attribute__((const, overloadable))
+    clz(short4 value);
+
+extern ushort __attribute__((const, overloadable))
+    clz(ushort value);
+
+extern ushort2 __attribute__((const, overloadable))
+    clz(ushort2 value);
+
+extern ushort3 __attribute__((const, overloadable))
+    clz(ushort3 value);
+
+extern ushort4 __attribute__((const, overloadable))
+    clz(ushort4 value);
+
+extern int __attribute__((const, overloadable))
+    clz(int value);
+
+extern int2 __attribute__((const, overloadable))
+    clz(int2 value);
+
+extern int3 __attribute__((const, overloadable))
+    clz(int3 value);
+
+extern int4 __attribute__((const, overloadable))
+    clz(int4 value);
+
+extern uint __attribute__((const, overloadable))
+    clz(uint value);
+
+extern uint2 __attribute__((const, overloadable))
+    clz(uint2 value);
+
+extern uint3 __attribute__((const, overloadable))
+    clz(uint3 value);
+
+extern uint4 __attribute__((const, overloadable))
+    clz(uint4 value);
+
+/*
+ * copysign: Copies the sign of a number to another
+ *
+ * Copies the sign from sign_value to magnitude_value.
+ *
+ * The value returned is either magnitude_value or -magnitude_value.
+ *
+ * For example, copysign(4.0f, -2.7f) returns -4.0f and copysign(-4.0f, 2.7f) returns 4.0f.
+ */
+extern float __attribute__((const, overloadable))
+    copysign(float magnitude_value, float sign_value);
+
+extern float2 __attribute__((const, overloadable))
+    copysign(float2 magnitude_value, float2 sign_value);
+
+extern float3 __attribute__((const, overloadable))
+    copysign(float3 magnitude_value, float3 sign_value);
+
+extern float4 __attribute__((const, overloadable))
+    copysign(float4 magnitude_value, float4 sign_value);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    copysign(half magnitude_value, half sign_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    copysign(half2 magnitude_value, half2 sign_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    copysign(half3 magnitude_value, half3 sign_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    copysign(half4 magnitude_value, half4 sign_value);
+#endif
+
+/*
+ * cos: Cosine
+ *
+ * Returns the cosine of an angle measured in radians.
+ *
+ * See also native_cos().
+ */
+extern float __attribute__((const, overloadable))
+    cos(float v);
+
+extern float2 __attribute__((const, overloadable))
+    cos(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    cos(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    cos(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    cos(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    cos(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    cos(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    cos(half4 v);
+#endif
+
+/*
+ * cosh: Hyperbolic cosine
+ *
+ * Returns the hyperbolic cosine of v, where v is measured in radians.
+ *
+ * See also native_cosh().
+ */
+extern float __attribute__((const, overloadable))
+    cosh(float v);
+
+extern float2 __attribute__((const, overloadable))
+    cosh(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    cosh(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    cosh(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    cosh(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    cosh(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    cosh(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    cosh(half4 v);
+#endif
+
+/*
+ * cospi: Cosine of a number multiplied by pi
+ *
+ * Returns the cosine of (v * pi), where (v * pi) is measured in radians.
+ *
+ * To get the cosine of a value measured in degrees, call cospi(v / 180.f).
+ *
+ * See also native_cospi().
+ */
+extern float __attribute__((const, overloadable))
+    cospi(float v);
+
+extern float2 __attribute__((const, overloadable))
+    cospi(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    cospi(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    cospi(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    cospi(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    cospi(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    cospi(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    cospi(half4 v);
+#endif
+
+/*
+ * degrees: Converts radians into degrees
+ *
+ * Converts from radians to degrees.
+ */
+extern float __attribute__((const, overloadable))
+    degrees(float v);
+
+extern float2 __attribute__((const, overloadable))
+    degrees(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    degrees(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    degrees(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    degrees(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    degrees(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    degrees(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    degrees(half4 v);
+#endif
+
+/*
+ * erf: Mathematical error function
+ *
+ * Returns the error function.
+ */
+extern float __attribute__((const, overloadable))
+    erf(float v);
+
+extern float2 __attribute__((const, overloadable))
+    erf(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    erf(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    erf(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    erf(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    erf(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    erf(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    erf(half4 v);
+#endif
+
+/*
+ * erfc: Mathematical complementary error function
+ *
+ * Returns the complementary error function.
+ */
+extern float __attribute__((const, overloadable))
+    erfc(float v);
+
+extern float2 __attribute__((const, overloadable))
+    erfc(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    erfc(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    erfc(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    erfc(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    erfc(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    erfc(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    erfc(half4 v);
+#endif
+
+/*
+ * exp: e raised to a number
+ *
+ * Returns e raised to v, i.e. e ^ v.
+ *
+ * See also native_exp().
+ */
+extern float __attribute__((const, overloadable))
+    exp(float v);
+
+extern float2 __attribute__((const, overloadable))
+    exp(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    exp(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    exp(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    exp(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    exp(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    exp(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    exp(half4 v);
+#endif
+
+/*
+ * exp10: 10 raised to a number
+ *
+ * Returns 10 raised to v, i.e. 10.f ^ v.
+ *
+ * See also native_exp10().
+ */
+extern float __attribute__((const, overloadable))
+    exp10(float v);
+
+extern float2 __attribute__((const, overloadable))
+    exp10(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    exp10(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    exp10(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    exp10(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    exp10(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    exp10(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    exp10(half4 v);
+#endif
+
+/*
+ * exp2: 2 raised to a number
+ *
+ * Returns 2 raised to v, i.e. 2.f ^ v.
+ *
+ * See also native_exp2().
+ */
+extern float __attribute__((const, overloadable))
+    exp2(float v);
+
+extern float2 __attribute__((const, overloadable))
+    exp2(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    exp2(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    exp2(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    exp2(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    exp2(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    exp2(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    exp2(half4 v);
+#endif
+
+/*
+ * expm1: e raised to a number minus one
+ *
+ * Returns e raised to v minus 1, i.e. (e ^ v) - 1.
+ *
+ * See also native_expm1().
+ */
+extern float __attribute__((const, overloadable))
+    expm1(float v);
+
+extern float2 __attribute__((const, overloadable))
+    expm1(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    expm1(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    expm1(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    expm1(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    expm1(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    expm1(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    expm1(half4 v);
+#endif
+
+/*
+ * fabs: Absolute value of a float
+ *
+ * Returns the absolute value of the float v.
+ *
+ * For integers, use abs().
+ */
+extern float __attribute__((const, overloadable))
+    fabs(float v);
+
+extern float2 __attribute__((const, overloadable))
+    fabs(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    fabs(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    fabs(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    fabs(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    fabs(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    fabs(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    fabs(half4 v);
+#endif
+
+/*
+ * fdim: Positive difference between two values
+ *
+ * Returns the positive difference between two values.
+ *
+ * If a > b, returns (a - b); otherwise returns 0.f.
+ */
+extern float __attribute__((const, overloadable))
+    fdim(float a, float b);
+
+extern float2 __attribute__((const, overloadable))
+    fdim(float2 a, float2 b);
+
+extern float3 __attribute__((const, overloadable))
+    fdim(float3 a, float3 b);
+
+extern float4 __attribute__((const, overloadable))
+    fdim(float4 a, float4 b);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    fdim(half a, half b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    fdim(half2 a, half2 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    fdim(half3 a, half3 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    fdim(half4 a, half4 b);
+#endif
+
+/*
+ * floor: Largest integer not greater than a value
+ *
+ * Returns the largest integer not greater than a value.
+ *
+ * For example, floor(1.2f) returns 1.f, and floor(-1.2f) returns -2.f.
+ *
+ * See also ceil().
+ */
+extern float __attribute__((const, overloadable))
+    floor(float v);
+
+extern float2 __attribute__((const, overloadable))
+    floor(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    floor(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    floor(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    floor(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    floor(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    floor(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    floor(half4 v);
+#endif
+
+/*
+ * fma: Multiply and add
+ *
+ * Multiply and add.  Returns (multiplicand1 * multiplicand2) + offset.
+ *
+ * This function is similar to mad().  fma() retains full precision of the multiplied result
+ * and rounds only after the addition.  mad() rounds after the multiplication and the addition.
+ * This extra precision is not guaranteed in rs_fp_relaxed mode.
+ */
+extern float __attribute__((const, overloadable))
+    fma(float multiplicand1, float multiplicand2, float offset);
+
+extern float2 __attribute__((const, overloadable))
+    fma(float2 multiplicand1, float2 multiplicand2, float2 offset);
+
+extern float3 __attribute__((const, overloadable))
+    fma(float3 multiplicand1, float3 multiplicand2, float3 offset);
+
+extern float4 __attribute__((const, overloadable))
+    fma(float4 multiplicand1, float4 multiplicand2, float4 offset);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    fma(half multiplicand1, half multiplicand2, half offset);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    fma(half2 multiplicand1, half2 multiplicand2, half2 offset);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    fma(half3 multiplicand1, half3 multiplicand2, half3 offset);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    fma(half4 multiplicand1, half4 multiplicand2, half4 offset);
+#endif
+
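+/*
+ * Illustrative comparison (editor's sketch, not part of the shipped header):
+ * for hypothetical floats a, b and c, both calls below compute a * b + c, but
+ * fma() rounds only once, after the addition, while mad() may also round
+ * after the multiplication:
+ *
+ *   float precise = fma(a, b, c);
+ *   float fast    = mad(a, b, c);
+ */
+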
+/*
+ * fmax: Maximum of two floats
+ *
+ * Returns the maximum of a and b, i.e. (a < b ? b : a).
+ *
+ * The max() function returns identical results but can be applied to more data types.
+ */
+extern float __attribute__((const, overloadable))
+    fmax(float a, float b);
+
+extern float2 __attribute__((const, overloadable))
+    fmax(float2 a, float2 b);
+
+extern float3 __attribute__((const, overloadable))
+    fmax(float3 a, float3 b);
+
+extern float4 __attribute__((const, overloadable))
+    fmax(float4 a, float4 b);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    fmax(half a, half b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    fmax(half2 a, half2 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    fmax(half3 a, half3 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    fmax(half4 a, half4 b);
+#endif
+
+extern float2 __attribute__((const, overloadable))
+    fmax(float2 a, float b);
+
+extern float3 __attribute__((const, overloadable))
+    fmax(float3 a, float b);
+
+extern float4 __attribute__((const, overloadable))
+    fmax(float4 a, float b);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    fmax(half2 a, half b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    fmax(half3 a, half b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    fmax(half4 a, half b);
+#endif
+
+/*
+ * fmin: Minimum of two floats
+ *
+ * Returns the minimum of a and b, i.e. (a > b ? b : a).
+ *
+ * The min() function returns identical results but can be applied to more data types.
+ */
+extern float __attribute__((const, overloadable))
+    fmin(float a, float b);
+
+extern float2 __attribute__((const, overloadable))
+    fmin(float2 a, float2 b);
+
+extern float3 __attribute__((const, overloadable))
+    fmin(float3 a, float3 b);
+
+extern float4 __attribute__((const, overloadable))
+    fmin(float4 a, float4 b);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    fmin(half a, half b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    fmin(half2 a, half2 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    fmin(half3 a, half3 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    fmin(half4 a, half4 b);
+#endif
+
+extern float2 __attribute__((const, overloadable))
+    fmin(float2 a, float b);
+
+extern float3 __attribute__((const, overloadable))
+    fmin(float3 a, float b);
+
+extern float4 __attribute__((const, overloadable))
+    fmin(float4 a, float b);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    fmin(half2 a, half b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    fmin(half3 a, half b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    fmin(half4 a, half b);
+#endif
+
+/*
+ * fmod: Modulo
+ *
+ * Returns the remainder of (numerator / denominator), where the quotient is rounded towards zero.
+ *
+ * The function remainder() is similar but rounds toward the closest integer.
+ * For example, fmod(-3.8f, 2.f) returns -1.8f (-3.8f - -1.f * 2.f)
+ * while remainder(-3.8f, 2.f) returns 0.2f (-3.8f - -2.f * 2.f).
+ */
+extern float __attribute__((const, overloadable))
+    fmod(float numerator, float denominator);
+
+extern float2 __attribute__((const, overloadable))
+    fmod(float2 numerator, float2 denominator);
+
+extern float3 __attribute__((const, overloadable))
+    fmod(float3 numerator, float3 denominator);
+
+extern float4 __attribute__((const, overloadable))
+    fmod(float4 numerator, float4 denominator);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    fmod(half numerator, half denominator);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    fmod(half2 numerator, half2 denominator);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    fmod(half3 numerator, half3 denominator);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    fmod(half4 numerator, half4 denominator);
+#endif
+
+/*
+ * fract: Positive fractional part
+ *
+ * Returns the positive fractional part of v, i.e. v - floor(v).
+ *
+ * For example, fract(1.3f, &val) returns 0.3f and sets val to 1.f.
+ * fract(-1.3f, &val) returns 0.7f and sets val to -2.f.
+ *
+ * Parameters:
+ *   v: Input value.
+ *   floor: If floor is not null, *floor will be set to the floor of v.
+ */
+extern float __attribute__((overloadable))
+    fract(float v, float* floor);
+
+extern float2 __attribute__((overloadable))
+    fract(float2 v, float2* floor);
+
+extern float3 __attribute__((overloadable))
+    fract(float3 v, float3* floor);
+
+extern float4 __attribute__((overloadable))
+    fract(float4 v, float4* floor);
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 23)
+static inline float __attribute__((const, overloadable))
+    fract(float v) {
+    float unused;
+    return fract(v, &unused);
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 23)
+static inline float2 __attribute__((const, overloadable))
+    fract(float2 v) {
+    float2 unused;
+    return fract(v, &unused);
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 23)
+static inline float3 __attribute__((const, overloadable))
+    fract(float3 v) {
+    float3 unused;
+    return fract(v, &unused);
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 23)
+static inline float4 __attribute__((const, overloadable))
+    fract(float4 v) {
+    float4 unused;
+    return fract(v, &unused);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern float __attribute__((overloadable))
+    fract(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern float2 __attribute__((overloadable))
+    fract(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern float3 __attribute__((overloadable))
+    fract(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern float4 __attribute__((overloadable))
+    fract(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((overloadable))
+    fract(half v, half* floor);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((overloadable))
+    fract(half2 v, half2* floor);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((overloadable))
+    fract(half3 v, half3* floor);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((overloadable))
+    fract(half4 v, half4* floor);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((overloadable))
+    fract(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((overloadable))
+    fract(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((overloadable))
+    fract(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((overloadable))
+    fract(half4 v);
+#endif
+
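+/*
+ * Illustrative usage of fract() (editor's sketch, not part of the shipped
+ * header): splitting a value into its floor and positive fractional part,
+ * matching the example in the comment above:
+ *
+ *   float whole;
+ *   float frac = fract(-1.3f, &whole);  // frac == 0.7f, whole == -2.f
+ */
+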
+/*
+ * frexp: Binary mantissa and exponent
+ *
+ * Returns the binary mantissa and exponent of v, i.e. v == mantissa * 2 ^ exponent.
+ *
+ * The mantissa is always between 0.5 (inclusive) and 1.0 (exclusive).
+ *
+ * See ldexp() for the reverse operation.  See also logb() and ilogb().
+ *
+ * Parameters:
+ *   v: Input value.
+ *   exponent: If exponent is not null, *exponent will be set to the exponent of v.
+ */
+extern float __attribute__((overloadable))
+    frexp(float v, int* exponent);
+
+extern float2 __attribute__((overloadable))
+    frexp(float2 v, int2* exponent);
+
+extern float3 __attribute__((overloadable))
+    frexp(float3 v, int3* exponent);
+
+extern float4 __attribute__((overloadable))
+    frexp(float4 v, int4* exponent);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((overloadable))
+    frexp(half v, int* exponent);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((overloadable))
+    frexp(half2 v, int2* exponent);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((overloadable))
+    frexp(half3 v, int3* exponent);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((overloadable))
+    frexp(half4 v, int4* exponent);
+#endif
+
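+/*
+ * Illustrative usage of frexp() (editor's sketch, not part of the shipped
+ * header): decomposing 8.5f into a mantissa in [0.5f, 1.f) and an exponent:
+ *
+ *   int exponent;
+ *   float mantissa = frexp(8.5f, &exponent);  // mantissa == 0.53125f, exponent == 4
+ */
+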
+/*
+ * half_recip: Reciprocal computed to 16 bit precision
+ *
+ * Returns the approximate reciprocal of a value.
+ *
+ * The precision is that of a 16 bit floating point value.
+ *
+ * See also native_recip().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float __attribute__((const, overloadable))
+    half_recip(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float2 __attribute__((const, overloadable))
+    half_recip(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float3 __attribute__((const, overloadable))
+    half_recip(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float4 __attribute__((const, overloadable))
+    half_recip(float4 v);
+#endif
+
+/*
+ * half_rsqrt: Reciprocal of a square root computed to 16 bit precision
+ *
+ * Returns the approximate value of (1.f / sqrt(value)).
+ *
+ * The precision is that of a 16 bit floating point value.
+ *
+ * See also rsqrt(), native_rsqrt().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float __attribute__((const, overloadable))
+    half_rsqrt(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float2 __attribute__((const, overloadable))
+    half_rsqrt(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float3 __attribute__((const, overloadable))
+    half_rsqrt(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float4 __attribute__((const, overloadable))
+    half_rsqrt(float4 v);
+#endif
+
+/*
+ * half_sqrt: Square root computed to 16 bit precision
+ *
+ * Returns the approximate square root of a value.
+ *
+ * The precision is that of a 16 bit floating point value.
+ *
+ * See also sqrt(), native_sqrt().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float __attribute__((const, overloadable))
+    half_sqrt(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float2 __attribute__((const, overloadable))
+    half_sqrt(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float3 __attribute__((const, overloadable))
+    half_sqrt(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float4 __attribute__((const, overloadable))
+    half_sqrt(float4 v);
+#endif
+
+/*
+ * hypot: Hypotenuse
+ *
+ * Returns the hypotenuse, i.e. sqrt(a * a + b * b).
+ *
+ * See also native_hypot().
+ */
+extern float __attribute__((const, overloadable))
+    hypot(float a, float b);
+
+extern float2 __attribute__((const, overloadable))
+    hypot(float2 a, float2 b);
+
+extern float3 __attribute__((const, overloadable))
+    hypot(float3 a, float3 b);
+
+extern float4 __attribute__((const, overloadable))
+    hypot(float4 a, float4 b);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    hypot(half a, half b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    hypot(half2 a, half2 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    hypot(half3 a, half3 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    hypot(half4 a, half4 b);
+#endif
+
+/*
+ * ilogb: Base two exponent
+ *
+ * Returns the base two exponent of a value, where the mantissa is between
+ * 1.f (inclusive) and 2.f (exclusive).
+ *
+ * For example, ilogb(8.5f) returns 3.
+ *
+ * Because of the difference in mantissa range, this number is one less than the exponent returned by frexp().
+ *
+ * logb() is similar but returns a float.
+ */
+extern int __attribute__((const, overloadable))
+    ilogb(float v);
+
+extern int2 __attribute__((const, overloadable))
+    ilogb(float2 v);
+
+extern int3 __attribute__((const, overloadable))
+    ilogb(float3 v);
+
+extern int4 __attribute__((const, overloadable))
+    ilogb(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern int __attribute__((const, overloadable))
+    ilogb(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern int2 __attribute__((const, overloadable))
+    ilogb(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern int3 __attribute__((const, overloadable))
+    ilogb(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern int4 __attribute__((const, overloadable))
+    ilogb(half4 v);
+#endif
+
+/*
+ * ldexp: Creates a floating point from mantissa and exponent
+ *
+ * Returns the floating point created from the mantissa and exponent,
+ * i.e. (mantissa * 2 ^ exponent).
+ *
+ * See frexp() for the reverse operation.
+ *
+ * Parameters:
+ *   mantissa: Mantissa.
+ *   exponent: Exponent, a single component or matching vector.
+ */
+extern float __attribute__((const, overloadable))
+    ldexp(float mantissa, int exponent);
+
+extern float2 __attribute__((const, overloadable))
+    ldexp(float2 mantissa, int2 exponent);
+
+extern float3 __attribute__((const, overloadable))
+    ldexp(float3 mantissa, int3 exponent);
+
+extern float4 __attribute__((const, overloadable))
+    ldexp(float4 mantissa, int4 exponent);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    ldexp(half mantissa, int exponent);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    ldexp(half2 mantissa, int2 exponent);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    ldexp(half3 mantissa, int3 exponent);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    ldexp(half4 mantissa, int4 exponent);
+#endif
+
+extern float2 __attribute__((const, overloadable))
+    ldexp(float2 mantissa, int exponent);
+
+extern float3 __attribute__((const, overloadable))
+    ldexp(float3 mantissa, int exponent);
+
+extern float4 __attribute__((const, overloadable))
+    ldexp(float4 mantissa, int exponent);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    ldexp(half2 mantissa, int exponent);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    ldexp(half3 mantissa, int exponent);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    ldexp(half4 mantissa, int exponent);
+#endif
+
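+/*
+ * Illustrative usage of ldexp() (editor's sketch, not part of the shipped
+ * header): reassembling the value decomposed by frexp() above:
+ *
+ *   float v = ldexp(0.53125f, 4);  // v == 8.5f
+ */
+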
+/*
+ * lgamma: Natural logarithm of the gamma function
+ *
+ * Returns the natural logarithm of the absolute value of the gamma function,
+ * i.e. log(fabs(tgamma(v))).
+ *
+ * See also tgamma().
+ *
+ * Parameters:
+ *   sign_of_gamma: If sign_of_gamma is not null, *sign_of_gamma will be set to -1 if the gamma of v is negative, otherwise to 1.
+ */
+extern float __attribute__((const, overloadable))
+    lgamma(float v);
+
+extern float2 __attribute__((const, overloadable))
+    lgamma(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    lgamma(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    lgamma(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    lgamma(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    lgamma(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    lgamma(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    lgamma(half4 v);
+#endif
+
+extern float __attribute__((overloadable))
+    lgamma(float v, int* sign_of_gamma);
+
+extern float2 __attribute__((overloadable))
+    lgamma(float2 v, int2* sign_of_gamma);
+
+extern float3 __attribute__((overloadable))
+    lgamma(float3 v, int3* sign_of_gamma);
+
+extern float4 __attribute__((overloadable))
+    lgamma(float4 v, int4* sign_of_gamma);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((overloadable))
+    lgamma(half v, int* sign_of_gamma);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((overloadable))
+    lgamma(half2 v, int2* sign_of_gamma);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((overloadable))
+    lgamma(half3 v, int3* sign_of_gamma);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((overloadable))
+    lgamma(half4 v, int4* sign_of_gamma);
+#endif
+
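+/*
+ * Illustrative usage of lgamma() (editor's sketch, not part of the shipped
+ * header): retrieving the sign of gamma alongside the log of its magnitude:
+ *
+ *   int sign_of_gamma;
+ *   float log_gamma = lgamma(-2.5f, &sign_of_gamma);  // sign_of_gamma == -1
+ */
+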
+/*
+ * log: Natural logarithm
+ *
+ * Returns the natural logarithm.
+ *
+ * See also native_log().
+ */
+extern float __attribute__((const, overloadable))
+    log(float v);
+
+extern float2 __attribute__((const, overloadable))
+    log(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    log(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    log(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    log(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    log(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    log(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    log(half4 v);
+#endif
+
+/*
+ * log10: Base 10 logarithm
+ *
+ * Returns the base 10 logarithm.
+ *
+ * See also native_log10().
+ */
+extern float __attribute__((const, overloadable))
+    log10(float v);
+
+extern float2 __attribute__((const, overloadable))
+    log10(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    log10(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    log10(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    log10(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    log10(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    log10(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    log10(half4 v);
+#endif
+
+/*
+ * log1p: Natural logarithm of a value plus 1
+ *
+ * Returns the natural logarithm of (v + 1.f).
+ *
+ * See also native_log1p().
+ */
+extern float __attribute__((const, overloadable))
+    log1p(float v);
+
+extern float2 __attribute__((const, overloadable))
+    log1p(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    log1p(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    log1p(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    log1p(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    log1p(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    log1p(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    log1p(half4 v);
+#endif
+
+/*
+ * log2: Base 2 logarithm
+ *
+ * Returns the base 2 logarithm.
+ *
+ * See also native_log2().
+ */
+extern float __attribute__((const, overloadable))
+    log2(float v);
+
+extern float2 __attribute__((const, overloadable))
+    log2(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    log2(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    log2(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    log2(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    log2(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    log2(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    log2(half4 v);
+#endif
+
+/*
+ * logb: Base two exponent
+ *
+ * Returns the base two exponent of a value, where the mantissa is between
+ * 1.f (inclusive) and 2.f (exclusive).
+ *
+ * For example, logb(8.5f) returns 3.f.
+ *
+ * Because of the difference in mantissa range, this number is one less than the exponent returned by frexp().
+ *
+ * ilogb() is similar but returns an integer.
+ */
+extern float __attribute__((const, overloadable))
+    logb(float v);
+
+extern float2 __attribute__((const, overloadable))
+    logb(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    logb(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    logb(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    logb(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    logb(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    logb(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    logb(half4 v);
+#endif
+
+/*
+ * mad: Multiply and add
+ *
+ * Multiply and add.  Returns (multiplicand1 * multiplicand2) + offset.
+ *
+ * This function is similar to fma().  fma() retains full precision of the multiplied result
+ * and rounds only after the addition.  mad() rounds after the multiplication and the addition.
+ * In rs_fp_relaxed mode, mad() may not do the rounding after the multiplication.
+ */
+extern float __attribute__((const, overloadable))
+    mad(float multiplicand1, float multiplicand2, float offset);
+
+extern float2 __attribute__((const, overloadable))
+    mad(float2 multiplicand1, float2 multiplicand2, float2 offset);
+
+extern float3 __attribute__((const, overloadable))
+    mad(float3 multiplicand1, float3 multiplicand2, float3 offset);
+
+extern float4 __attribute__((const, overloadable))
+    mad(float4 multiplicand1, float4 multiplicand2, float4 offset);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    mad(half multiplicand1, half multiplicand2, half offset);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    mad(half2 multiplicand1, half2 multiplicand2, half2 offset);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    mad(half3 multiplicand1, half3 multiplicand2, half3 offset);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    mad(half4 multiplicand1, half4 multiplicand2, half4 offset);
+#endif
+
+/*
+ * max: Maximum
+ *
+ * Returns the maximum value of two arguments.
+ */
+extern float __attribute__((const, overloadable))
+    max(float a, float b);
+
+extern float2 __attribute__((const, overloadable))
+    max(float2 a, float2 b);
+
+extern float3 __attribute__((const, overloadable))
+    max(float3 a, float3 b);
+
+extern float4 __attribute__((const, overloadable))
+    max(float4 a, float4 b);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    max(half a, half b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    max(half2 a, half2 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    max(half3 a, half3 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    max(half4 a, half4 b);
+#endif
+
+extern float2 __attribute__((const, overloadable))
+    max(float2 a, float b);
+
+extern float3 __attribute__((const, overloadable))
+    max(float3 a, float b);
+
+extern float4 __attribute__((const, overloadable))
+    max(float4 a, float b);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    max(half2 a, half b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    max(half3 a, half b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    max(half4 a, half b);
+#endif
+
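+/*
+ * Note (editor's comment, not part of the shipped header): for API levels up
+ * to 20 the integer overloads of max() below are provided as static inline
+ * fallbacks; from API level 21 onward the same overloads are instead declared
+ * extern, as seen in the declarations that follow them.
+ */
+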
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline char __attribute__((const, overloadable))
+    max(char a, char b) {
+    return (a > b ? a : b);
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline uchar __attribute__((const, overloadable))
+    max(uchar a, uchar b) {
+    return (a > b ? a : b);
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline short __attribute__((const, overloadable))
+    max(short a, short b) {
+    return (a > b ? a : b);
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline ushort __attribute__((const, overloadable))
+    max(ushort a, ushort b) {
+    return (a > b ? a : b);
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline int __attribute__((const, overloadable))
+    max(int a, int b) {
+    return (a > b ? a : b);
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline uint __attribute__((const, overloadable))
+    max(uint a, uint b) {
+    return (a > b ? a : b);
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline char2 __attribute__((const, overloadable))
+    max(char2 a, char2 b) {
+    char2 tmp;
+    tmp.x = (a.x > b.x ? a.x : b.x);
+    tmp.y = (a.y > b.y ? a.y : b.y);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline uchar2 __attribute__((const, overloadable))
+    max(uchar2 a, uchar2 b) {
+    uchar2 tmp;
+    tmp.x = (a.x > b.x ? a.x : b.x);
+    tmp.y = (a.y > b.y ? a.y : b.y);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline short2 __attribute__((const, overloadable))
+    max(short2 a, short2 b) {
+    short2 tmp;
+    tmp.x = (a.x > b.x ? a.x : b.x);
+    tmp.y = (a.y > b.y ? a.y : b.y);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline ushort2 __attribute__((const, overloadable))
+    max(ushort2 a, ushort2 b) {
+    ushort2 tmp;
+    tmp.x = (a.x > b.x ? a.x : b.x);
+    tmp.y = (a.y > b.y ? a.y : b.y);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline int2 __attribute__((const, overloadable))
+    max(int2 a, int2 b) {
+    int2 tmp;
+    tmp.x = (a.x > b.x ? a.x : b.x);
+    tmp.y = (a.y > b.y ? a.y : b.y);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline uint2 __attribute__((const, overloadable))
+    max(uint2 a, uint2 b) {
+    uint2 tmp;
+    tmp.x = (a.x > b.x ? a.x : b.x);
+    tmp.y = (a.y > b.y ? a.y : b.y);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline char3 __attribute__((const, overloadable))
+    max(char3 a, char3 b) {
+    char3 tmp;
+    tmp.x = (a.x > b.x ? a.x : b.x);
+    tmp.y = (a.y > b.y ? a.y : b.y);
+    tmp.z = (a.z > b.z ? a.z : b.z);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline uchar3 __attribute__((const, overloadable))
+    max(uchar3 a, uchar3 b) {
+    uchar3 tmp;
+    tmp.x = (a.x > b.x ? a.x : b.x);
+    tmp.y = (a.y > b.y ? a.y : b.y);
+    tmp.z = (a.z > b.z ? a.z : b.z);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline short3 __attribute__((const, overloadable))
+    max(short3 a, short3 b) {
+    short3 tmp;
+    tmp.x = (a.x > b.x ? a.x : b.x);
+    tmp.y = (a.y > b.y ? a.y : b.y);
+    tmp.z = (a.z > b.z ? a.z : b.z);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline ushort3 __attribute__((const, overloadable))
+    max(ushort3 a, ushort3 b) {
+    ushort3 tmp;
+    tmp.x = (a.x > b.x ? a.x : b.x);
+    tmp.y = (a.y > b.y ? a.y : b.y);
+    tmp.z = (a.z > b.z ? a.z : b.z);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline int3 __attribute__((const, overloadable))
+    max(int3 a, int3 b) {
+    int3 tmp;
+    tmp.x = (a.x > b.x ? a.x : b.x);
+    tmp.y = (a.y > b.y ? a.y : b.y);
+    tmp.z = (a.z > b.z ? a.z : b.z);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline uint3 __attribute__((const, overloadable))
+    max(uint3 a, uint3 b) {
+    uint3 tmp;
+    tmp.x = (a.x > b.x ? a.x : b.x);
+    tmp.y = (a.y > b.y ? a.y : b.y);
+    tmp.z = (a.z > b.z ? a.z : b.z);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline char4 __attribute__((const, overloadable))
+    max(char4 a, char4 b) {
+    char4 tmp;
+    tmp.x = (a.x > b.x ? a.x : b.x);
+    tmp.y = (a.y > b.y ? a.y : b.y);
+    tmp.z = (a.z > b.z ? a.z : b.z);
+    tmp.w = (a.w > b.w ? a.w : b.w);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline uchar4 __attribute__((const, overloadable))
+    max(uchar4 a, uchar4 b) {
+    uchar4 tmp;
+    tmp.x = (a.x > b.x ? a.x : b.x);
+    tmp.y = (a.y > b.y ? a.y : b.y);
+    tmp.z = (a.z > b.z ? a.z : b.z);
+    tmp.w = (a.w > b.w ? a.w : b.w);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline short4 __attribute__((const, overloadable))
+    max(short4 a, short4 b) {
+    short4 tmp;
+    tmp.x = (a.x > b.x ? a.x : b.x);
+    tmp.y = (a.y > b.y ? a.y : b.y);
+    tmp.z = (a.z > b.z ? a.z : b.z);
+    tmp.w = (a.w > b.w ? a.w : b.w);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline ushort4 __attribute__((const, overloadable))
+    max(ushort4 a, ushort4 b) {
+    ushort4 tmp;
+    tmp.x = (a.x > b.x ? a.x : b.x);
+    tmp.y = (a.y > b.y ? a.y : b.y);
+    tmp.z = (a.z > b.z ? a.z : b.z);
+    tmp.w = (a.w > b.w ? a.w : b.w);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline int4 __attribute__((const, overloadable))
+    max(int4 a, int4 b) {
+    int4 tmp;
+    tmp.x = (a.x > b.x ? a.x : b.x);
+    tmp.y = (a.y > b.y ? a.y : b.y);
+    tmp.z = (a.z > b.z ? a.z : b.z);
+    tmp.w = (a.w > b.w ? a.w : b.w);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline uint4 __attribute__((const, overloadable))
+    max(uint4 a, uint4 b) {
+    uint4 tmp;
+    tmp.x = (a.x > b.x ? a.x : b.x);
+    tmp.y = (a.y > b.y ? a.y : b.y);
+    tmp.z = (a.z > b.z ? a.z : b.z);
+    tmp.w = (a.w > b.w ? a.w : b.w);
+    return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern char __attribute__((const, overloadable))
+    max(char a, char b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern char2 __attribute__((const, overloadable))
+    max(char2 a, char2 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern char3 __attribute__((const, overloadable))
+    max(char3 a, char3 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern char4 __attribute__((const, overloadable))
+    max(char4 a, char4 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uchar __attribute__((const, overloadable))
+    max(uchar a, uchar b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uchar2 __attribute__((const, overloadable))
+    max(uchar2 a, uchar2 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uchar3 __attribute__((const, overloadable))
+    max(uchar3 a, uchar3 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uchar4 __attribute__((const, overloadable))
+    max(uchar4 a, uchar4 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern short __attribute__((const, overloadable))
+    max(short a, short b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern short2 __attribute__((const, overloadable))
+    max(short2 a, short2 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern short3 __attribute__((const, overloadable))
+    max(short3 a, short3 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern short4 __attribute__((const, overloadable))
+    max(short4 a, short4 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ushort __attribute__((const, overloadable))
+    max(ushort a, ushort b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ushort2 __attribute__((const, overloadable))
+    max(ushort2 a, ushort2 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ushort3 __attribute__((const, overloadable))
+    max(ushort3 a, ushort3 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ushort4 __attribute__((const, overloadable))
+    max(ushort4 a, ushort4 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern int __attribute__((const, overloadable))
+    max(int a, int b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern int2 __attribute__((const, overloadable))
+    max(int2 a, int2 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern int3 __attribute__((const, overloadable))
+    max(int3 a, int3 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern int4 __attribute__((const, overloadable))
+    max(int4 a, int4 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uint __attribute__((const, overloadable))
+    max(uint a, uint b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uint2 __attribute__((const, overloadable))
+    max(uint2 a, uint2 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uint3 __attribute__((const, overloadable))
+    max(uint3 a, uint3 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uint4 __attribute__((const, overloadable))
+    max(uint4 a, uint4 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long __attribute__((const, overloadable))
+    max(long a, long b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long2 __attribute__((const, overloadable))
+    max(long2 a, long2 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long3 __attribute__((const, overloadable))
+    max(long3 a, long3 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long4 __attribute__((const, overloadable))
+    max(long4 a, long4 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong __attribute__((const, overloadable))
+    max(ulong a, ulong b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong2 __attribute__((const, overloadable))
+    max(ulong2 a, ulong2 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong3 __attribute__((const, overloadable))
+    max(ulong3 a, ulong3 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong4 __attribute__((const, overloadable))
+    max(ulong4 a, ulong4 b);
+#endif
+
+/*
+ * min: Minimum
+ *
+ * Returns the minimum value of two arguments.
+ */
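+/*
+ * Illustrative usage sketch (not part of the original header; values are
+ * hypothetical): min() applies component-wise to vector arguments, and the
+ * float/half overloads below also accept a scalar second argument that is
+ * applied to every component.
+ *
+ *   int2 a = {3, 7}, b = {5, 2};
+ *   int2 lo = min(a, b);            // lo == {3, 2}
+ *   float3 capped = min(v, 1.f);    // assuming v is a float3; each component capped at 1.f
+ */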
+extern float __attribute__((const, overloadable))
+    min(float a, float b);
+
+extern float2 __attribute__((const, overloadable))
+    min(float2 a, float2 b);
+
+extern float3 __attribute__((const, overloadable))
+    min(float3 a, float3 b);
+
+extern float4 __attribute__((const, overloadable))
+    min(float4 a, float4 b);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    min(half a, half b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    min(half2 a, half2 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    min(half3 a, half3 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    min(half4 a, half4 b);
+#endif
+
+extern float2 __attribute__((const, overloadable))
+    min(float2 a, float b);
+
+extern float3 __attribute__((const, overloadable))
+    min(float3 a, float b);
+
+extern float4 __attribute__((const, overloadable))
+    min(float4 a, float b);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    min(half2 a, half b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    min(half3 a, half b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    min(half4 a, half b);
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline char __attribute__((const, overloadable))
+    min(char a, char b) {
+    return (a < b ? a : b);
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline uchar __attribute__((const, overloadable))
+    min(uchar a, uchar b) {
+    return (a < b ? a : b);
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline short __attribute__((const, overloadable))
+    min(short a, short b) {
+    return (a < b ? a : b);
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline ushort __attribute__((const, overloadable))
+    min(ushort a, ushort b) {
+    return (a < b ? a : b);
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline int __attribute__((const, overloadable))
+    min(int a, int b) {
+    return (a < b ? a : b);
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline uint __attribute__((const, overloadable))
+    min(uint a, uint b) {
+    return (a < b ? a : b);
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline char2 __attribute__((const, overloadable))
+    min(char2 a, char2 b) {
+    char2 tmp;
+    tmp.x = (a.x < b.x ? a.x : b.x);
+    tmp.y = (a.y < b.y ? a.y : b.y);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline uchar2 __attribute__((const, overloadable))
+    min(uchar2 a, uchar2 b) {
+    uchar2 tmp;
+    tmp.x = (a.x < b.x ? a.x : b.x);
+    tmp.y = (a.y < b.y ? a.y : b.y);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline short2 __attribute__((const, overloadable))
+    min(short2 a, short2 b) {
+    short2 tmp;
+    tmp.x = (a.x < b.x ? a.x : b.x);
+    tmp.y = (a.y < b.y ? a.y : b.y);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline ushort2 __attribute__((const, overloadable))
+    min(ushort2 a, ushort2 b) {
+    ushort2 tmp;
+    tmp.x = (a.x < b.x ? a.x : b.x);
+    tmp.y = (a.y < b.y ? a.y : b.y);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline int2 __attribute__((const, overloadable))
+    min(int2 a, int2 b) {
+    int2 tmp;
+    tmp.x = (a.x < b.x ? a.x : b.x);
+    tmp.y = (a.y < b.y ? a.y : b.y);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline uint2 __attribute__((const, overloadable))
+    min(uint2 a, uint2 b) {
+    uint2 tmp;
+    tmp.x = (a.x < b.x ? a.x : b.x);
+    tmp.y = (a.y < b.y ? a.y : b.y);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline char3 __attribute__((const, overloadable))
+    min(char3 a, char3 b) {
+    char3 tmp;
+    tmp.x = (a.x < b.x ? a.x : b.x);
+    tmp.y = (a.y < b.y ? a.y : b.y);
+    tmp.z = (a.z < b.z ? a.z : b.z);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline uchar3 __attribute__((const, overloadable))
+    min(uchar3 a, uchar3 b) {
+    uchar3 tmp;
+    tmp.x = (a.x < b.x ? a.x : b.x);
+    tmp.y = (a.y < b.y ? a.y : b.y);
+    tmp.z = (a.z < b.z ? a.z : b.z);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline short3 __attribute__((const, overloadable))
+    min(short3 a, short3 b) {
+    short3 tmp;
+    tmp.x = (a.x < b.x ? a.x : b.x);
+    tmp.y = (a.y < b.y ? a.y : b.y);
+    tmp.z = (a.z < b.z ? a.z : b.z);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline ushort3 __attribute__((const, overloadable))
+    min(ushort3 a, ushort3 b) {
+    ushort3 tmp;
+    tmp.x = (a.x < b.x ? a.x : b.x);
+    tmp.y = (a.y < b.y ? a.y : b.y);
+    tmp.z = (a.z < b.z ? a.z : b.z);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline int3 __attribute__((const, overloadable))
+    min(int3 a, int3 b) {
+    int3 tmp;
+    tmp.x = (a.x < b.x ? a.x : b.x);
+    tmp.y = (a.y < b.y ? a.y : b.y);
+    tmp.z = (a.z < b.z ? a.z : b.z);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline uint3 __attribute__((const, overloadable))
+    min(uint3 a, uint3 b) {
+    uint3 tmp;
+    tmp.x = (a.x < b.x ? a.x : b.x);
+    tmp.y = (a.y < b.y ? a.y : b.y);
+    tmp.z = (a.z < b.z ? a.z : b.z);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline char4 __attribute__((const, overloadable))
+    min(char4 a, char4 b) {
+    char4 tmp;
+    tmp.x = (a.x < b.x ? a.x : b.x);
+    tmp.y = (a.y < b.y ? a.y : b.y);
+    tmp.z = (a.z < b.z ? a.z : b.z);
+    tmp.w = (a.w < b.w ? a.w : b.w);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline uchar4 __attribute__((const, overloadable))
+    min(uchar4 a, uchar4 b) {
+    uchar4 tmp;
+    tmp.x = (a.x < b.x ? a.x : b.x);
+    tmp.y = (a.y < b.y ? a.y : b.y);
+    tmp.z = (a.z < b.z ? a.z : b.z);
+    tmp.w = (a.w < b.w ? a.w : b.w);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline short4 __attribute__((const, overloadable))
+    min(short4 a, short4 b) {
+    short4 tmp;
+    tmp.x = (a.x < b.x ? a.x : b.x);
+    tmp.y = (a.y < b.y ? a.y : b.y);
+    tmp.z = (a.z < b.z ? a.z : b.z);
+    tmp.w = (a.w < b.w ? a.w : b.w);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline ushort4 __attribute__((const, overloadable))
+    min(ushort4 a, ushort4 b) {
+    ushort4 tmp;
+    tmp.x = (a.x < b.x ? a.x : b.x);
+    tmp.y = (a.y < b.y ? a.y : b.y);
+    tmp.z = (a.z < b.z ? a.z : b.z);
+    tmp.w = (a.w < b.w ? a.w : b.w);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline int4 __attribute__((const, overloadable))
+    min(int4 a, int4 b) {
+    int4 tmp;
+    tmp.x = (a.x < b.x ? a.x : b.x);
+    tmp.y = (a.y < b.y ? a.y : b.y);
+    tmp.z = (a.z < b.z ? a.z : b.z);
+    tmp.w = (a.w < b.w ? a.w : b.w);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline uint4 __attribute__((const, overloadable))
+    min(uint4 a, uint4 b) {
+    uint4 tmp;
+    tmp.x = (a.x < b.x ? a.x : b.x);
+    tmp.y = (a.y < b.y ? a.y : b.y);
+    tmp.z = (a.z < b.z ? a.z : b.z);
+    tmp.w = (a.w < b.w ? a.w : b.w);
+    return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern char __attribute__((const, overloadable))
+    min(char a, char b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern char2 __attribute__((const, overloadable))
+    min(char2 a, char2 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern char3 __attribute__((const, overloadable))
+    min(char3 a, char3 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern char4 __attribute__((const, overloadable))
+    min(char4 a, char4 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uchar __attribute__((const, overloadable))
+    min(uchar a, uchar b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uchar2 __attribute__((const, overloadable))
+    min(uchar2 a, uchar2 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uchar3 __attribute__((const, overloadable))
+    min(uchar3 a, uchar3 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uchar4 __attribute__((const, overloadable))
+    min(uchar4 a, uchar4 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern short __attribute__((const, overloadable))
+    min(short a, short b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern short2 __attribute__((const, overloadable))
+    min(short2 a, short2 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern short3 __attribute__((const, overloadable))
+    min(short3 a, short3 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern short4 __attribute__((const, overloadable))
+    min(short4 a, short4 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ushort __attribute__((const, overloadable))
+    min(ushort a, ushort b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ushort2 __attribute__((const, overloadable))
+    min(ushort2 a, ushort2 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ushort3 __attribute__((const, overloadable))
+    min(ushort3 a, ushort3 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ushort4 __attribute__((const, overloadable))
+    min(ushort4 a, ushort4 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern int __attribute__((const, overloadable))
+    min(int a, int b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern int2 __attribute__((const, overloadable))
+    min(int2 a, int2 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern int3 __attribute__((const, overloadable))
+    min(int3 a, int3 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern int4 __attribute__((const, overloadable))
+    min(int4 a, int4 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uint __attribute__((const, overloadable))
+    min(uint a, uint b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uint2 __attribute__((const, overloadable))
+    min(uint2 a, uint2 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uint3 __attribute__((const, overloadable))
+    min(uint3 a, uint3 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uint4 __attribute__((const, overloadable))
+    min(uint4 a, uint4 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long __attribute__((const, overloadable))
+    min(long a, long b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long2 __attribute__((const, overloadable))
+    min(long2 a, long2 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long3 __attribute__((const, overloadable))
+    min(long3 a, long3 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long4 __attribute__((const, overloadable))
+    min(long4 a, long4 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong __attribute__((const, overloadable))
+    min(ulong a, ulong b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong2 __attribute__((const, overloadable))
+    min(ulong2 a, ulong2 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong3 __attribute__((const, overloadable))
+    min(ulong3 a, ulong3 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong4 __attribute__((const, overloadable))
+    min(ulong4 a, ulong4 b);
+#endif
+
+/*
+ * mix: Mixes two values
+ *
+ * Returns start + ((stop - start) * fraction).
+ *
+ * This can be useful for mixing two values.  For example, to create a new color that is
+ * 40% color1 and 60% color2, use mix(color1, color2, 0.6f).
+ */
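+/*
+ * Worked example (added for illustration; values are hypothetical):
+ *
+ *   mix(2.f, 10.f, 0.25f) == 2.f + ((10.f - 2.f) * 0.25f) == 4.f
+ *
+ *   // 40% color1 and 60% color2, as described above:
+ *   float4 blended = mix(color1, color2, 0.6f);
+ */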
+extern float __attribute__((const, overloadable))
+    mix(float start, float stop, float fraction);
+
+extern float2 __attribute__((const, overloadable))
+    mix(float2 start, float2 stop, float2 fraction);
+
+extern float3 __attribute__((const, overloadable))
+    mix(float3 start, float3 stop, float3 fraction);
+
+extern float4 __attribute__((const, overloadable))
+    mix(float4 start, float4 stop, float4 fraction);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    mix(half start, half stop, half fraction);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    mix(half2 start, half2 stop, half2 fraction);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    mix(half3 start, half3 stop, half3 fraction);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    mix(half4 start, half4 stop, half4 fraction);
+#endif
+
+extern float2 __attribute__((const, overloadable))
+    mix(float2 start, float2 stop, float fraction);
+
+extern float3 __attribute__((const, overloadable))
+    mix(float3 start, float3 stop, float fraction);
+
+extern float4 __attribute__((const, overloadable))
+    mix(float4 start, float4 stop, float fraction);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    mix(half2 start, half2 stop, half fraction);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    mix(half3 start, half3 stop, half fraction);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    mix(half4 start, half4 stop, half fraction);
+#endif
+
+/*
+ * modf: Integral and fractional components
+ *
+ * Returns the integral and fractional components of a number.
+ *
+ * Both components will have the same sign as x.  For example, for an input of -3.72f,
+ * *integral_part will be set to -3.f and -.72f will be returned.
+ *
+ * Parameters:
+ *   v: Source value.
+ *   integral_part: *integral_part will be set to the integral portion of the number.
+ *
+ * Returns: Floating point portion of the value.
+ */
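+/*
+ * Usage sketch (illustrative only):
+ *
+ *   float integral;
+ *   float fractional = modf(-3.72f, &integral);
+ *   // integral   == -3.f
+ *   // fractional == -.72f   (both parts carry the sign of the input)
+ */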
+extern float __attribute__((overloadable))
+    modf(float v, float* integral_part);
+
+extern float2 __attribute__((overloadable))
+    modf(float2 v, float2* integral_part);
+
+extern float3 __attribute__((overloadable))
+    modf(float3 v, float3* integral_part);
+
+extern float4 __attribute__((overloadable))
+    modf(float4 v, float4* integral_part);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((overloadable))
+    modf(half v, half* integral_part);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((overloadable))
+    modf(half2 v, half2* integral_part);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((overloadable))
+    modf(half3 v, half3* integral_part);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((overloadable))
+    modf(half4 v, half4* integral_part);
+#endif
+
+/*
+ * nan: Not a Number
+ *
+ * Returns a NaN value (Not a Number).
+ *
+ * Parameters:
+ *   v: Not used.
+ */
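+/*
+ * Usage sketch (illustrative only): a NaN compares unequal to every value,
+ * including itself, so the result can serve as a sentinel.
+ *
+ *   float sentinel = nan(0);
+ *   int isSentinel = (sentinel != sentinel);   // nonzero for NaN
+ */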
+extern float __attribute__((const, overloadable))
+    nan(uint v);
+
+/*
+ * nan_half: Not a Number
+ *
+ * Returns a half-precision floating point NaN value (Not a Number).
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    nan_half(void);
+#endif
+
+/*
+ * native_acos: Approximate inverse cosine
+ *
+ * Returns the approximate inverse cosine, in radians.
+ *
+ * This function yields undefined results from input values less than -1 or greater than 1.
+ *
+ * See also acos().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_acos(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_acos(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_acos(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_acos(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_acos(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_acos(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_acos(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_acos(half4 v);
+#endif
+
+/*
+ * native_acosh: Approximate inverse hyperbolic cosine
+ *
+ * Returns the approximate inverse hyperbolic cosine, in radians.
+ *
+ * See also acosh().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_acosh(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_acosh(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_acosh(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_acosh(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_acosh(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_acosh(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_acosh(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_acosh(half4 v);
+#endif
+
+/*
+ * native_acospi: Approximate inverse cosine divided by pi
+ *
+ * Returns the approximate inverse cosine in radians, divided by pi.
+ *
+ * To get an inverse cosine measured in degrees, use acospi(a) * 180.f.
+ *
+ * This function yields undefined results from input values less than -1 or greater than 1.
+ *
+ * See also acospi().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_acospi(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_acospi(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_acospi(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_acospi(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_acospi(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_acospi(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_acospi(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_acospi(half4 v);
+#endif
+
+/*
+ * native_asin: Approximate inverse sine
+ *
+ * Returns the approximate inverse sine, in radians.
+ *
+ * This function yields undefined results from input values less than -1 or greater than 1.
+ *
+ * See also asin().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_asin(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_asin(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_asin(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_asin(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_asin(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_asin(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_asin(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_asin(half4 v);
+#endif
+
+/*
+ * native_asinh: Approximate inverse hyperbolic sine
+ *
+ * Returns the approximate inverse hyperbolic sine, in radians.
+ *
+ * See also asinh().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_asinh(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_asinh(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_asinh(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_asinh(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_asinh(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_asinh(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_asinh(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_asinh(half4 v);
+#endif
+
+/*
+ * native_asinpi: Approximate inverse sine divided by pi
+ *
+ * Returns the approximate inverse sine in radians, divided by pi.
+ *
+ * To get an inverse sine measured in degrees, use asinpi(a) * 180.f.
+ *
+ * This function yields undefined results from input values less than -1 or greater than 1.
+ *
+ * See also asinpi().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_asinpi(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_asinpi(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_asinpi(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_asinpi(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_asinpi(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_asinpi(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_asinpi(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_asinpi(half4 v);
+#endif
+
+/*
+ * native_atan: Approximate inverse tangent
+ *
+ * Returns the approximate inverse tangent, in radians.
+ *
+ * See also atan().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_atan(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_atan(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_atan(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_atan(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_atan(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_atan(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_atan(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_atan(half4 v);
+#endif
+
+/*
+ * native_atan2: Approximate inverse tangent of a ratio
+ *
+ * Returns the approximate inverse tangent of (numerator / denominator), in radians.
+ *
+ * See also atan2().
+ *
+ * Parameters:
+ *   numerator: Numerator.
+ *   denominator: Denominator.  Can be 0.
+ */
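+/*
+ * Usage sketch (illustrative only): the arguments follow the usual atan2
+ * convention of numerator first, so the angle of a 2D vector (x, y) is
+ *
+ *   float angle = native_atan2(y, x);   // e.g. native_atan2(1.f, 1.f) ~= pi / 4
+ */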
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_atan2(float numerator, float denominator);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_atan2(float2 numerator, float2 denominator);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_atan2(float3 numerator, float3 denominator);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_atan2(float4 numerator, float4 denominator);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_atan2(half numerator, half denominator);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_atan2(half2 numerator, half2 denominator);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_atan2(half3 numerator, half3 denominator);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_atan2(half4 numerator, half4 denominator);
+#endif
+
+/*
+ * native_atan2pi: Approximate inverse tangent of a ratio, divided by pi
+ *
+ * Returns the approximate inverse tangent of (numerator / denominator),
+ * in radians, divided by pi.
+ *
+ * To get an inverse tangent measured in degrees, use atan2pi(n, d) * 180.f.
+ *
+ * See also atan2pi().
+ *
+ * Parameters:
+ *   numerator: Numerator.
+ *   denominator: Denominator.  Can be 0.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_atan2pi(float numerator, float denominator);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_atan2pi(float2 numerator, float2 denominator);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_atan2pi(float3 numerator, float3 denominator);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_atan2pi(float4 numerator, float4 denominator);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_atan2pi(half numerator, half denominator);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_atan2pi(half2 numerator, half2 denominator);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_atan2pi(half3 numerator, half3 denominator);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_atan2pi(half4 numerator, half4 denominator);
+#endif
+
+/*
+ * native_atanh: Approximate inverse hyperbolic tangent
+ *
+ * Returns the approximate inverse hyperbolic tangent, in radians.
+ *
+ * See also atanh().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_atanh(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_atanh(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_atanh(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_atanh(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_atanh(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_atanh(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_atanh(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_atanh(half4 v);
+#endif
+
+/*
+ * native_atanpi: Approximate inverse tangent divided by pi
+ *
+ * Returns the approximate inverse tangent in radians, divided by pi.
+ *
+ * To get an inverse tangent measured in degrees, use atanpi(a) * 180.f.
+ *
+ * See also atanpi().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_atanpi(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_atanpi(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_atanpi(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_atanpi(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_atanpi(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_atanpi(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_atanpi(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_atanpi(half4 v);
+#endif
+
+/*
+ * native_cbrt: Approximate cube root
+ *
+ * Returns the approximate cube root.
+ *
+ * See also cbrt().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_cbrt(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_cbrt(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_cbrt(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_cbrt(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_cbrt(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_cbrt(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_cbrt(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_cbrt(half4 v);
+#endif
+
+/*
+ * native_cos: Approximate cosine
+ *
+ * Returns the approximate cosine of an angle measured in radians.
+ *
+ * See also cos().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_cos(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_cos(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_cos(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_cos(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_cos(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_cos(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_cos(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_cos(half4 v);
+#endif
+
+/*
+ * native_cosh: Approximate hyperbolic cosine
+ *
+ * Returns the approximate hyperbolic cosine.
+ *
+ * See also cosh().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_cosh(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_cosh(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_cosh(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_cosh(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_cosh(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_cosh(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_cosh(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_cosh(half4 v);
+#endif
+
+/*
+ * native_cospi: Approximate cosine of a number multiplied by pi
+ *
+ * Returns the approximate cosine of (v * pi), where (v * pi) is measured in radians.
+ *
+ * To get the cosine of a value measured in degrees, call cospi(v / 180.f).
+ *
+ * See also cospi().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_cospi(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_cospi(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_cospi(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_cospi(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_cospi(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_cospi(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_cospi(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_cospi(half4 v);
+#endif
+
+/*
+ * native_divide: Approximate division
+ *
+ * Computes the approximate division of two values.
+ */
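+/*
+ * Usage sketch (illustrative only): a faster, lower-precision alternative to
+ * the / operator.
+ *
+ *   float q = native_divide(x, y);   // ~= x / y, assuming float x and y
+ */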
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_divide(float left_vector, float right_vector);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_divide(float2 left_vector, float2 right_vector);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_divide(float3 left_vector, float3 right_vector);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_divide(float4 left_vector, float4 right_vector);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_divide(half left_vector, half right_vector);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_divide(half2 left_vector, half2 right_vector);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_divide(half3 left_vector, half3 right_vector);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_divide(half4 left_vector, half4 right_vector);
+#endif
+
+/*
+ * native_exp: Approximate e raised to a number
+ *
+ * Fast approximate exp.
+ *
+ * It is valid for inputs from -86.f to 86.f.  The precision is no worse than what would be
+ * expected from using 16 bit floating point values.
+ *
+ * See also exp().
+ */
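+/*
+ * Usage sketch (illustrative only): trades precision for speed, and the input
+ * must stay within the stated [-86.f, 86.f] range.
+ *
+ *   float fast_e = native_exp(1.f);   // ~= 2.718f, to roughly half precision
+ */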
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float __attribute__((const, overloadable))
+    native_exp(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float2 __attribute__((const, overloadable))
+    native_exp(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float3 __attribute__((const, overloadable))
+    native_exp(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float4 __attribute__((const, overloadable))
+    native_exp(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_exp(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_exp(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_exp(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_exp(half4 v);
+#endif
+
+/*
+ * native_exp10: Approximate 10 raised to a number
+ *
+ * Fast approximate exp10.
+ *
+ * It is valid for inputs from -37.f to 37.f.  The precision is no worse than what would be
+ * expected from using 16 bit floating point values.
+ *
+ * See also exp10().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float __attribute__((const, overloadable))
+    native_exp10(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float2 __attribute__((const, overloadable))
+    native_exp10(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float3 __attribute__((const, overloadable))
+    native_exp10(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float4 __attribute__((const, overloadable))
+    native_exp10(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_exp10(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_exp10(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_exp10(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_exp10(half4 v);
+#endif
+
+/*
+ * native_exp2: Approximate 2 raised to a number
+ *
+ * Fast approximate exp2.
+ *
+ * It is valid for inputs from -125.f to 125.f.  The precision is no worse than what would be
+ * expected from using 16 bit floating point values.
+ *
+ * See also exp2().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float __attribute__((const, overloadable))
+    native_exp2(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float2 __attribute__((const, overloadable))
+    native_exp2(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float3 __attribute__((const, overloadable))
+    native_exp2(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float4 __attribute__((const, overloadable))
+    native_exp2(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_exp2(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_exp2(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_exp2(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_exp2(half4 v);
+#endif
+
+/*
+ * native_expm1: Approximate e raised to a number minus one
+ *
+ * Returns the approximate (e ^ v) - 1.
+ *
+ * See also expm1().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_expm1(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_expm1(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_expm1(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_expm1(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_expm1(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_expm1(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_expm1(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_expm1(half4 v);
+#endif
+
+/*
+ * native_hypot: Approximate hypotenuse
+ *
+ * Returns the approximate native_sqrt(a * a + b * b).
+ *
+ * See also hypot().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_hypot(float a, float b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_hypot(float2 a, float2 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_hypot(float3 a, float3 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_hypot(float4 a, float4 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_hypot(half a, half b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_hypot(half2 a, half2 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_hypot(half3 a, half3 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_hypot(half4 a, half4 b);
+#endif
+
+/*
+ * native_log: Approximate natural logarithm
+ *
+ * Fast approximate log.
+ *
+ * It is not accurate for values very close to zero.
+ *
+ * See also log().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float __attribute__((const, overloadable))
+    native_log(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float2 __attribute__((const, overloadable))
+    native_log(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float3 __attribute__((const, overloadable))
+    native_log(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float4 __attribute__((const, overloadable))
+    native_log(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_log(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_log(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_log(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_log(half4 v);
+#endif
+
+/*
+ * native_log10: Approximate base 10 logarithm
+ *
+ * Fast approximate log10.
+ *
+ * It is not accurate for values very close to zero.
+ *
+ * See also log10().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float __attribute__((const, overloadable))
+    native_log10(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float2 __attribute__((const, overloadable))
+    native_log10(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float3 __attribute__((const, overloadable))
+    native_log10(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float4 __attribute__((const, overloadable))
+    native_log10(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_log10(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_log10(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_log10(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_log10(half4 v);
+#endif
+
+/*
+ * native_log1p: Approximate natural logarithm of a value plus 1
+ *
+ * Returns the approximate natural logarithm of (v + 1.0f).
+ *
+ * See also log1p().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_log1p(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_log1p(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_log1p(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_log1p(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_log1p(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_log1p(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_log1p(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_log1p(half4 v);
+#endif
+
+/*
+ * native_log2: Approximate base 2 logarithm
+ *
+ * Fast approximate log2.
+ *
+ * It is not accurate for values very close to zero.
+ *
+ * See also log2().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float __attribute__((const, overloadable))
+    native_log2(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float2 __attribute__((const, overloadable))
+    native_log2(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float3 __attribute__((const, overloadable))
+    native_log2(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float4 __attribute__((const, overloadable))
+    native_log2(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_log2(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_log2(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_log2(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_log2(half4 v);
+#endif
+
+/*
+ * native_powr: Approximate positive base raised to an exponent
+ *
+ * Fast approximate (base ^ exponent).
+ *
+ * See also powr().
+ *
+ * Parameters:
+ *   base: Must be between 0.f and 256.f.  The function is not accurate for values very close to zero.
+ *   exponent: Must be between -15.f and 15.f.
+ */
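+/*
+ * Usage sketch (illustrative only): both arguments must stay inside the ranges
+ * documented above.
+ *
+ *   float p = native_powr(2.f, 10.f);   // ~= 1024.f
+ */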
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float __attribute__((const, overloadable))
+    native_powr(float base, float exponent);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float2 __attribute__((const, overloadable))
+    native_powr(float2 base, float2 exponent);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float3 __attribute__((const, overloadable))
+    native_powr(float3 base, float3 exponent);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float4 __attribute__((const, overloadable))
+    native_powr(float4 base, float4 exponent);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_powr(half base, half exponent);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_powr(half2 base, half2 exponent);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_powr(half3 base, half3 exponent);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_powr(half4 base, half4 exponent);
+#endif
+
+/*
+ * native_recip: Approximate reciprocal
+ *
+ * Returns the approximate reciprocal of a value.
+ *
+ * See also half_recip().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_recip(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_recip(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_recip(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_recip(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_recip(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_recip(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_recip(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_recip(half4 v);
+#endif
+
+/*
+ * native_rootn: Approximate nth root
+ *
+ * Compute the approximate Nth root of a value.
+ *
+ * See also rootn().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_rootn(float v, int n);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_rootn(float2 v, int2 n);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_rootn(float3 v, int3 n);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_rootn(float4 v, int4 n);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_rootn(half v, int n);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_rootn(half2 v, int2 n);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_rootn(half3 v, int3 n);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_rootn(half4 v, int4 n);
+#endif
+
+/*
+ * native_rsqrt: Approximate reciprocal of a square root
+ *
+ * Returns approximate (1 / sqrt(v)).
+ *
+ * See also rsqrt(), half_rsqrt().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_rsqrt(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_rsqrt(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_rsqrt(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_rsqrt(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_rsqrt(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_rsqrt(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_rsqrt(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_rsqrt(half4 v);
+#endif
+
+/*
+ * native_sin: Approximate sine
+ *
+ * Returns the approximate sine of an angle measured in radians.
+ *
+ * See also sin().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_sin(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_sin(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_sin(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_sin(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_sin(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_sin(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_sin(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_sin(half4 v);
+#endif
+
+/*
+ * native_sincos: Approximate sine and cosine
+ *
+ * Returns the approximate sine and cosine of a value.
+ *
+ * See also sincos().
+ *
+ * Parameters:
+ *   v: Incoming value in radians.
+ *   cos: *cos will be set to the cosine value.
+ *
+ * Returns: Sine.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((overloadable))
+    native_sincos(float v, float* cos);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((overloadable))
+    native_sincos(float2 v, float2* cos);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((overloadable))
+    native_sincos(float3 v, float3* cos);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((overloadable))
+    native_sincos(float4 v, float4* cos);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((overloadable))
+    native_sincos(half v, half* cos);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((overloadable))
+    native_sincos(half2 v, half2* cos);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((overloadable))
+    native_sincos(half3 v, half3* cos);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((overloadable))
+    native_sincos(half4 v, half4* cos);
+#endif
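+
+/*
+ * Example (illustrative sketch; the variable names are hypothetical):
+ * native_sincos() returns the sine and writes the cosine through the pointer,
+ * so both values come from a single call.
+ *
+ *   float c;
+ *   float s = native_sincos(angle, &c);  // s ~ sin(angle), c ~ cos(angle)
+ */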
+
+/*
+ * native_sinh: Approximate hyperbolic sine
+ *
+ * Returns the approximate hyperbolic sine of a value specified in radians.
+ *
+ * See also sinh().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_sinh(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_sinh(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_sinh(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_sinh(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_sinh(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_sinh(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_sinh(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_sinh(half4 v);
+#endif
+
+/*
+ * native_sinpi: Approximate sine of a number multiplied by pi
+ *
+ * Returns the approximate sine of (v * pi), where (v * pi) is measured in radians.
+ *
+ * To get the sine of a value measured in degrees, call sinpi(v / 180.f).
+ *
+ * See also sinpi().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_sinpi(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_sinpi(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_sinpi(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_sinpi(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_sinpi(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_sinpi(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_sinpi(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_sinpi(half4 v);
+#endif
+
+/*
+ * native_sqrt: Approximate square root
+ *
+ * Returns the approximate sqrt(v).
+ *
+ * See also sqrt(), half_sqrt().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_sqrt(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_sqrt(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_sqrt(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_sqrt(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_sqrt(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_sqrt(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_sqrt(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_sqrt(half4 v);
+#endif
+
+/*
+ * native_tan: Approximate tangent
+ *
+ * Returns the approximate tangent of an angle measured in radians.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_tan(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_tan(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_tan(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_tan(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_tan(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_tan(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_tan(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_tan(half4 v);
+#endif
+
+/*
+ * native_tanh: Approximate hyperbolic tangent
+ *
+ * Returns the approximate hyperbolic tangent of a value.
+ *
+ * See also tanh().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_tanh(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_tanh(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_tanh(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_tanh(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_tanh(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_tanh(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_tanh(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_tanh(half4 v);
+#endif
+
+/*
+ * native_tanpi: Approximate tangent of a number multiplied by pi
+ *
+ * Returns the approximate tangent of (v * pi), where (v * pi) is measured in radians.
+ *
+ * To get the tangent of a value measured in degrees, call tanpi(v / 180.f).
+ *
+ * See also tanpi().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_tanpi(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_tanpi(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_tanpi(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_tanpi(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_tanpi(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_tanpi(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_tanpi(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_tanpi(half4 v);
+#endif
+
+/*
+ * nextafter: Next floating point number
+ *
+ * Returns the next representable floating point number from v towards target.
+ *
+ * In rs_fp_relaxed mode, a denormalized input value may not yield the next denormalized
+ * value, as support of denormalized values is optional in relaxed mode.
+ */
+extern float __attribute__((const, overloadable))
+    nextafter(float v, float target);
+
+extern float2 __attribute__((const, overloadable))
+    nextafter(float2 v, float2 target);
+
+extern float3 __attribute__((const, overloadable))
+    nextafter(float3 v, float3 target);
+
+extern float4 __attribute__((const, overloadable))
+    nextafter(float4 v, float4 target);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    nextafter(half v, half target);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    nextafter(half2 v, half2 target);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    nextafter(half3 v, half3 target);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    nextafter(half4 v, half4 target);
+#endif
+
+/*
+ * pow: Base raised to an exponent
+ *
+ * Returns base raised to the power exponent, i.e. base ^ exponent.
+ *
+ * pown() and powr() are similar.  pown() takes an integer exponent. powr() assumes the
+ * base to be non-negative.
+ */
+extern float __attribute__((const, overloadable))
+    pow(float base, float exponent);
+
+extern float2 __attribute__((const, overloadable))
+    pow(float2 base, float2 exponent);
+
+extern float3 __attribute__((const, overloadable))
+    pow(float3 base, float3 exponent);
+
+extern float4 __attribute__((const, overloadable))
+    pow(float4 base, float4 exponent);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    pow(half base, half exponent);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    pow(half2 base, half2 exponent);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    pow(half3 base, half3 exponent);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    pow(half4 base, half4 exponent);
+#endif
+
+/*
+ * pown: Base raised to an integer exponent
+ *
+ * Returns base raised to the power exponent, i.e. base ^ exponent.
+ *
+ * pow() and powr() are similar.  They both take a float exponent. powr() also assumes the
+ * base to be non-negative.
+ */
+extern float __attribute__((const, overloadable))
+    pown(float base, int exponent);
+
+extern float2 __attribute__((const, overloadable))
+    pown(float2 base, int2 exponent);
+
+extern float3 __attribute__((const, overloadable))
+    pown(float3 base, int3 exponent);
+
+extern float4 __attribute__((const, overloadable))
+    pown(float4 base, int4 exponent);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    pown(half base, int exponent);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    pown(half2 base, int2 exponent);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    pown(half3 base, int3 exponent);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    pown(half4 base, int4 exponent);
+#endif
+
+/*
+ * powr: Positive base raised to an exponent
+ *
+ * Returns base raised to the power exponent, i.e. base ^ exponent.  base must be >= 0.
+ *
+ * pow() and pown() are similar.  They both make no assumptions about the base.
+ * pow() takes a float exponent while pown() takes an integer.
+ *
+ * See also native_powr().
+ */
+extern float __attribute__((const, overloadable))
+    powr(float base, float exponent);
+
+extern float2 __attribute__((const, overloadable))
+    powr(float2 base, float2 exponent);
+
+extern float3 __attribute__((const, overloadable))
+    powr(float3 base, float3 exponent);
+
+extern float4 __attribute__((const, overloadable))
+    powr(float4 base, float4 exponent);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    powr(half base, half exponent);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    powr(half2 base, half2 exponent);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    powr(half3 base, half3 exponent);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    powr(half4 base, half4 exponent);
+#endif
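+
+/*
+ * Example (illustrative sketch; the values are arbitrary): the three power
+ * functions differ only in the exponent type and in the assumption made about
+ * the base.
+ *
+ *   float a = pow(-2.f, 3.f);   // any base, float exponent
+ *   float b = pown(-2.f, 3);    // any base, integer exponent
+ *   float c = powr(2.f, 0.5f);  // float exponent, base must be >= 0.f
+ */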
+
+/*
+ * radians: Converts degrees into radians
+ *
+ * Converts from degrees to radians.
+ */
+extern float __attribute__((const, overloadable))
+    radians(float v);
+
+extern float2 __attribute__((const, overloadable))
+    radians(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    radians(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    radians(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    radians(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    radians(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    radians(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    radians(half4 v);
+#endif
+
+/*
+ * remainder: Remainder of a division
+ *
+ * Returns the remainder of (numerator / denominator), where the quotient is rounded towards
+ * the nearest integer.
+ *
+ * The function fmod() is similar but rounds the quotient towards zero.
+ * For example, fmod(-3.8f, 2.f) returns -1.8f (-3.8f - -1.f * 2.f)
+ * while remainder(-3.8f, 2.f) returns 0.2f (-3.8f - -2.f * 2.f).
+ */
+extern float __attribute__((const, overloadable))
+    remainder(float numerator, float denominator);
+
+extern float2 __attribute__((const, overloadable))
+    remainder(float2 numerator, float2 denominator);
+
+extern float3 __attribute__((const, overloadable))
+    remainder(float3 numerator, float3 denominator);
+
+extern float4 __attribute__((const, overloadable))
+    remainder(float4 numerator, float4 denominator);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    remainder(half numerator, half denominator);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    remainder(half2 numerator, half2 denominator);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    remainder(half3 numerator, half3 denominator);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    remainder(half4 numerator, half4 denominator);
+#endif
+
+/*
+ * remquo: Remainder and quotient of a division
+ *
+ * Returns the quotient and the remainder of (numerator / denominator).
+ *
+ * Only the sign and lowest three bits of the quotient are guaranteed to be accurate.
+ *
+ * This function is useful for implementing periodic functions.  The low three bits of the
+ * quotient gives the quadrant and the remainder the distance within the quadrant.
+ * For example, an implementation of sin(x) could call remquo(x, PI / 2.f, &quadrant)
+ * to reduce very large values of x to something within a limited range.
+ *
+ * Example: remquo(-23.5f, 8.f, &quot) sets the lowest three bits of quot to 3
+ * and the sign negative.  It returns 0.5f.
+ *
+ * Parameters:
+ *   numerator: Numerator.
+ *   denominator: Denominator.
+ *   quotient: *quotient will be set to the integer quotient.
+ *
+ * Returns: Remainder, precise only for the low three bits.
+ */
+extern float __attribute__((overloadable))
+    remquo(float numerator, float denominator, int* quotient);
+
+extern float2 __attribute__((overloadable))
+    remquo(float2 numerator, float2 denominator, int2* quotient);
+
+extern float3 __attribute__((overloadable))
+    remquo(float3 numerator, float3 denominator, int3* quotient);
+
+extern float4 __attribute__((overloadable))
+    remquo(float4 numerator, float4 denominator, int4* quotient);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((overloadable))
+    remquo(half numerator, half denominator, int* quotient);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((overloadable))
+    remquo(half2 numerator, half2 denominator, int2* quotient);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((overloadable))
+    remquo(half3 numerator, half3 denominator, int3* quotient);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((overloadable))
+    remquo(half4 numerator, half4 denominator, int4* quotient);
+#endif
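+
+/*
+ * Example (illustrative sketch; foldedSine is a hypothetical helper, not part
+ * of the API): one way a periodic function might use remquo() for argument
+ * reduction, as described above.
+ *
+ *   static float foldedSine(float x) {
+ *       int quadrant;
+ *       // r is within about +/- (pi / 4); the low two bits of the quotient
+ *       // select which reduced-range function and sign to use.
+ *       float r = remquo(x, M_PI_2, &quadrant);
+ *       switch (quadrant & 3) {
+ *       case 0:  return sin(r);
+ *       case 1:  return cos(r);
+ *       case 2:  return -sin(r);
+ *       default: return -cos(r);
+ *       }
+ *   }
+ */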
+
+/*
+ * rint: Round to even
+ *
+ * Rounds to the nearest integral value.
+ *
+ * rint() rounds half values to even.  For example, rint(0.5f) returns 0.f and
+ * rint(1.5f) returns 2.f.  Similarly, rint(-0.5f) returns -0.f and
+ * rint(-1.5f) returns -2.f.
+ *
+ * round() is similar but rounds away from zero.  trunc() truncates the decimal fraction.
+ */
+extern float __attribute__((const, overloadable))
+    rint(float v);
+
+extern float2 __attribute__((const, overloadable))
+    rint(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    rint(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    rint(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    rint(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    rint(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    rint(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    rint(half4 v);
+#endif
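+
+/*
+ * Example (illustrative comparison of the three rounding functions for the
+ * same inputs):
+ *
+ *   rint(2.5f)  == 2.f    rint(-2.5f)  == -2.f   // halves go to even
+ *   round(2.5f) == 3.f    round(-2.5f) == -3.f   // halves go away from zero
+ *   trunc(2.5f) == 2.f    trunc(-2.5f) == -2.f   // fraction is dropped
+ */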
+
+/*
+ * rootn: Nth root
+ *
+ * Compute the Nth root of a value.
+ *
+ * See also native_rootn().
+ */
+extern float __attribute__((const, overloadable))
+    rootn(float v, int n);
+
+extern float2 __attribute__((const, overloadable))
+    rootn(float2 v, int2 n);
+
+extern float3 __attribute__((const, overloadable))
+    rootn(float3 v, int3 n);
+
+extern float4 __attribute__((const, overloadable))
+    rootn(float4 v, int4 n);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    rootn(half v, int n);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    rootn(half2 v, int2 n);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    rootn(half3 v, int3 n);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    rootn(half4 v, int4 n);
+#endif
+
+/*
+ * round: Round away from zero
+ *
+ * Rounds to the nearest integral value.
+ *
+ * round() rounds half values away from zero.  For example, round(0.5f) returns 1.f
+ * and round(1.5f) returns 2.f.  Similarly, round(-0.5f) returns -1.f
+ * and round(-1.5f) returns -2.f.
+ *
+ * rint() is similar but rounds half values toward even.  trunc() truncates the decimal fraction.
+ */
+extern float __attribute__((const, overloadable))
+    round(float v);
+
+extern float2 __attribute__((const, overloadable))
+    round(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    round(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    round(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    round(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    round(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    round(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    round(half4 v);
+#endif
+
+/*
+ * rsqrt: Reciprocal of a square root
+ *
+ * Returns (1 / sqrt(v)).
+ *
+ * See also half_rsqrt(), native_rsqrt().
+ */
+extern float __attribute__((const, overloadable))
+    rsqrt(float v);
+
+extern float2 __attribute__((const, overloadable))
+    rsqrt(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    rsqrt(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    rsqrt(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    rsqrt(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    rsqrt(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    rsqrt(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    rsqrt(half4 v);
+#endif
+
+/*
+ * sign: Sign of a value
+ *
+ * Returns the sign of a value.
+ *
+ * if (v < 0) return -1.f;
+ * else if (v > 0) return 1.f;
+ * else return 0.f;
+ */
+extern float __attribute__((const, overloadable))
+    sign(float v);
+
+extern float2 __attribute__((const, overloadable))
+    sign(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    sign(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    sign(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    sign(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    sign(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    sign(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    sign(half4 v);
+#endif
+
+/*
+ * sin: Sine
+ *
+ * Returns the sine of an angle measured in radians.
+ *
+ * See also native_sin().
+ */
+extern float __attribute__((const, overloadable))
+    sin(float v);
+
+extern float2 __attribute__((const, overloadable))
+    sin(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    sin(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    sin(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    sin(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    sin(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    sin(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    sin(half4 v);
+#endif
+
+/*
+ * sincos: Sine and cosine
+ *
+ * Returns the sine and cosine of a value.
+ *
+ * See also native_sincos().
+ *
+ * Parameters:
+ *   v: Incoming value in radians.
+ *   cos: *cos will be set to the cosine value.
+ *
+ * Returns: Sine of v.
+ */
+extern float __attribute__((overloadable))
+    sincos(float v, float* cos);
+
+extern float2 __attribute__((overloadable))
+    sincos(float2 v, float2* cos);
+
+extern float3 __attribute__((overloadable))
+    sincos(float3 v, float3* cos);
+
+extern float4 __attribute__((overloadable))
+    sincos(float4 v, float4* cos);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((overloadable))
+    sincos(half v, half* cos);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((overloadable))
+    sincos(half2 v, half2* cos);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((overloadable))
+    sincos(half3 v, half3* cos);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((overloadable))
+    sincos(half4 v, half4* cos);
+#endif
+
+/*
+ * sinh: Hyperbolic sine
+ *
+ * Returns the hyperbolic sine of v, where v is measured in radians.
+ *
+ * See also native_sinh().
+ */
+extern float __attribute__((const, overloadable))
+    sinh(float v);
+
+extern float2 __attribute__((const, overloadable))
+    sinh(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    sinh(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    sinh(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    sinh(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    sinh(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    sinh(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    sinh(half4 v);
+#endif
+
+/*
+ * sinpi: Sine of a number multiplied by pi
+ *
+ * Returns the sine of (v * pi), where (v * pi) is measured in radians.
+ *
+ * To get the sine of a value measured in degrees, call sinpi(v / 180.f).
+ *
+ * See also native_sinpi().
+ */
+extern float __attribute__((const, overloadable))
+    sinpi(float v);
+
+extern float2 __attribute__((const, overloadable))
+    sinpi(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    sinpi(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    sinpi(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    sinpi(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    sinpi(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    sinpi(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    sinpi(half4 v);
+#endif
+
+/*
+ * sqrt: Square root
+ *
+ * Returns the square root of a value.
+ *
+ * See also half_sqrt(), native_sqrt().
+ */
+extern float __attribute__((const, overloadable))
+    sqrt(float v);
+
+extern float2 __attribute__((const, overloadable))
+    sqrt(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    sqrt(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    sqrt(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    sqrt(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    sqrt(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    sqrt(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    sqrt(half4 v);
+#endif
+
+/*
+ * step: 0 if less than a value, 1 otherwise
+ *
+ * Returns 0.f if v < edge, 1.f otherwise.
+ *
+ * This can be useful to create conditional computations without using loops and branching
+ * instructions.  For example, instead of computing (a[i] < b[i]) ? 0.f : atan2(a[i], b[i])
+ * for the corresponding elements of a vector, you could instead use step(b, a) * atan2(a, b).
+ */
+extern float __attribute__((const, overloadable))
+    step(float edge, float v);
+
+extern float2 __attribute__((const, overloadable))
+    step(float2 edge, float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    step(float3 edge, float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    step(float4 edge, float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    step(half edge, half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    step(half2 edge, half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    step(half3 edge, half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    step(half4 edge, half4 v);
+#endif
+
+extern float2 __attribute__((const, overloadable))
+    step(float2 edge, float v);
+
+extern float3 __attribute__((const, overloadable))
+    step(float3 edge, float v);
+
+extern float4 __attribute__((const, overloadable))
+    step(float4 edge, float v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    step(half2 edge, half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    step(half3 edge, half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    step(half4 edge, half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    step(float edge, float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    step(float edge, float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    step(float edge, float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    step(half edge, half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    step(half edge, half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    step(half edge, half4 v);
+#endif
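+
+/*
+ * Example (illustrative sketch; the kernel name and cutoff value are
+ * hypothetical): step() as a branch-free threshold.
+ *
+ *   float RS_KERNEL suppressNoise(float v) {
+ *       // Zeroes values below 0.02f without a branch:
+ *       // step(0.02f, v) is 0.f when v < 0.02f and 1.f otherwise.
+ *       return step(0.02f, v) * v;
+ *   }
+ */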
+
+/*
+ * tan: Tangent
+ *
+ * Returns the tangent of an angle measured in radians.
+ *
+ * See also native_tan().
+ */
+extern float __attribute__((const, overloadable))
+    tan(float v);
+
+extern float2 __attribute__((const, overloadable))
+    tan(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    tan(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    tan(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    tan(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    tan(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    tan(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    tan(half4 v);
+#endif
+
+/*
+ * tanh: Hyperbolic tangent
+ *
+ * Returns the hyperbolic tangent of a value.
+ *
+ * See also native_tanh().
+ */
+extern float __attribute__((const, overloadable))
+    tanh(float v);
+
+extern float2 __attribute__((const, overloadable))
+    tanh(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    tanh(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    tanh(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    tanh(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    tanh(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    tanh(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    tanh(half4 v);
+#endif
+
+/*
+ * tanpi: Tangent of a number multiplied by pi
+ *
+ * Returns the tangent of (v * pi), where (v * pi) is measured in radians.
+ *
+ * To get the tangent of a value measured in degrees, call tanpi(v / 180.f).
+ *
+ * See also native_tanpi().
+ */
+extern float __attribute__((const, overloadable))
+    tanpi(float v);
+
+extern float2 __attribute__((const, overloadable))
+    tanpi(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    tanpi(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    tanpi(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    tanpi(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    tanpi(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    tanpi(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    tanpi(half4 v);
+#endif
+
+/*
+ * tgamma: Gamma function
+ *
+ * Returns the gamma function of a value.
+ *
+ * See also lgamma().
+ */
+extern float __attribute__((const, overloadable))
+    tgamma(float v);
+
+extern float2 __attribute__((const, overloadable))
+    tgamma(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    tgamma(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    tgamma(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    tgamma(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    tgamma(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    tgamma(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    tgamma(half4 v);
+#endif
+
+/*
+ * trunc: Truncates a floating point value
+ *
+ * Rounds to integral using truncation.
+ *
+ * For example, trunc(1.7f) returns 1.f and trunc(-1.7f) returns -1.f.
+ *
+ * See rint() and round() for other rounding options.
+ */
+extern float __attribute__((const, overloadable))
+    trunc(float v);
+
+extern float2 __attribute__((const, overloadable))
+    trunc(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    trunc(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    trunc(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    trunc(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    trunc(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    trunc(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    trunc(half4 v);
+#endif
+
+/*
+ * rsClamp: Restrain a value to a range
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Clamp a value between low and high.
+ *
+ * Parameters:
+ *   amount: Value to clamp.
+ *   low: Lower bound.
+ *   high: Upper bound.
+ */
+extern char __attribute__((const, overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated("Use clamp() instead.")
+#endif
+))
+    rsClamp(char amount, char low, char high);
+
+extern uchar __attribute__((const, overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated("Use clamp() instead.")
+#endif
+))
+    rsClamp(uchar amount, uchar low, uchar high);
+
+extern short __attribute__((const, overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated("Use clamp() instead.")
+#endif
+))
+    rsClamp(short amount, short low, short high);
+
+extern ushort __attribute__((const, overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated("Use clamp() instead.")
+#endif
+))
+    rsClamp(ushort amount, ushort low, ushort high);
+
+extern int __attribute__((const, overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated("Use clamp() instead.")
+#endif
+))
+    rsClamp(int amount, int low, int high);
+
+extern uint __attribute__((const, overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated("Use clamp() instead.")
+#endif
+))
+    rsClamp(uint amount, uint low, uint high);
+
+/*
+ * rsFrac: Returns the fractional part of a float
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Returns the fractional part of a float.
+ */
+extern float __attribute__((const, overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated("Use fract() instead.")
+#endif
+))
+    rsFrac(float v);
+
+/*
+ * rsRand: Pseudo-random number
+ *
+ * Returns a random value between 0 (or min_value) and max_value.
+ */
+extern int __attribute__((overloadable))
+    rsRand(int max_value);
+
+extern int __attribute__((overloadable))
+    rsRand(int min_value, int max_value);
+
+extern float __attribute__((overloadable))
+    rsRand(float max_value);
+
+extern float __attribute__((overloadable))
+    rsRand(float min_value, float max_value);
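+
+/*
+ * Example (illustrative sketch; the variable names and ranges are arbitrary):
+ *
+ *   float jitter = rsRand(-0.5f, 0.5f);  // random float between -0.5f and 0.5f
+ *   int   pick   = rsRand(0, 10);        // random int between 0 and 10
+ */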
+
+#endif // RENDERSCRIPT_RS_MATH_RSH
diff --git a/24.0.3/include/rs_matrix.rsh b/24.0.3/include/rs_matrix.rsh
new file mode 100644
index 0000000..9cdc27f
--- /dev/null
+++ b/24.0.3/include/rs_matrix.rsh
@@ -0,0 +1,612 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Don't edit this file!  It is auto-generated by frameworks/rs/api/generate.sh.
+
+/*
+ * rs_matrix.rsh: Matrix Functions
+ *
+ * These functions let you manipulate square matrices of rank 2x2, 3x3, and 4x4.
+ * They are particularly useful for graphical transformations and are compatible
+ * with OpenGL.
+ *
+ * We use a zero-based index for rows and columns.  E.g. the last element of a
+ * rs_matrix4x4 is found at (3, 3).
+ *
+ * RenderScript uses column-major matrices and column-based vectors.  Transforming
+ * a vector is done by postmultiplying the vector, e.g. (matrix * vector),
+ * as provided by rsMatrixMultiply().
+ *
+ * To create a transformation matrix that performs two transformations at once,
+ * multiply the two source matrices, with the first transformation as the right
+ * argument.  E.g. to create a transformation matrix that applies the
+ * transformation s1 followed by s2, call rsMatrixLoadMultiply(&combined, &s2, &s1).
+ * This derives from s2 * (s1 * v), which is (s2 * s1) * v.
+ *
+ * We have two styles of functions to create transformation matrices:
+ * rsMatrixLoadTransformation and rsMatrixTransformation.  The former
+ * style simply stores the transformation matrix in the first argument.  The latter
+ * modifies a pre-existing transformation matrix so that the new transformation
+ * happens first.  E.g. if you call rsMatrixTranslate() on a matrix that already
+ * does a scaling, the resulting matrix when applied to a vector will first do the
+ * translation then the scaling.
+ */
+
+#ifndef RENDERSCRIPT_RS_MATRIX_RSH
+#define RENDERSCRIPT_RS_MATRIX_RSH
+
+#include "rs_vector_math.rsh"
+
+/*
+ * rsExtractFrustumPlanes: Compute frustum planes
+ *
+ * Computes the 6 frustum planes from the view projection matrix.
+ *
+ * Parameters:
+ *   viewProj: Matrix to extract planes from.
+ *   left: Left plane.
+ *   right: Right plane.
+ *   top: Top plane.
+ *   bottom: Bottom plane.
+ *   near: Near plane.
+ *   far: Far plane.
+ */
+#if !defined(RS_VERSION) || (RS_VERSION <= 23)
+static inline void __attribute__((overloadable))
+    rsExtractFrustumPlanes(const rs_matrix4x4* viewProj, float4* left, float4* right, float4* top,
+                           float4* bottom, float4* near, float4* far) {
+    // x y z w = a b c d in the plane equation
+    left->x = viewProj->m[3] + viewProj->m[0];
+    left->y = viewProj->m[7] + viewProj->m[4];
+    left->z = viewProj->m[11] + viewProj->m[8];
+    left->w = viewProj->m[15] + viewProj->m[12];
+
+    right->x = viewProj->m[3] - viewProj->m[0];
+    right->y = viewProj->m[7] - viewProj->m[4];
+    right->z = viewProj->m[11] - viewProj->m[8];
+    right->w = viewProj->m[15] - viewProj->m[12];
+
+    top->x = viewProj->m[3] - viewProj->m[1];
+    top->y = viewProj->m[7] - viewProj->m[5];
+    top->z = viewProj->m[11] - viewProj->m[9];
+    top->w = viewProj->m[15] - viewProj->m[13];
+
+    bottom->x = viewProj->m[3] + viewProj->m[1];
+    bottom->y = viewProj->m[7] + viewProj->m[5];
+    bottom->z = viewProj->m[11] + viewProj->m[9];
+    bottom->w = viewProj->m[15] + viewProj->m[13];
+
+    near->x = viewProj->m[3] + viewProj->m[2];
+    near->y = viewProj->m[7] + viewProj->m[6];
+    near->z = viewProj->m[11] + viewProj->m[10];
+    near->w = viewProj->m[15] + viewProj->m[14];
+
+    far->x = viewProj->m[3] - viewProj->m[2];
+    far->y = viewProj->m[7] - viewProj->m[6];
+    far->z = viewProj->m[11] - viewProj->m[10];
+    far->w = viewProj->m[15] - viewProj->m[14];
+
+    float len = length(left->xyz);
+    *left /= len;
+    len = length(right->xyz);
+    *right /= len;
+    len = length(top->xyz);
+    *top /= len;
+    len = length(bottom->xyz);
+    *bottom /= len;
+    len = length(near->xyz);
+    *near /= len;
+    len = length(far->xyz);
+    *far /= len;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern void __attribute__((overloadable))
+    rsExtractFrustumPlanes(const rs_matrix4x4* viewProj, float4* left, float4* right, float4* top,
+                           float4* bottom, float4* near, float4* far);
+#endif
+
+/*
+ * rsIsSphereInFrustum: Checks if a sphere is within the frustum planes
+ *
+ * Returns true if the sphere is within the 6 frustum planes.
+ *
+ * Parameters:
+ *   sphere: float4 representing the sphere.
+ *   left: Left plane.
+ *   right: Right plane.
+ *   top: Top plane.
+ *   bottom: Bottom plane.
+ *   near: Near plane.
+ *   far: Far plane.
+ */
+#if !defined(RS_VERSION) || (RS_VERSION <= 23)
+static inline bool __attribute__((always_inline, overloadable))
+    rsIsSphereInFrustum(float4* sphere, float4* left, float4* right, float4* top, float4* bottom,
+                        float4* near, float4* far) {
+    float distToCenter = dot(left->xyz, sphere->xyz) + left->w;
+    if (distToCenter < -sphere->w) {
+        return false;
+    }
+    distToCenter = dot(right->xyz, sphere->xyz) + right->w;
+    if (distToCenter < -sphere->w) {
+        return false;
+    }
+    distToCenter = dot(top->xyz, sphere->xyz) + top->w;
+    if (distToCenter < -sphere->w) {
+        return false;
+    }
+    distToCenter = dot(bottom->xyz, sphere->xyz) + bottom->w;
+    if (distToCenter < -sphere->w) {
+        return false;
+    }
+    distToCenter = dot(near->xyz, sphere->xyz) + near->w;
+    if (distToCenter < -sphere->w) {
+        return false;
+    }
+    distToCenter = dot(far->xyz, sphere->xyz) + far->w;
+    if (distToCenter < -sphere->w) {
+        return false;
+    }
+    return true;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern bool __attribute__((overloadable))
+    rsIsSphereInFrustum(float4* sphere, float4* left, float4* right, float4* top, float4* bottom,
+                        float4* near, float4* far);
+#endif
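+
+/*
+ * Example (illustrative sketch; viewProj and sphere (xyz = center, w = radius)
+ * are hypothetical globals): culling a bounding sphere against a view
+ * projection matrix.
+ *
+ *   float4 left, right, top, bottom, near, far;
+ *   rsExtractFrustumPlanes(&viewProj, &left, &right, &top, &bottom, &near, &far);
+ *   bool visible = rsIsSphereInFrustum(&sphere, &left, &right, &top, &bottom,
+ *                                      &near, &far);
+ */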
+
+/*
+ * rsMatrixGet: Get one element
+ *
+ * Returns one element of a matrix.
+ *
+ * Warning: The order of the column and row parameters may be unexpected.
+ *
+ * Parameters:
+ *   m: Matrix to extract the element from.
+ *   col: Zero-based column of the element to be extracted.
+ *   row: Zero-based row of the element to be extracted.
+ */
+extern float __attribute__((overloadable))
+    rsMatrixGet(const rs_matrix4x4* m, uint32_t col, uint32_t row);
+
+extern float __attribute__((overloadable))
+    rsMatrixGet(const rs_matrix3x3* m, uint32_t col, uint32_t row);
+
+extern float __attribute__((overloadable))
+    rsMatrixGet(const rs_matrix2x2* m, uint32_t col, uint32_t row);
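+
+/*
+ * Example (illustrative sketch; m is a hypothetical rs_matrix4x4): note that
+ * the column comes before the row.
+ *
+ *   // Reads the element at row 3, column 0, i.e. the bottom element of the
+ *   // first column.
+ *   float e = rsMatrixGet(&m, 0, 3);
+ */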
+
+/*
+ * rsMatrixInverse: Inverts a matrix in place
+ *
+ * Returns true if the matrix was successfully inverted.
+ *
+ * Parameters:
+ *   m: Matrix to invert.
+ */
+extern bool __attribute__((overloadable))
+    rsMatrixInverse(rs_matrix4x4* m);
+
+/*
+ * rsMatrixInverseTranspose: Inverts and transposes a matrix in place
+ *
+ * The matrix is first inverted then transposed. Returns true if the matrix was
+ * successfully inverted.
+ *
+ * Parameters:
+ *   m: Matrix to modify.
+ */
+extern bool __attribute__((overloadable))
+    rsMatrixInverseTranspose(rs_matrix4x4* m);
+
+/*
+ * rsMatrixLoad: Load or copy a matrix
+ *
+ * Set the elements of a matrix from an array of floats or from another matrix.
+ *
+ * If loading from an array, the floats should be in row-major order, i.e. the element at
+ * row 0, column 0 should be first, followed by the element at
+ * row 0, column 1, etc.
+ *
+ * If loading from a matrix and the source is smaller than the destination, the rest
+ * of the destination is filled with elements of the identity matrix.  E.g.
+ * loading a rs_matrix2x2 into a rs_matrix4x4 will give:
+ *
+ * m00 m01 0.0 0.0
+ * m10 m11 0.0 0.0
+ * 0.0 0.0 1.0 0.0
+ * 0.0 0.0 0.0 1.0
+ *
+ *
+ * Parameters:
+ *   destination: Matrix to set.
+ *   array: Array of values to set the matrix to. These arrays should be 4, 9, or 16 floats long, depending on the matrix size.
+ *   source: Source matrix.
+ */
+extern void __attribute__((overloadable))
+    rsMatrixLoad(rs_matrix4x4* destination, const float* array);
+
+extern void __attribute__((overloadable))
+    rsMatrixLoad(rs_matrix3x3* destination, const float* array);
+
+extern void __attribute__((overloadable))
+    rsMatrixLoad(rs_matrix2x2* destination, const float* array);
+
+extern void __attribute__((overloadable))
+    rsMatrixLoad(rs_matrix4x4* destination, const rs_matrix4x4* source);
+
+extern void __attribute__((overloadable))
+    rsMatrixLoad(rs_matrix3x3* destination, const rs_matrix3x3* source);
+
+extern void __attribute__((overloadable))
+    rsMatrixLoad(rs_matrix2x2* destination, const rs_matrix2x2* source);
+
+extern void __attribute__((overloadable))
+    rsMatrixLoad(rs_matrix4x4* destination, const rs_matrix3x3* source);
+
+extern void __attribute__((overloadable))
+    rsMatrixLoad(rs_matrix4x4* destination, const rs_matrix2x2* source);
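+
+/*
+ * Example (illustrative sketch; the array contents are arbitrary): loading a
+ * 3x3 matrix from a row-major array.
+ *
+ *   const float vals[9] = { 1.f, 2.f, 3.f,    // row 0
+ *                           4.f, 5.f, 6.f,    // row 1
+ *                           7.f, 8.f, 9.f };  // row 2
+ *   rs_matrix3x3 m;
+ *   rsMatrixLoad(&m, vals);
+ */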
+
+/*
+ * rsMatrixLoadFrustum: Load a frustum projection matrix
+ *
+ * Constructs a frustum projection matrix, transforming the box identified by
+ * the six clipping planes left, right, bottom, top, near, far.
+ *
+ * To apply this projection to a vector, multiply the vector by the created
+ * matrix using rsMatrixMultiply().
+ *
+ * Parameters:
+ *   m: Matrix to set.
+ */
+extern void __attribute__((overloadable))
+    rsMatrixLoadFrustum(rs_matrix4x4* m, float left, float right, float bottom, float top,
+                        float near, float far);
+
+/*
+ * rsMatrixLoadIdentity: Load identity matrix
+ *
+ * Set the elements of a matrix to the identity matrix.
+ *
+ * Parameters:
+ *   m: Matrix to set.
+ */
+extern void __attribute__((overloadable))
+    rsMatrixLoadIdentity(rs_matrix4x4* m);
+
+extern void __attribute__((overloadable))
+    rsMatrixLoadIdentity(rs_matrix3x3* m);
+
+extern void __attribute__((overloadable))
+    rsMatrixLoadIdentity(rs_matrix2x2* m);
+
+/*
+ * rsMatrixLoadMultiply: Multiply two matrices
+ *
+ * Sets m to the matrix product of lhs * rhs.
+ *
+ * To combine two 4x4 transformation matrices, multiply the second transformation matrix
+ * by the first transformation matrix.  E.g. to create a transformation matrix that applies
+ * the transformation s1 followed by s2, call rsMatrixLoadMultiply(&combined, &s2, &s1).
+ *
+ * Warning: Prior to version 21, storing the result back into the right matrix is not supported
+ * and will result in undefined behavior.  Use rsMatrixMultiply() instead.  E.g. instead of doing
+ * rsMatrixLoadMultiply(&m2r, &m2r, &m2l), use rsMatrixMultiply(&m2r, &m2l).
+ * rsMatrixLoadMultiply(&m2l, &m2r, &m2l) works as expected.
+ *
+ * Parameters:
+ *   m: Matrix to set.
+ *   lhs: Left matrix of the product.
+ *   rhs: Right matrix of the product.
+ */
+extern void __attribute__((overloadable))
+    rsMatrixLoadMultiply(rs_matrix4x4* m, const rs_matrix4x4* lhs, const rs_matrix4x4* rhs);
+
+extern void __attribute__((overloadable))
+    rsMatrixLoadMultiply(rs_matrix3x3* m, const rs_matrix3x3* lhs, const rs_matrix3x3* rhs);
+
+extern void __attribute__((overloadable))
+    rsMatrixLoadMultiply(rs_matrix2x2* m, const rs_matrix2x2* lhs, const rs_matrix2x2* rhs);
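+
+/*
+ * Example (illustrative sketch; the matrix names are hypothetical): combining
+ * a scale followed by a translation, with the first transformation passed as
+ * the right-hand argument as described above.
+ *
+ *   rs_matrix4x4 scale, translate, combined;
+ *   rsMatrixLoadScale(&scale, 2.f, 2.f, 2.f);
+ *   rsMatrixLoadTranslate(&translate, 5.f, 0.f, 0.f);
+ *   // combined * v first scales v, then translates the result.
+ *   rsMatrixLoadMultiply(&combined, &translate, &scale);
+ */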
+
+/*
+ * rsMatrixLoadOrtho: Load an orthographic projection matrix
+ *
+ * Constructs an orthographic projection matrix, transforming the box identified by the
+ * six clipping planes left, right, bottom, top, near, far into a unit cube
+ * with a corner at (-1, -1, -1) and the opposite at (1, 1, 1).
+ *
+ * To apply this projection to a vector, multiply the vector by the created matrix
+ * using rsMatrixMultiply().
+ *
+ * See https://en.wikipedia.org/wiki/Orthographic_projection .
+ *
+ * Parameters:
+ *   m: Matrix to set.
+ */
+extern void __attribute__((overloadable))
+    rsMatrixLoadOrtho(rs_matrix4x4* m, float left, float right, float bottom, float top, float near,
+                      float far);
+
+/*
+ * rsMatrixLoadPerspective: Load a perspective projection matrix
+ *
+ * Constructs a perspective projection matrix, assuming a symmetrical field of view.
+ *
+ * To apply this projection to a vector, multiply the vector by the created matrix
+ * using rsMatrixMultiply().
+ *
+ * Parameters:
+ *   m: Matrix to set.
+ *   fovy: Field of view, in degrees along the Y axis.
+ *   aspect: Ratio of x / y.
+ *   near: Near clipping plane.
+ *   far: Far clipping plane.
+ */
+extern void __attribute__((overloadable))
+    rsMatrixLoadPerspective(rs_matrix4x4* m, float fovy, float aspect, float near, float far);
+
+/*
+ * rsMatrixLoadRotate: Load a rotation matrix
+ *
+ * This function creates a rotation matrix.  The axis of rotation is the (x, y, z) vector.
+ *
+ * To rotate a vector, multiply the vector by the created matrix using rsMatrixMultiply().
+ *
+ * See http://en.wikipedia.org/wiki/Rotation_matrix .
+ *
+ * Parameters:
+ *   m: Matrix to set.
+ *   rot: How much rotation to do, in degrees.
+ *   x: X component of the vector that is the axis of rotation.
+ *   y: Y component of the vector that is the axis of rotation.
+ *   z: Z component of the vector that is the axis of rotation.
+ */
+extern void __attribute__((overloadable))
+    rsMatrixLoadRotate(rs_matrix4x4* m, float rot, float x, float y, float z);
+
+/*
+ * rsMatrixLoadScale: Load a scaling matrix
+ *
+ * This function creates a scaling matrix, where each component of a vector is multiplied
+ * by a number.  This number can be negative.
+ *
+ * To scale a vector, multiply the vector by the created matrix using rsMatrixMultiply().
+ *
+ * Parameters:
+ *   m: Matrix to set.
+ *   x: Multiple to scale the x components by.
+ *   y: Multiple to scale the y components by.
+ *   z: Multiple to scale the z components by.
+ */
+extern void __attribute__((overloadable))
+    rsMatrixLoadScale(rs_matrix4x4* m, float x, float y, float z);
+
+/*
+ * rsMatrixLoadTranslate: Load a translation matrix
+ *
+ * This function creates a translation matrix, where a number is added to each element of
+ * a vector.
+ *
+ * To translate a vector, multiply the vector by the created matrix using
+ * rsMatrixMultiply().
+ *
+ * Parameters:
+ *   m: Matrix to set.
+ *   x: Number to add to each x component.
+ *   y: Number to add to each y component.
+ *   z: Number to add to each z component.
+ */
+extern void __attribute__((overloadable))
+    rsMatrixLoadTranslate(rs_matrix4x4* m, float x, float y, float z);
+
+/*
+ * rsMatrixMultiply: Multiply a matrix by a vector or another matrix
+ *
+ * For the matrix by matrix variant, sets m to the matrix product m * rhs.
+ *
+ * When combining two 4x4 transformation matrices using this function, the resulting
+ * matrix will correspond to performing the rhs transformation first followed by
+ * the original m transformation.
+ *
+ * For the matrix by vector variant, returns the post-multiplication of the vector
+ * by the matrix, i.e. m * in.
+ *
+ * When multiplying a float3 by a rs_matrix4x4, the vector is expanded with (1).
+ *
+ * When multiplying a float2 by a rs_matrix4x4, the vector is expanded with (0, 1).
+ *
+ * When multiplying a float2 by a rs_matrix3x3, the vector is expanded with (0).
+ *
+ * Starting with API 14, this function takes a const matrix as the first argument.
+ *
+ * Parameters:
+ *   m: Left matrix of the product and the matrix to be set.
+ *   rhs: Right matrix of the product.
+ */
+extern void __attribute__((overloadable))
+    rsMatrixMultiply(rs_matrix4x4* m, const rs_matrix4x4* rhs);
+
+extern void __attribute__((overloadable))
+    rsMatrixMultiply(rs_matrix3x3* m, const rs_matrix3x3* rhs);
+
+extern void __attribute__((overloadable))
+    rsMatrixMultiply(rs_matrix2x2* m, const rs_matrix2x2* rhs);
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 13)
+extern float4 __attribute__((overloadable))
+    rsMatrixMultiply(rs_matrix4x4* m, float4 in);
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 13)
+extern float4 __attribute__((overloadable))
+    rsMatrixMultiply(rs_matrix4x4* m, float3 in);
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 13)
+extern float4 __attribute__((overloadable))
+    rsMatrixMultiply(rs_matrix4x4* m, float2 in);
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 13)
+extern float3 __attribute__((overloadable))
+    rsMatrixMultiply(rs_matrix3x3* m, float3 in);
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 13)
+extern float3 __attribute__((overloadable))
+    rsMatrixMultiply(rs_matrix3x3* m, float2 in);
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 13)
+extern float2 __attribute__((overloadable))
+    rsMatrixMultiply(rs_matrix2x2* m, float2 in);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+extern float4 __attribute__((overloadable))
+    rsMatrixMultiply(const rs_matrix4x4* m, float4 in);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+extern float4 __attribute__((overloadable))
+    rsMatrixMultiply(const rs_matrix4x4* m, float3 in);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+extern float4 __attribute__((overloadable))
+    rsMatrixMultiply(const rs_matrix4x4* m, float2 in);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+extern float3 __attribute__((overloadable))
+    rsMatrixMultiply(const rs_matrix3x3* m, float3 in);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+extern float3 __attribute__((overloadable))
+    rsMatrixMultiply(const rs_matrix3x3* m, float2 in);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+extern float2 __attribute__((overloadable))
+    rsMatrixMultiply(const rs_matrix2x2* m, float2 in);
+#endif
+
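+/*
+ * Example (illustrative sketch; variable names are hypothetical): transforming
+ * a 2D point with a 3x3 matrix.  As described above, the float2 is expanded
+ * with (0) before the multiplication, so a float3 is returned.
+ *
+ *   rs_matrix3x3 m;
+ *   rsMatrixLoadIdentity(&m);
+ *   float3 out = rsMatrixMultiply(&m, (float2){3.f, 4.f});
+ */
+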
+/*
+ * rsMatrixRotate: Apply a rotation to a transformation matrix
+ *
+ * Multiply the matrix m with a rotation matrix.
+ *
+ * This function modifies a transformation matrix to first do a rotation.  The axis of
+ * rotation is the (x, y, z) vector.
+ *
+ * To apply this combined transformation to a vector, multiply the vector by the created
+ * matrix using rsMatrixMultiply().
+ *
+ * Parameters:
+ *   m: Matrix to modify.
+ *   rot: How much rotation to do, in degrees.
+ *   x: X component of the vector that is the axis of rotation.
+ *   y: Y component of the vector that is the axis of rotation.
+ *   z: Z component of the vector that is the axis of rotation.
+ */
+extern void __attribute__((overloadable))
+    rsMatrixRotate(rs_matrix4x4* m, float rot, float x, float y, float z);
+
+/*
+ * rsMatrixScale: Apply a scaling to a transformation matrix
+ *
+ * Multiply the matrix m with a scaling matrix.
+ *
+ * This function modifies a transformation matrix to first do a scaling.   When scaling,
+ * each component of a vector is multiplied by a number.  This number can be negative.
+ *
+ * To apply this combined transformation to a vector, multiply the vector by the created
+ * matrix using rsMatrixMultiply().
+ *
+ * Parameters:
+ *   m: Matrix to modify.
+ *   x: Multiple to scale the x components by.
+ *   y: Multiple to scale the y components by.
+ *   z: Multiple to scale the z components by.
+ */
+extern void __attribute__((overloadable))
+    rsMatrixScale(rs_matrix4x4* m, float x, float y, float z);
+
+/*
+ * rsMatrixSet: Set one element
+ *
+ * Set an element of a matrix.
+ *
+ * Warning: The order of the column and row parameters may be unexpected.
+ *
+ * Parameters:
+ *   m: Matrix that will be modified.
+ *   col: Zero-based column of the element to be set.
+ *   row: Zero-based row of the element to be set.
+ *   v: Value to set.
+ */
+extern void __attribute__((overloadable))
+    rsMatrixSet(rs_matrix4x4* m, uint32_t col, uint32_t row, float v);
+
+extern void __attribute__((overloadable))
+    rsMatrixSet(rs_matrix3x3* m, uint32_t col, uint32_t row, float v);
+
+extern void __attribute__((overloadable))
+    rsMatrixSet(rs_matrix2x2* m, uint32_t col, uint32_t row, float v);
+
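+/*
+ * Example (illustrative sketch): the column index comes before the row index,
+ * so this call writes the element in column 3, row 0.
+ *
+ *   rs_matrix4x4 m;
+ *   rsMatrixLoadIdentity(&m);
+ *   rsMatrixSet(&m, 3, 0, 10.f);
+ */
+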
+/*
+ * rsMatrixTranslate: Apply a translation to a transformation matrix
+ *
+ * Multiply the matrix m with a translation matrix.
+ *
+ * This function modifies a transformation matrix to first do a translation.  When
+ * translating, a number is added to each component of a vector.
+ *
+ * To apply this combined transformation to a vector, multiply the vector by the
+ * created matrix using rsMatrixMultiply().
+ *
+ * Parameters:
+ *   m: Matrix to modify.
+ *   x: Number to add to each x component.
+ *   y: Number to add to each y component.
+ *   z: Number to add to each z component.
+ */
+extern void __attribute__((overloadable))
+    rsMatrixTranslate(rs_matrix4x4* m, float x, float y, float z);
+
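+/*
+ * Example (illustrative sketch): building a transform that first rotates and
+ * then translates a point.  Because each rsMatrix* helper modifies the matrix
+ * so that its own operation happens first, the calls are made in the reverse
+ * of the order in which the operations are applied to a vector.
+ *
+ *   rs_matrix4x4 m;
+ *   rsMatrixLoadIdentity(&m);
+ *   rsMatrixTranslate(&m, 5.f, 0.f, 0.f);    // applied second
+ *   rsMatrixRotate(&m, 45.f, 0.f, 0.f, 1.f); // applied first
+ */
+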
+/*
+ * rsMatrixTranspose: Transpose a matrix in place
+ *
+ * Transpose the matrix m in place.
+ *
+ * Parameters:
+ *   m: Matrix to transpose.
+ */
+extern void __attribute__((overloadable))
+    rsMatrixTranspose(rs_matrix4x4* m);
+
+extern void __attribute__((overloadable))
+    rsMatrixTranspose(rs_matrix3x3* m);
+
+extern void __attribute__((overloadable))
+    rsMatrixTranspose(rs_matrix2x2* m);
+
+#endif // RENDERSCRIPT_RS_MATRIX_RSH
diff --git a/24.0.3/include/rs_object_info.rsh b/24.0.3/include/rs_object_info.rsh
new file mode 100644
index 0000000..0b18de3
--- /dev/null
+++ b/24.0.3/include/rs_object_info.rsh
@@ -0,0 +1,462 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Don't edit this file!  It is auto-generated by frameworks/rs/api/generate.sh.
+
+/*
+ * rs_object_info.rsh: Object Characteristics Functions
+ *
+ * The functions below can be used to query the characteristics of an Allocation, Element,
+ * or Sampler object.  These objects are created from Java.  You can't create them from a
+ * script.
+ *
+ * Allocations:
+ *
+ * Allocations are the primary method used to pass data to and from RenderScript kernels.
+ *
+ * They are a structured collection of cells that can be used to store bitmaps, textures,
+ * arbitrary data points, etc.
+ *
+ * This collection of cells may have many dimensions (X, Y, Z, Array0, Array1, Array2, Array3),
+ * faces (for cubemaps), and levels of detail (for mipmapping).
+ *
+ * See android.renderscript.Allocation for details on how to create Allocations.
+ *
+ * Elements:
+ *
+ * The term "element" is used a bit ambiguously in RenderScript, as both type information
+ * for the cells of an Allocation and the instantiation of that type.  For example:
+ * - rs_element is a handle to a type specification, and
+ * - In functions like rsGetElementAt(), "element" means the instantiation of the type,
+ *     i.e. a cell of an Allocation.
+ *
+ * The functions below let you query the characteristics of the type specification.
+ *
+ * An Element can specify a simple data type as found in C, e.g. an integer, float, or
+ * boolean.  It can also specify a handle to a RenderScript object.  See rs_data_type for
+ * a list of basic types.
+ *
+ * Elements can specify fixed size vector (of size 2, 3, or 4) versions of the basic types.
+ * Elements can be grouped together into complex Elements, creating the equivalent of
+ * C structure definitions.
+ *
+ * Elements can also have a kind, which is semantic information used to interpret pixel
+ * data.  See rs_data_kind.
+ *
+ * When creating Allocations of common elements, you can simply use one of the many predefined
+ * Elements like F32_2.
+ *
+ * To create complex Elements, use the Element.Builder Java class.
+ *
+ * Samplers:
+ *
+ * Sampler objects define how Allocations can be read as textures within a kernel.
+ * See android.renderscript.Sampler.
+ */
+
+#ifndef RENDERSCRIPT_RS_OBJECT_INFO_RSH
+#define RENDERSCRIPT_RS_OBJECT_INFO_RSH
+
+/*
+ * rsAllocationGetDimFaces: Presence of more than one face
+ *
+ * If the Allocation is a cubemap, this function returns 1 if there's more than one face
+ * present.  In all other cases, it returns 0.
+ *
+ * Use rsGetDimHasFaces() to get the dimension of a currently running kernel.
+ *
+ * Returns: Returns 1 if more than one face is present, 0 otherwise.
+ */
+extern uint32_t __attribute__((overloadable))
+    rsAllocationGetDimFaces(rs_allocation a);
+
+/*
+ * rsAllocationGetDimLOD: Presence of levels of detail
+ *
+ * Query an Allocation for the presence of more than one Level Of Detail.  This is useful
+ * for mipmaps.
+ *
+ * Use rsGetDimLod() to get the dimension of a currently running kernel.
+ *
+ * Returns: Returns 1 if more than one LOD is present, 0 otherwise.
+ */
+extern uint32_t __attribute__((overloadable))
+    rsAllocationGetDimLOD(rs_allocation a);
+
+/*
+ * rsAllocationGetDimX: Size of the X dimension
+ *
+ * Returns the size of the X dimension of the Allocation.
+ *
+ * Use rsGetDimX() to get the dimension of a currently running kernel.
+ *
+ * Returns: X dimension of the Allocation.
+ */
+extern uint32_t __attribute__((overloadable))
+    rsAllocationGetDimX(rs_allocation a);
+
+/*
+ * rsAllocationGetDimY: Size of the Y dimension
+ *
+ * Returns the size of the Y dimension of the Allocation.  If the Allocation has less
+ * than two dimensions, returns 0.
+ *
+ * Use rsGetDimY() to get the dimension of a currently running kernel.
+ *
+ * Returns: Y dimension of the Allocation.
+ */
+extern uint32_t __attribute__((overloadable))
+    rsAllocationGetDimY(rs_allocation a);
+
+/*
+ * rsAllocationGetDimZ: Size of the Z dimension
+ *
+ * Returns the size of the Z dimension of the Allocation.  If the Allocation has less
+ * than three dimensions, returns 0.
+ *
+ * Use rsGetDimZ() to get the dimension of a currently running kernel.
+ *
+ * Returns: Z dimension of the Allocation.
+ */
+extern uint32_t __attribute__((overloadable))
+    rsAllocationGetDimZ(rs_allocation a);
+
+/*
+ * rsAllocationGetElement: Get the object that describes the cell of an Allocation
+ *
+ * Get the Element object describing the type, kind, and other characteristics of a cell
+ * of an Allocation.  See the rsElement* functions below.
+ *
+ * Parameters:
+ *   a: Allocation to get data from.
+ *
+ * Returns: Element describing Allocation layout.
+ */
+extern rs_element __attribute__((overloadable))
+    rsAllocationGetElement(rs_allocation a);
+
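+/*
+ * Example (illustrative sketch; gIn is a hypothetical rs_allocation global set
+ * from Java): computing the number of cells in an Allocation with up to two
+ * dimensions.
+ *
+ *   uint32_t x = rsAllocationGetDimX(gIn);
+ *   uint32_t y = rsAllocationGetDimY(gIn);  // 0 if gIn has only one dimension
+ *   uint32_t cells = x * (y ? y : 1);
+ */
+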
+/*
+ * rsClearObject: Release an object
+ *
+ * Tells the runtime that this handle will no longer be used to access the related
+ * object.  If this was the last handle to that object, resource recovery may happen.
+ *
+ * After calling this function, *dst will be set to an empty handle.  See rsIsObject().
+ */
+extern void __attribute__((overloadable))
+    rsClearObject(rs_element* dst);
+
+extern void __attribute__((overloadable))
+    rsClearObject(rs_type* dst);
+
+extern void __attribute__((overloadable))
+    rsClearObject(rs_allocation* dst);
+
+extern void __attribute__((overloadable))
+    rsClearObject(rs_sampler* dst);
+
+extern void __attribute__((overloadable))
+    rsClearObject(rs_script* dst);
+
+/*
+ * rsIsObject: Check for an empty handle
+ *
+ * Returns true if the handle contains a non-null reference.
+ *
+ * This function does not validate that the internal pointer used in the handle
+ * points to an actual valid object; it only checks for null.
+ *
+ * This function can be used to check the Element returned by rsElementGetSubElement()
+ * or see if rsClearObject() has been called on a handle.
+ */
+extern bool __attribute__((overloadable))
+    rsIsObject(rs_element v);
+
+extern bool __attribute__((overloadable))
+    rsIsObject(rs_type v);
+
+extern bool __attribute__((overloadable))
+    rsIsObject(rs_allocation v);
+
+extern bool __attribute__((overloadable))
+    rsIsObject(rs_sampler v);
+
+extern bool __attribute__((overloadable))
+    rsIsObject(rs_script v);
+
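+/*
+ * Example (illustrative sketch; gAlloc and releaseAlloc are hypothetical names):
+ * releasing a handle once it is no longer needed, then checking it.
+ *
+ *   rs_allocation gAlloc;
+ *
+ *   void releaseAlloc() {
+ *       rsClearObject(&gAlloc);        // drop this script's reference
+ *       if (!rsIsObject(gAlloc)) {
+ *           // gAlloc is now an empty handle
+ *       }
+ *   }
+ */
+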
+/*
+ * rsElementGetBytesSize: Size of an Element
+ *
+ * Returns the size in bytes that an instantiation of this Element will occupy.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+extern uint32_t __attribute__((overloadable))
+    rsElementGetBytesSize(rs_element e);
+#endif
+
+/*
+ * rsElementGetDataKind: Kind of an Element
+ *
+ * Returns the Element's data kind.  This is used to interpret pixel data.
+ *
+ * See rs_data_kind.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+extern rs_data_kind __attribute__((overloadable))
+    rsElementGetDataKind(rs_element e);
+#endif
+
+/*
+ * rsElementGetDataType: Data type of an Element
+ *
+ * Returns the Element's base data type.  This can be a type similar to C/C++ (e.g.
+ * RS_TYPE_UNSIGNED_8), a handle (e.g. RS_TYPE_ALLOCATION and RS_TYPE_ELEMENT), or a
+ * more complex numerical type (e.g. RS_TYPE_UNSIGNED_5_6_5 and RS_TYPE_MATRIX_4X4).
+ * See rs_data_type.
+ *
+ * If the Element describes a vector, this function returns the data type of one of its items.
+ * Use rsElementGetVectorSize to get the size of the vector.
+ *
+ * If the Element describes a structure, RS_TYPE_NONE is returned.  Use the rsElementGetSub*
+ * functions to explore this complex Element.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+extern rs_data_type __attribute__((overloadable))
+    rsElementGetDataType(rs_element e);
+#endif
+
+/*
+ * rsElementGetSubElement: Sub-element of a complex Element
+ *
+ * For Elements that represent a structure, this function returns the sub-element at the
+ * specified index.
+ *
+ * If the Element is not a structure or the index is greater than or equal to the number of
+ * sub-elements, an invalid handle is returned.
+ *
+ * Parameters:
+ *   e: Element to query.
+ *   index: Index of the sub-element to return.
+ *
+ * Returns: Sub-element at the given index.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+extern rs_element __attribute__((overloadable))
+    rsElementGetSubElement(rs_element e, uint32_t index);
+#endif
+
+/*
+ * rsElementGetSubElementArraySize: Array size of a sub-element of a complex Element
+ *
+ * For complex Elements, sub-elements can be statically sized arrays.  This function
+ * returns the array size of the sub-element at the given index.  This sub-element repetition
+ * is different from fixed size vectors.
+ *
+ * Parameters:
+ *   e: Element to query.
+ *   index: Index of the sub-element.
+ *
+ * Returns: Array size of the sub-element.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+extern uint32_t __attribute__((overloadable))
+    rsElementGetSubElementArraySize(rs_element e, uint32_t index);
+#endif
+
+/*
+ * rsElementGetSubElementCount: Number of sub-elements
+ *
+ * Elements can be simple, such as an int or a float, or a structure with multiple
+ * sub-elements.  This function returns zero for simple Elements and the number of
+ * sub-elements for complex Elements.
+ *
+ * Parameters:
+ *   e: Element to get data from.
+ *
+ * Returns: Number of sub-elements.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+extern uint32_t __attribute__((overloadable))
+    rsElementGetSubElementCount(rs_element e);
+#endif
+
+/*
+ * rsElementGetSubElementName: Name of a sub-element
+ *
+ * For complex Elements, this function returns the name of the sub-element at the
+ * specified index.
+ *
+ * Parameters:
+ *   e: Element to get data from.
+ *   index: Index of the sub-element.
+ *   name: Address of the array to store the name into.
+ *   nameLength: Length of the provided name array.
+ *
+ * Returns: Number of characters copied, excluding the null terminator.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+extern uint32_t __attribute__((overloadable))
+    rsElementGetSubElementName(rs_element e, uint32_t index, char* name, uint32_t nameLength);
+#endif
+
+/*
+ * rsElementGetSubElementNameLength: Length of the name of a sub-element
+ *
+ * For complex Elements, this function returns the length of the name of the sub-element
+ * at the specified index.
+ *
+ * Parameters:
+ *   e: Element to get data from.
+ *   index: Index of the sub-element.
+ *
+ * Returns: Length of the sub-element name including the null terminator.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+extern uint32_t __attribute__((overloadable))
+    rsElementGetSubElementNameLength(rs_element e, uint32_t index);
+#endif
+
+/*
+ * rsElementGetSubElementOffsetBytes: Offset of the instantiated sub-element
+ *
+ * This function returns the relative position of the instantiation of the specified
+ * sub-element within the instantiation of the Element.
+ *
+ * For example, if the Element describes a 32 bit float followed by a 32 bit integer,
+ * the offset returned for the first will be 0 and for the second 4.
+ *
+ * Parameters:
+ *   e: Element to get data from.
+ *   index: Index of the sub-element.
+ *
+ * Returns: Offset in bytes.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+extern uint32_t __attribute__((overloadable))
+    rsElementGetSubElementOffsetBytes(rs_element e, uint32_t index);
+#endif
+
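+/*
+ * Example (illustrative sketch; gAlloc is a hypothetical rs_allocation global,
+ * and RS_VERSION >= 16 is assumed): walking the sub-elements of a struct
+ * Element attached to an Allocation.
+ *
+ *   rs_element e = rsAllocationGetElement(gAlloc);
+ *   if (rsElementGetDataType(e) == RS_TYPE_NONE) {  // complex (struct) Element
+ *       uint32_t n = rsElementGetSubElementCount(e);
+ *       for (uint32_t i = 0; i < n; i++) {
+ *           rs_element sub = rsElementGetSubElement(e, i);
+ *           uint32_t offset = rsElementGetSubElementOffsetBytes(e, i);
+ *       }
+ *   }
+ */
+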
+/*
+ * rsElementGetVectorSize: Vector size of the Element
+ *
+ * Returns the Element's vector size.  If the Element does not represent a vector,
+ * 1 is returned.
+ *
+ * Parameters:
+ *   e: Element to get data from.
+ *
+ * Returns: Length of the element vector.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+extern uint32_t __attribute__((overloadable))
+    rsElementGetVectorSize(rs_element e);
+#endif
+
+/*
+ * rsGetAllocation: Return the Allocation for a given pointer
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Returns the Allocation for a given pointer.  The pointer should point within a valid
+ * allocation.  The results are undefined if the pointer is not from a valid Allocation.
+ */
+extern rs_allocation __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated("This function is deprecated and will be removed from the SDK in a future release.")
+#endif
+))
+    rsGetAllocation(const void* p);
+
+/*
+ * rsSamplerGetAnisotropy: Anisotropy of the Sampler
+ *
+ * Get the Sampler's anisotropy.
+ *
+ * See android.renderscript.Sampler.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+extern float __attribute__((overloadable))
+    rsSamplerGetAnisotropy(rs_sampler s);
+#endif
+
+/*
+ * rsSamplerGetMagnification: Sampler magnification value
+ *
+ * Get the Sampler's magnification value.
+ *
+ * See android.renderscript.Sampler.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+extern rs_sampler_value __attribute__((overloadable))
+    rsSamplerGetMagnification(rs_sampler s);
+#endif
+
+/*
+ * rsSamplerGetMinification: Sampler minification value
+ *
+ * Get the Sampler's minification value.
+ *
+ * See android.renderscript.Sampler.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+extern rs_sampler_value __attribute__((overloadable))
+    rsSamplerGetMinification(rs_sampler s);
+#endif
+
+/*
+ * rsSamplerGetWrapS: Sampler wrap S value
+ *
+ * Get the Sampler's wrap S value.
+ *
+ * See android.renderscript.Sampler.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+extern rs_sampler_value __attribute__((overloadable))
+    rsSamplerGetWrapS(rs_sampler s);
+#endif
+
+/*
+ * rsSamplerGetWrapT: Sampler wrap T value
+ *
+ * Get the sampler's wrap T value.
+ *
+ * See android.renderscript.Sampler.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+extern rs_sampler_value __attribute__((overloadable))
+    rsSamplerGetWrapT(rs_sampler s);
+#endif
+
+/*
+ * rsSetObject: For internal use.
+ *
+ */
+extern void __attribute__((overloadable))
+    rsSetObject(rs_element* dst, rs_element src);
+
+extern void __attribute__((overloadable))
+    rsSetObject(rs_type* dst, rs_type src);
+
+extern void __attribute__((overloadable))
+    rsSetObject(rs_allocation* dst, rs_allocation src);
+
+extern void __attribute__((overloadable))
+    rsSetObject(rs_sampler* dst, rs_sampler src);
+
+extern void __attribute__((overloadable))
+    rsSetObject(rs_script* dst, rs_script src);
+
+#endif // RENDERSCRIPT_RS_OBJECT_INFO_RSH
diff --git a/24.0.3/include/rs_object_types.rsh b/24.0.3/include/rs_object_types.rsh
new file mode 100644
index 0000000..e6511a5
--- /dev/null
+++ b/24.0.3/include/rs_object_types.rsh
@@ -0,0 +1,232 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Don't edit this file!  It is auto-generated by frameworks/rs/api/generate.sh.
+
+/*
+ * rs_object_types.rsh: Object Types
+ *
+ * The types below are used to manipulate RenderScript objects like allocations, samplers,
+ * elements, and scripts.  Most of these objects are created using the Java RenderScript APIs.
+ */
+
+#ifndef RENDERSCRIPT_RS_OBJECT_TYPES_RSH
+#define RENDERSCRIPT_RS_OBJECT_TYPES_RSH
+
+#define NULL ((void *)0)
+
+// Opaque handle to a RenderScript object. Do not use this directly.
+#ifndef __LP64__
+#define _RS_OBJECT_DECL \
+{\
+  const int* const p;\
+} __attribute__((packed, aligned(4)))
+#else
+#define _RS_OBJECT_DECL \
+{\
+  const long* const p;\
+  const long* const r;\
+  const long* const v1;\
+  const long* const v2;\
+}
+#endif
+
+/*
+ * rs_element: Handle to an element
+ *
+ * An opaque handle to a RenderScript element.
+ *
+ * See android.renderscript.Element.
+ */
+typedef struct rs_element _RS_OBJECT_DECL rs_element;
+
+/*
+ * rs_type: Handle to a Type
+ *
+ * An opaque handle to a RenderScript type.
+ *
+ * See android.renderscript.Type.
+ */
+typedef struct rs_type _RS_OBJECT_DECL rs_type;
+
+/*
+ * rs_allocation: Handle to an allocation
+ *
+ * An opaque handle to a RenderScript allocation.
+ *
+ * See android.renderscript.Allocation.
+ */
+typedef struct rs_allocation _RS_OBJECT_DECL rs_allocation;
+
+/*
+ * rs_sampler: Handle to a Sampler
+ *
+ * An opaque handle to a RenderScript sampler object.
+ *
+ * See android.renderscript.Sampler.
+ */
+typedef struct rs_sampler _RS_OBJECT_DECL rs_sampler;
+
+/*
+ * rs_script: Handle to a Script
+ *
+ * An opaque handle to a RenderScript script object.
+ *
+ * See android.renderscript.ScriptC.
+ */
+typedef struct rs_script _RS_OBJECT_DECL rs_script;
+
+/*
+ * rs_allocation_cubemap_face: Enum for selecting cube map faces
+ *
+ * An enum used to specify one of the six faces of a cubemap.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+typedef enum {
+    RS_ALLOCATION_CUBEMAP_FACE_POSITIVE_X = 0,
+    RS_ALLOCATION_CUBEMAP_FACE_NEGATIVE_X = 1,
+    RS_ALLOCATION_CUBEMAP_FACE_POSITIVE_Y = 2,
+    RS_ALLOCATION_CUBEMAP_FACE_NEGATIVE_Y = 3,
+    RS_ALLOCATION_CUBEMAP_FACE_POSITIVE_Z = 4,
+    RS_ALLOCATION_CUBEMAP_FACE_NEGATIVE_Z = 5
+} rs_allocation_cubemap_face;
+#endif
+
+/*
+ * rs_allocation_usage_type: Bitfield to specify how an allocation is used
+ *
+ * These values are ORed together to specify which usages or memory spaces are
+ * relevant to an allocation or an operation on an allocation.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+typedef enum {
+    RS_ALLOCATION_USAGE_SCRIPT = 0x0001, // Allocation is bound to and accessed by scripts.
+    RS_ALLOCATION_USAGE_GRAPHICS_TEXTURE = 0x0002, // Allocation is used as a texture source.
+    RS_ALLOCATION_USAGE_GRAPHICS_VERTEX = 0x0004, // Deprecated.
+    RS_ALLOCATION_USAGE_GRAPHICS_CONSTANTS = 0x0008, // Deprecated.
+    RS_ALLOCATION_USAGE_GRAPHICS_RENDER_TARGET = 0x0010, // Deprecated.
+    RS_ALLOCATION_USAGE_IO_INPUT = 0x0020, // Allocation is used as a Surface consumer.
+    RS_ALLOCATION_USAGE_IO_OUTPUT = 0x0040, // Allocation is used as a Surface producer.
+    RS_ALLOCATION_USAGE_SHARED = 0x0080 // Allocation's backing store is shared with another object (usually a Bitmap).  Copying to or from the original source Bitmap will cause a synchronization rather than a full copy.
+} rs_allocation_usage_type;
+#endif
+
+/*
+ * rs_data_type: Element basic data type
+ *
+ * rs_data_type is used to encode the type information of a basic element.
+ *
+ * RS_TYPE_UNSIGNED_5_6_5, RS_TYPE_UNSIGNED_5_5_5_1, RS_TYPE_UNSIGNED_4_4_4_4 are for packed
+ * graphical data formats and represent vectors with per vector member sizes which are treated
+ * as a single unit for packing and alignment purposes.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+typedef enum {
+    RS_TYPE_NONE = 0, // Element is a complex type, i.e. a struct.
+    RS_TYPE_FLOAT_16 = 1, // A 16 bit floating point value.
+    RS_TYPE_FLOAT_32 = 2, // A 32 bit floating point value.
+    RS_TYPE_FLOAT_64 = 3, // A 64 bit floating point value.
+    RS_TYPE_SIGNED_8 = 4, // An 8 bit signed integer.
+    RS_TYPE_SIGNED_16 = 5, // A 16 bit signed integer.
+    RS_TYPE_SIGNED_32 = 6, // A 32 bit signed integer.
+    RS_TYPE_SIGNED_64 = 7, // A 64 bit signed integer.
+    RS_TYPE_UNSIGNED_8 = 8, // An 8 bit unsigned integer.
+    RS_TYPE_UNSIGNED_16 = 9, // A 16 bit unsigned integer.
+    RS_TYPE_UNSIGNED_32 = 10, // A 32 bit unsigned integer.
+    RS_TYPE_UNSIGNED_64 = 11, // A 64 bit unsigned integer.
+    RS_TYPE_BOOLEAN = 12, // 0 or 1 (false or true) stored in an 8 bit container.
+    RS_TYPE_UNSIGNED_5_6_5 = 13, // A 16 bit unsigned integer packing graphical data in 5, 6, and 5 bit sections.
+    RS_TYPE_UNSIGNED_5_5_5_1 = 14, // A 16 bit unsigned integer packing graphical data in 5, 5, 5, and 1 bit sections.
+    RS_TYPE_UNSIGNED_4_4_4_4 = 15, // A 16 bit unsigned integer packing graphical data in 4, 4, 4, and 4 bit sections.
+    RS_TYPE_MATRIX_4X4 = 16, // A 4x4 matrix of 32 bit floats, aligned on a 32 bit boundary.
+    RS_TYPE_MATRIX_3X3 = 17, // A 3x3 matrix of 32 bit floats, aligned on a 32 bit boundary.
+    RS_TYPE_MATRIX_2X2 = 18, // A 2x2 matrix of 32 bit floats, aligned on a 32 bit boundary.
+    RS_TYPE_ELEMENT = 1000, // A handle to an Element.
+    RS_TYPE_TYPE = 1001, // A handle to a Type.
+    RS_TYPE_ALLOCATION = 1002, // A handle to an Allocation.
+    RS_TYPE_SAMPLER = 1003, // A handle to a Sampler.
+    RS_TYPE_SCRIPT = 1004, // A handle to a Script.
+    RS_TYPE_MESH = 1005, // Deprecated.
+    RS_TYPE_PROGRAM_FRAGMENT = 1006, // Deprecated.
+    RS_TYPE_PROGRAM_VERTEX = 1007, // Deprecated.
+    RS_TYPE_PROGRAM_RASTER = 1008, // Deprecated.
+    RS_TYPE_PROGRAM_STORE = 1009, // Deprecated.
+    RS_TYPE_FONT = 1010, // Deprecated.
+    RS_TYPE_INVALID = 10000
+} rs_data_type;
+#endif
+
+/*
+ * rs_data_kind: Element data kind
+ *
+ * This enumeration is primarily useful for graphical data.  It provides additional information to
+ * help interpret the rs_data_type.
+ *
+ * RS_KIND_USER indicates no special interpretation is expected.
+ *
+ * The RS_KIND_PIXEL_* values are used in conjunction with the standard data types for representing
+ * texture formats.
+ *
+ * See the Element.createPixel() method.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+typedef enum {
+    RS_KIND_USER         = 0, // No special interpretation.
+    RS_KIND_PIXEL_L      = 7, // Luminance.
+    RS_KIND_PIXEL_A      = 8, // Alpha.
+    RS_KIND_PIXEL_LA     = 9, // Luminance and Alpha.
+    RS_KIND_PIXEL_RGB    = 10, // Red, Green, Blue.
+    RS_KIND_PIXEL_RGBA   = 11, // Red, Green, Blue, and Alpha.
+    RS_KIND_PIXEL_DEPTH  = 12, // Depth for a depth texture.
+    RS_KIND_PIXEL_YUV    = 13, // Luminance and chrominance.
+    RS_KIND_INVALID      = 100
+} rs_data_kind;
+#endif
+
+/*
+ * rs_yuv_format: YUV format
+ *
+ * Android YUV formats that can be associated with a RenderScript Type.
+ *
+ * See android.graphics.ImageFormat for a description of each format.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+typedef enum {
+    RS_YUV_NONE = 0,
+    RS_YUV_YV12 = 0x32315659,
+    RS_YUV_NV21 = 0x11,
+    RS_YUV_420_888 = 0x23
+} rs_yuv_format;
+#endif
+
+/*
+ * rs_sampler_value: Sampler value
+ *
+ * Values used for a Sampler's minification, magnification, and wrap modes.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+typedef enum {
+    RS_SAMPLER_NEAREST = 0,
+    RS_SAMPLER_LINEAR = 1,
+    RS_SAMPLER_LINEAR_MIP_LINEAR = 2,
+    RS_SAMPLER_WRAP = 3,
+    RS_SAMPLER_CLAMP = 4,
+    RS_SAMPLER_LINEAR_MIP_NEAREST = 5,
+    RS_SAMPLER_MIRRORED_REPEAT = 6,
+    RS_SAMPLER_INVALID = 100
+} rs_sampler_value;
+#endif
+
+#endif // RENDERSCRIPT_RS_OBJECT_TYPES_RSH
diff --git a/24.0.3/include/rs_quaternion.rsh b/24.0.3/include/rs_quaternion.rsh
new file mode 100644
index 0000000..55d33cf
--- /dev/null
+++ b/24.0.3/include/rs_quaternion.rsh
@@ -0,0 +1,374 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Don't edit this file!  It is auto-generated by frameworks/rs/api/generate.sh.
+
+/*
+ * rs_quaternion.rsh: Quaternion Functions
+ *
+ * The following functions manipulate quaternions.
+ */
+
+#ifndef RENDERSCRIPT_RS_QUATERNION_RSH
+#define RENDERSCRIPT_RS_QUATERNION_RSH
+
+/*
+ * rsQuaternionAdd: Add two quaternions
+ *
+ * Adds two quaternions, i.e. *q += *rhs;
+ *
+ * Parameters:
+ *   q: Destination quaternion to add to.
+ *   rhs: Quaternion to add.
+ */
+#if !defined(RS_VERSION) || (RS_VERSION <= 23)
+static inline void __attribute__((overloadable))
+    rsQuaternionAdd(rs_quaternion* q, const rs_quaternion* rhs) {
+    q->w += rhs->w;
+    q->x += rhs->x;
+    q->y += rhs->y;
+    q->z += rhs->z;
+}
+#endif
+
+/*
+ * rsQuaternionConjugate: Conjugate a quaternion
+ *
+ * Conjugates the quaternion.
+ *
+ * Parameters:
+ *   q: Quaternion to modify.
+ */
+#if !defined(RS_VERSION) || (RS_VERSION <= 23)
+static inline void __attribute__((overloadable))
+    rsQuaternionConjugate(rs_quaternion* q) {
+    q->x = -q->x;
+    q->y = -q->y;
+    q->z = -q->z;
+}
+#endif
+
+/*
+ * rsQuaternionDot: Dot product of two quaternions
+ *
+ * Returns the dot product of two quaternions.
+ *
+ * Parameters:
+ *   q0: First quaternion.
+ *   q1: Second quaternion.
+ */
+#if !defined(RS_VERSION) || (RS_VERSION <= 23)
+static inline float __attribute__((overloadable))
+    rsQuaternionDot(const rs_quaternion* q0, const rs_quaternion* q1) {
+    return q0->w*q1->w + q0->x*q1->x + q0->y*q1->y + q0->z*q1->z;
+}
+#endif
+
+/*
+ * rsQuaternionGetMatrixUnit: Get a rotation matrix from a quaternion
+ *
+ * Computes a rotation matrix from the normalized quaternion.
+ *
+ * Parameters:
+ *   m: Resulting matrix.
+ *   q: Normalized quaternion.
+ */
+#if !defined(RS_VERSION) || (RS_VERSION <= 23)
+static inline void __attribute__((overloadable))
+    rsQuaternionGetMatrixUnit(rs_matrix4x4* m, const rs_quaternion* q) {
+    float xx = q->x * q->x;
+    float xy = q->x * q->y;
+    float xz = q->x * q->z;
+    float xw = q->x * q->w;
+    float yy = q->y * q->y;
+    float yz = q->y * q->z;
+    float yw = q->y * q->w;
+    float zz = q->z * q->z;
+    float zw = q->z * q->w;
+
+    m->m[0]  = 1.0f - 2.0f * ( yy + zz );
+    m->m[4]  =        2.0f * ( xy - zw );
+    m->m[8]  =        2.0f * ( xz + yw );
+    m->m[1]  =        2.0f * ( xy + zw );
+    m->m[5]  = 1.0f - 2.0f * ( xx + zz );
+    m->m[9]  =        2.0f * ( yz - xw );
+    m->m[2]  =        2.0f * ( xz - yw );
+    m->m[6]  =        2.0f * ( yz + xw );
+    m->m[10] = 1.0f - 2.0f * ( xx + yy );
+    m->m[3]  = m->m[7] = m->m[11] = m->m[12] = m->m[13] = m->m[14] = 0.0f;
+    m->m[15] = 1.0f;
+}
+#endif
+
+/*
+ * rsQuaternionLoadRotateUnit: Quaternion that represents a rotation about an arbitrary unit vector
+ *
+ * Loads a quaternion that represents a rotation about an arbitrary unit vector.
+ *
+ * Parameters:
+ *   q: Destination quaternion.
+ *   rot: Angle to rotate by, in degrees.
+ *   x: X component of the vector.
+ *   y: Y component of the vector.
+ *   z: Z component of the vector.
+ */
+#if !defined(RS_VERSION) || (RS_VERSION <= 23)
+static inline void __attribute__((overloadable))
+    rsQuaternionLoadRotateUnit(rs_quaternion* q, float rot, float x, float y, float z) {
+    rot *= (float)(M_PI / 180.0f) * 0.5f;
+    float c = cos(rot);
+    float s = sin(rot);
+
+    q->w = c;
+    q->x = x * s;
+    q->y = y * s;
+    q->z = z * s;
+}
+#endif
+
+/*
+ * rsQuaternionSet: Create a quaternion
+ *
+ * Creates a quaternion from its four components or from another quaternion.
+ *
+ * Parameters:
+ *   q: Destination quaternion.
+ *   w: W component.
+ *   x: X component.
+ *   y: Y component.
+ *   z: Z component.
+ *   rhs: Source quaternion.
+ */
+#if !defined(RS_VERSION) || (RS_VERSION <= 23)
+static inline void __attribute__((overloadable))
+    rsQuaternionSet(rs_quaternion* q, float w, float x, float y, float z) {
+    q->w = w;
+    q->x = x;
+    q->y = y;
+    q->z = z;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 23)
+static inline void __attribute__((overloadable))
+    rsQuaternionSet(rs_quaternion* q, const rs_quaternion* rhs) {
+    q->w = rhs->w;
+    q->x = rhs->x;
+    q->y = rhs->y;
+    q->z = rhs->z;
+}
+#endif
+
+/*
+ * rsQuaternionLoadRotate: Create a rotation quaternion
+ *
+ * Loads a quaternion that represents a rotation about an arbitrary vector
+ * (it does not need to be a unit vector).
+ *
+ * Parameters:
+ *   q: Destination quaternion.
+ *   rot: Angle to rotate by, in degrees.
+ *   x: X component of a vector.
+ *   y: Y component of a vector.
+ *   z: Z component of a vector.
+ */
+#if !defined(RS_VERSION) || (RS_VERSION <= 23)
+static inline void __attribute__((overloadable))
+    rsQuaternionLoadRotate(rs_quaternion* q, float rot, float x, float y, float z) {
+    const float len = x*x + y*y + z*z;
+    if (len != 1) {
+        const float recipLen = 1.f / sqrt(len);
+        x *= recipLen;
+        y *= recipLen;
+        z *= recipLen;
+    }
+    rsQuaternionLoadRotateUnit(q, rot, x, y, z);
+}
+#endif
+
+/*
+ * rsQuaternionNormalize: Normalize a quaternion
+ *
+ * Normalizes the quaternion.
+ *
+ * Parameters:
+ *   q: Quaternion to normalize.
+ */
+#if !defined(RS_VERSION) || (RS_VERSION <= 23)
+static inline void __attribute__((overloadable))
+    rsQuaternionNormalize(rs_quaternion* q) {
+    const float len = rsQuaternionDot(q, q);
+    if (len != 1) {
+        const float recipLen = 1.f / sqrt(len);
+        q->w *= recipLen;
+        q->x *= recipLen;
+        q->y *= recipLen;
+        q->z *= recipLen;
+    }
+}
+#endif
+
+/*
+ * rsQuaternionMultiply: Multiply a quaternion by a scalar or another quaternion
+ *
+ * Multiplies a quaternion by a scalar or by another quaternion, e.g.
+ * *q = *q * scalar; or *q = *q * *rhs;.
+ *
+ * Parameters:
+ *   q: Destination quaternion.
+ *   scalar: Scalar to multiply the quaternion by.
+ *   rhs: Quaternion to multiply the destination quaternion by.
+ */
+#if !defined(RS_VERSION) || (RS_VERSION <= 23)
+static inline void __attribute__((overloadable))
+    rsQuaternionMultiply(rs_quaternion* q, float scalar) {
+    q->w *= scalar;
+    q->x *= scalar;
+    q->y *= scalar;
+    q->z *= scalar;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 23)
+static inline void __attribute__((overloadable))
+    rsQuaternionMultiply(rs_quaternion* q, const rs_quaternion* rhs) {
+    rs_quaternion qtmp;
+    rsQuaternionSet(&qtmp, q);
+
+    q->w = qtmp.w*rhs->w - qtmp.x*rhs->x - qtmp.y*rhs->y - qtmp.z*rhs->z;
+    q->x = qtmp.w*rhs->x + qtmp.x*rhs->w + qtmp.y*rhs->z - qtmp.z*rhs->y;
+    q->y = qtmp.w*rhs->y + qtmp.y*rhs->w + qtmp.z*rhs->x - qtmp.x*rhs->z;
+    q->z = qtmp.w*rhs->z + qtmp.z*rhs->w + qtmp.x*rhs->y - qtmp.y*rhs->x;
+    rsQuaternionNormalize(q);
+}
+#endif
+
+/*
+ * rsQuaternionSlerp: Spherical linear interpolation between two quaternions
+ *
+ * Performs spherical linear interpolation between two quaternions.
+ *
+ * Parameters:
+ *   q: Result quaternion from the interpolation.
+ *   q0: First input quaternion.
+ *   q1: Second input quaternion.
+ *   t: How much to interpolate by.
+ */
+#if !defined(RS_VERSION) || (RS_VERSION <= 23)
+static inline void __attribute__((overloadable))
+    rsQuaternionSlerp(rs_quaternion* q, const rs_quaternion* q0, const rs_quaternion* q1, float t) {
+    if (t <= 0.0f) {
+        rsQuaternionSet(q, q0);
+        return;
+    }
+    if (t >= 1.0f) {
+        rsQuaternionSet(q, q1);
+        return;
+    }
+
+    rs_quaternion tempq0, tempq1;
+    rsQuaternionSet(&tempq0, q0);
+    rsQuaternionSet(&tempq1, q1);
+
+    float angle = rsQuaternionDot(q0, q1);
+    if (angle < 0) {
+        rsQuaternionMultiply(&tempq0, -1.0f);
+        angle *= -1.0f;
+    }
+
+    float scale, invScale;
+    if (angle + 1.0f > 0.05f) {
+        if (1.0f - angle >= 0.05f) {
+            float theta = acos(angle);
+            float invSinTheta = 1.0f / sin(theta);
+            scale = sin(theta * (1.0f - t)) * invSinTheta;
+            invScale = sin(theta * t) * invSinTheta;
+        } else {
+            scale = 1.0f - t;
+            invScale = t;
+        }
+    } else {
+        rsQuaternionSet(&tempq1, tempq0.z, -tempq0.y, tempq0.x, -tempq0.w);
+        scale = sin(M_PI * (0.5f - t));
+        invScale = sin(M_PI * t);
+    }
+
+    rsQuaternionSet(q, tempq0.w*scale + tempq1.w*invScale, tempq0.x*scale + tempq1.x*invScale,
+                        tempq0.y*scale + tempq1.y*invScale, tempq0.z*scale + tempq1.z*invScale);
+}
+#endif
+
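+/*
+ * Example (illustrative sketch; variable names are hypothetical): interpolating
+ * halfway between two orientations and converting the result into a matrix
+ * usable with the rsMatrix* functions.  a and b are unit quaternions, so the
+ * interpolated quaternion is as well.
+ *
+ *   rs_quaternion a, b, r;
+ *   rsQuaternionLoadRotate(&a, 0.f, 0.f, 1.f, 0.f);
+ *   rsQuaternionLoadRotate(&b, 90.f, 0.f, 1.f, 0.f);
+ *   rsQuaternionSlerp(&r, &a, &b, 0.5f);
+ *   rs_matrix4x4 m;
+ *   rsQuaternionGetMatrixUnit(&m, &r);
+ */
+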
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern void __attribute__((overloadable))
+    rsQuaternionAdd(rs_quaternion* q, const rs_quaternion* rhs);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern void __attribute__((overloadable))
+    rsQuaternionConjugate(rs_quaternion* q);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern float __attribute__((overloadable))
+    rsQuaternionDot(const rs_quaternion* q0, const rs_quaternion* q1);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern void __attribute__((overloadable))
+    rsQuaternionGetMatrixUnit(rs_matrix4x4* m, const rs_quaternion* q);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern void __attribute__((overloadable))
+    rsQuaternionLoadRotateUnit(rs_quaternion* q, float rot, float x, float y, float z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern void __attribute__((overloadable))
+    rsQuaternionSet(rs_quaternion* q, float w, float x, float y, float z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern void __attribute__((overloadable))
+    rsQuaternionSet(rs_quaternion* q, const rs_quaternion* rhs);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern void __attribute__((overloadable))
+    rsQuaternionLoadRotate(rs_quaternion* q, float rot, float x, float y, float z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern void __attribute__((overloadable))
+    rsQuaternionNormalize(rs_quaternion* q);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern void __attribute__((overloadable))
+    rsQuaternionMultiply(rs_quaternion* q, float scalar);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern void __attribute__((overloadable))
+    rsQuaternionMultiply(rs_quaternion* q, const rs_quaternion* rhs);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern void __attribute__((overloadable))
+    rsQuaternionSlerp(rs_quaternion* q, const rs_quaternion* q0, const rs_quaternion* q1, float t);
+#endif
+
+#endif // RENDERSCRIPT_RS_QUATERNION_RSH
diff --git a/24.0.3/include/rs_time.rsh b/24.0.3/include/rs_time.rsh
new file mode 100644
index 0000000..6c0eeb0
--- /dev/null
+++ b/24.0.3/include/rs_time.rsh
@@ -0,0 +1,126 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Don't edit this file!  It is auto-generated by frameworks/rs/api/generate.sh.
+
+/*
+ * rs_time.rsh: Time Functions and Types
+ *
+ * The functions below can be used to tell the current clock time and the current
+ * system up time.  It is not recommended to call these functions inside of a kernel.
+ */
+
+#ifndef RENDERSCRIPT_RS_TIME_RSH
+#define RENDERSCRIPT_RS_TIME_RSH
+
+/*
+ * rs_time_t: Seconds since January 1, 1970
+ *
+ * Calendar time interpreted as seconds elapsed since the Epoch (00:00:00 on
+ * January 1, 1970, Coordinated Universal Time (UTC)).
+ */
+#ifndef __LP64__
+typedef int rs_time_t;
+#endif
+
+#ifdef __LP64__
+typedef long rs_time_t;
+#endif
+
+/*
+ * rs_tm: Date and time structure
+ *
+ * Data structure for broken-down time components.
+ */
+typedef struct {
+    int tm_sec; // Seconds after the minute. This ranges from 0 to 59, but possibly up to 60 for leap seconds.
+    int tm_min; // Minutes after the hour. This ranges from 0 to 59.
+    int tm_hour; // Hours past midnight. This ranges from 0 to 23.
+    int tm_mday; // Day of the month. This ranges from 1 to 31.
+    int tm_mon; // Months since January. This ranges from 0 to 11.
+    int tm_year; // Years since 1900.
+    int tm_wday; // Days since Sunday. This ranges from 0 to 6.
+    int tm_yday; // Days since January 1. This ranges from 0 to 365.
+    int tm_isdst; // Flag to indicate whether daylight saving time is in effect. The value is positive if it is in effect, zero if it is not, and negative if the information is not available.
+} rs_tm;
+
+/*
+ * rsGetDt: Elapsed time since last call
+ *
+ * Returns the time in seconds since this function was last called in this script.
+ *
+ * Returns: Time in seconds.
+ */
+extern float __attribute__((overloadable))
+    rsGetDt(void);
+
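+/*
+ * Example (illustrative sketch; gAngle and frame are hypothetical names):
+ * advancing an animation by the time elapsed since the previous call.
+ *
+ *   float gAngle;
+ *
+ *   void frame() {
+ *       gAngle += 90.0f * rsGetDt();  // 90 degrees per second
+ *   }
+ */
+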
+/*
+ * rsLocaltime: Convert to local time
+ *
+ * Converts the time specified by timer into a rs_tm structure that provides year, month,
+ * hour, etc.  This value is stored at *local.
+ *
+ * This function returns the same pointer that is passed as the first argument.  If the
+ * local parameter is NULL, this function does nothing and returns NULL.
+ *
+ * Parameters:
+ *   local: Pointer to time structure where the local time will be stored.
+ *   timer: Input time as a number of seconds since January 1, 1970.
+ *
+ * Returns: Pointer to the output local time, i.e. the same value as the parameter local.
+ */
+extern rs_tm* __attribute__((overloadable))
+    rsLocaltime(rs_tm* local, const rs_time_t* timer);
+
+/*
+ * rsTime: Seconds since January 1, 1970
+ *
+ * Returns the number of seconds since the Epoch (00:00:00 UTC, January 1, 1970).
+ *
+ * If timer is non-NULL, the result is also stored in the memory pointed to by
+ * this variable.
+ *
+ * Parameters:
+ *   timer: Location to also store the returned calendar time.
+ *
+ * Returns: Seconds since the Epoch, -1 if there's an error.
+ */
+extern rs_time_t __attribute__((overloadable))
+    rsTime(rs_time_t* timer);
+
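+/*
+ * Example (illustrative sketch): getting the current calendar time and breaking
+ * it down into local time components.
+ *
+ *   rs_time_t now = rsTime(NULL);
+ *   rs_tm tm;
+ *   rsLocaltime(&tm, &now);
+ *   int year = tm.tm_year + 1900;
+ */
+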
+/*
+ * rsUptimeMillis: System uptime in milliseconds
+ *
+ * Returns the current system clock (uptime) in milliseconds.
+ *
+ * Returns: Uptime in milliseconds.
+ */
+extern int64_t __attribute__((overloadable))
+    rsUptimeMillis(void);
+
+/*
+ * rsUptimeNanos: System uptime in nanoseconds
+ *
+ * Returns the current system clock (uptime) in nanoseconds.
+ *
+ * The granularity of the values returned by this call may be much larger than a nanosecond.
+ *
+ * Returns: Uptime in nanoseconds.
+ */
+extern int64_t __attribute__((overloadable))
+    rsUptimeNanos(void);
+
+#endif // RENDERSCRIPT_RS_TIME_RSH
diff --git a/24.0.3/include/rs_value_types.rsh b/24.0.3/include/rs_value_types.rsh
new file mode 100644
index 0000000..180b297
--- /dev/null
+++ b/24.0.3/include/rs_value_types.rsh
@@ -0,0 +1,543 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Don't edit this file!  It is auto-generated by frameworks/rs/api/generate.sh.
+
+/*
+ * rs_value_types.rsh: Numerical Types
+ *
+ * Scalars:
+ *
+ * RenderScript supports the following scalar numerical types:
+ *
+ *                    8 bits           16 bits            32 bits          64 bits
+ * Integer:           char, int8_t     short, int16_t     int32_t          long, long long, int64_t
+ * Unsigned integer:  uchar, uint8_t   ushort, uint16_t   uint, uint32_t   ulong, uint64_t
+ * Floating point:                     half               float            double
+ *
+ *
+ * Vectors:
+ *
+ * RenderScript supports fixed size vectors of length 2, 3, and 4.
+ * Vectors are declared using the common type name followed by a 2, 3, or 4.
+ * E.g. float4, int3, double2, ulong4.
+ *
+ * To create vector literals, use the vector type followed by the values enclosed
+ * between curly braces, e.g. (float3){1.0f, 2.0f, 3.0f}.
+ *
+ * Entries of a vector can be accessed using different naming styles.
+ *
+ * Single entries can be accessed by following the variable name with a dot and:
+ * - The letters x, y, z, and w,
+ * - The letters r, g, b, and a,
+ * - The letter s or S, followed by a zero based index.
+ *
+ * For example, with int4 myVar; the following are equivalent:
+ *   myVar.x == myVar.r == myVar.s0 == myVar.S0
+ *   myVar.y == myVar.g == myVar.s1 == myVar.S1
+ *   myVar.z == myVar.b == myVar.s2 == myVar.S2
+ *   myVar.w == myVar.a == myVar.s3 == myVar.S3
+ *
+ * Multiple entries of a vector can be accessed at once by using an identifier that is
+ * the concatenation of multiple letters or indices.  The resulting vector has a size
+ * equal to the number of entries named.
+ *
+ * With the example above, the middle two entries can be accessed using
+ * myVar.yz, myVar.gb, myVar.s12, and myVar.S12.
+ *
+ * The entries don't have to be contiguous or in increasing order.  Entries can even be
+ * repeated, as long as we're not trying to assign to them.  You also can't mix the naming
+ * styles.
+ *
+ * Here are examples of what can or can't be done:
+ * float4 v4;
+ * float3 v3;
+ * float2 v2;
+ * v2 = v4.xx; // Valid
+ * v3 = v4.zxw; // Valid
+ * v3 = v4.bba; // Valid
+ * v3 = v4.s032; // Valid
+ * v3.s120 = v4.S233; // Valid
+ * v4.yz = v3.rg; // Valid
+ * v4.yzx = v3.rg; // Invalid: mismatched sizes
+ * v4.yzz = v3; // Invalid: z appears twice in an assignment
+ * v3 = v3.xas0; // Invalid: can't mix xyzw with rgba nor s0...
+ * v3 = v4.s034; // Invalid: the digit can only be 0, 1, 2, or 3
+ *
+ *
+ * Matrices and Quaternions:
+ *
+ * RenderScript supports fixed size square matrices of floats of size 2x2, 3x3, and 4x4.
+ * The types are named rs_matrix2x2, rs_matrix3x3, and rs_matrix4x4.  See
+ * Matrix Functions for the list of operations.
+ *
+ * Quaternions are also supported via rs_quaternion.  See Quaternion Functions for the list
+ * of operations.
+ */
+
+#ifndef RENDERSCRIPT_RS_VALUE_TYPES_RSH
+#define RENDERSCRIPT_RS_VALUE_TYPES_RSH
+
+/*
+ * half: 16 bit floating point value
+ *
+ * A 16 bit floating point value.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+typedef __fp16 half;
+#endif
+
+/*
+ * half2: Two 16 bit floats
+ *
+ * Vector version of the half float type. Provides two half fields packed
+ * into a single 32 bit field with 32 bit alignment.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+typedef half __attribute__((ext_vector_type(2))) half2;
+#endif
+
+/*
+ * half3: Three 16 bit floats
+ *
+ * Vector version of the half float type. Provides three half fields packed
+ * into a single 64 bit field with 64 bit alignment.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+typedef half __attribute__((ext_vector_type(3))) half3;
+#endif
+
+/*
+ * half4: Four 16 bit floats
+ *
+ * Vector version of the half float type. Provides four half fields packed
+ * into a single 64 bit field with 64 bit alignment.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+typedef half __attribute__((ext_vector_type(4))) half4;
+#endif
+
+/*
+ * int8_t: 8 bit signed integer
+ *
+ * 8 bit signed integer type.
+ */
+typedef char int8_t;
+
+/*
+ * int16_t: 16 bit signed integer
+ *
+ * A 16 bit signed integer type.
+ */
+typedef short int16_t;
+
+/*
+ * int32_t: 32 bit signed integer
+ *
+ * A 32 bit signed integer type.
+ */
+typedef int int32_t;
+
+/*
+ * int64_t: 64 bit signed integer
+ *
+ * A 64 bit signed integer type.
+ */
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+typedef long long int64_t;
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+typedef long int64_t;
+#endif
+
+/*
+ * uint8_t: 8 bit unsigned integer
+ *
+ * 8 bit unsigned integer type.
+ */
+typedef unsigned char uint8_t;
+
+/*
+ * uint16_t: 16 bit unsigned integer
+ *
+ * A 16 bit unsigned integer type.
+ */
+typedef unsigned short uint16_t;
+
+/*
+ * uint32_t: 32 bit unsigned integer
+ *
+ * A 32 bit unsigned integer type.
+ */
+typedef unsigned int uint32_t;
+
+/*
+ * uint64_t: 64 bit unsigned integer
+ *
+ * A 64 bit unsigned integer type.
+ */
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+typedef unsigned long long uint64_t;
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+typedef unsigned long uint64_t;
+#endif
+
+/*
+ * uchar: 8 bit unsigned integer
+ *
+ * 8 bit unsigned integer type.
+ */
+typedef uint8_t uchar;
+
+/*
+ * ushort: 16 bit unsigned integer
+ *
+ * A 16 bit unsigned integer type.
+ */
+typedef uint16_t ushort;
+
+/*
+ * uint: 32 bit unsigned integer
+ *
+ * A 32 bit unsigned integer type.
+ */
+typedef uint32_t uint;
+
+/*
+ * ulong: 64 bit unsigned integer
+ *
+ * A 64 bit unsigned integer type.
+ */
+typedef uint64_t ulong;
+
+/*
+ * size_t: Unsigned size type
+ *
+ * Unsigned size type.  The number of bits depends on the compilation flags.
+ */
+#ifdef __LP64__
+typedef uint64_t size_t;
+#endif
+
+#ifndef __LP64__
+typedef uint32_t size_t;
+#endif
+
+/*
+ * ssize_t: Signed size type
+ *
+ * Signed size type.  The number of bits depends on the compilation flags.
+ */
+#ifdef __LP64__
+typedef int64_t ssize_t;
+#endif
+
+#ifndef __LP64__
+typedef int32_t ssize_t;
+#endif
+
+/*
+ * float2: Two 32 bit floats
+ *
+ * A vector of two floats.  These two floats are packed into a single 64 bit field
+ * with a 64 bit alignment.
+ */
+typedef float __attribute__((ext_vector_type(2))) float2;
+
+/*
+ * float3: Three 32 bit floats
+ *
+ * A vector of three floats.  These three floats are packed into a single 128 bit field
+ * with a 128 bit alignment.
+ */
+typedef float __attribute__((ext_vector_type(3))) float3;
+
+/*
+ * float4: Four 32 bit floats
+ *
+ * A vector of four floats type.  These four floats are packed into a single 128 bit field
+ * with a 128 bit alignment.
+ */
+typedef float __attribute__((ext_vector_type(4))) float4;
+
+/*
+ * double2: Two 64 bit floats
+ *
+ * A vector of two doubles.  These two double fields packed into a single 128 bit field
+ * with a 128 bit alignment.
+ */
+typedef double __attribute__((ext_vector_type(2))) double2;
+
+/*
+ * double3: Three 64 bit floats
+ *
+ * A vector of three doubles.  These three double fields packed into a single 256 bit field
+ * with a 256 bit alignment.
+ */
+typedef double __attribute__((ext_vector_type(3))) double3;
+
+/*
+ * double4: Four 64 bit floats
+ *
+ * A vector of four doubles.  These four double fields packed into a single 256 bit field
+ * with a 256 bit alignment.
+ */
+typedef double __attribute__((ext_vector_type(4))) double4;
+
+/*
+ * uchar2: Two 8 bit unsigned integers
+ *
+ * A vector of two uchars.  These two uchar fields packed into a single 16 bit field
+ * with a 16 bit alignment.
+ */
+typedef uchar __attribute__((ext_vector_type(2))) uchar2;
+
+/*
+ * uchar3: Three 8 bit unsigned integers
+ *
+ * A vector of three uchars.  These three uchar fields packed into a single 32 bit field
+ * with a 32 bit alignment.
+ */
+typedef uchar __attribute__((ext_vector_type(3))) uchar3;
+
+/*
+ * uchar4: Four 8 bit unsigned integers
+ *
+ * A vector of four uchars.  These four uchars are packed into a single 32 bit field
+ * with a 32 bit alignment.
+ */
+typedef uchar __attribute__((ext_vector_type(4))) uchar4;
+
+/*
+ * ushort2: Two 16 bit unsigned integers
+ *
+ * A vector of two ushorts.  These two ushorts are packed into a single 32 bit field
+ * with a 32 bit alignment.
+ */
+typedef ushort __attribute__((ext_vector_type(2))) ushort2;
+
+/*
+ * ushort3: Three 16 bit unsigned integers
+ *
+ * A vector of three ushorts.  These three ushorts are packed into a single 64 bit field
+ * with a 64 bit alignment.
+ */
+typedef ushort __attribute__((ext_vector_type(3))) ushort3;
+
+/*
+ * ushort4: Four 16 bit unsigned integers
+ *
+ * A vector of four ushorts.  These four ushorts are packed into a single 64 bit field
+ * with a 64 bit alignment.
+ */
+typedef ushort __attribute__((ext_vector_type(4))) ushort4;
+
+/*
+ * uint2: Two 32 bit unsigned integers
+ *
+ * A vector of two uints.  These two uints are packed into a single 64 bit field
+ * with a 64 bit alignment.
+ */
+typedef uint __attribute__((ext_vector_type(2))) uint2;
+
+/*
+ * uint3: Three 32 bit unsigned integers
+ *
+ * A vector of three uints.  These three uints are packed into a single 128 bit field
+ * with a 128 bit alignment.
+ */
+typedef uint __attribute__((ext_vector_type(3))) uint3;
+
+/*
+ * uint4: Four 32 bit unsigned integers
+ *
+ * A vector of four uints.  These four uints are packed into a single 128 bit field
+ * with a 128 bit alignment.
+ */
+typedef uint __attribute__((ext_vector_type(4))) uint4;
+
+/*
+ * ulong2: Two 64 bit unsigned integers
+ *
+ * A vector of two ulongs.  These two ulongs are packed into a single 128 bit field
+ * with a 128 bit alignment.
+ */
+typedef ulong __attribute__((ext_vector_type(2))) ulong2;
+
+/*
+ * ulong3: Three 64 bit unsigned integers
+ *
+ * A vector of three ulongs.  These three ulongs are packed into a single 256 bit field
+ * with a 256 bit alignment.
+ */
+typedef ulong __attribute__((ext_vector_type(3))) ulong3;
+
+/*
+ * ulong4: Four 64 bit unsigned integers
+ *
+ * A vector of four ulongs.  These four ulongs are packed into a single 256 bit field
+ * with a 256 bit alignment.
+ */
+typedef ulong __attribute__((ext_vector_type(4))) ulong4;
+
+/*
+ * char2: Two 8 bit signed integers
+ *
+ * A vector of two chars.  These two chars are packed into a single 16 bit field
+ * with a 16 bit alignment.
+ */
+typedef char __attribute__((ext_vector_type(2))) char2;
+
+/*
+ * char3: Three 8 bit signed integers
+ *
+ * A vector of three chars.  These three chars are packed into a single 32 bit field
+ * with a 32 bit alignment.
+ */
+typedef char __attribute__((ext_vector_type(3))) char3;
+
+/*
+ * char4: Four 8 bit signed integers
+ *
+ * A vector of four chars.  These four chars are packed into a single 32 bit field
+ * with a 32 bit alignment.
+ */
+typedef char __attribute__((ext_vector_type(4))) char4;
+
+/*
+ * short2: Two 16 bit signed integers
+ *
+ * A vector of two shorts.  These two shorts are packed into a single 32 bit field
+ * with a 32 bit alignment.
+ */
+typedef short __attribute__((ext_vector_type(2))) short2;
+
+/*
+ * short3: Three 16 bit signed integers
+ *
+ * A vector of three shorts.  These three shorts are packed into a single 64 bit field
+ * with a 64 bit alignment.
+ */
+typedef short __attribute__((ext_vector_type(3))) short3;
+
+/*
+ * short4: Four 16 bit signed integers
+ *
+ * A vector of four shorts.  These four shorts are packed into a single 64 bit field
+ * with a 64 bit alignment.
+ */
+typedef short __attribute__((ext_vector_type(4))) short4;
+
+/*
+ * int2: Two 32 bit signed integers
+ *
+ * A vector of two ints.  These two ints are packed into a single 64 bit field
+ * with a 64 bit alignment.
+ */
+typedef int __attribute__((ext_vector_type(2))) int2;
+
+/*
+ * int3: Three 32 bit signed integers
+ *
+ * A vector of three ints.  These three ints are packed into a single 128 bit field
+ * with a 128 bit alignment.
+ */
+typedef int __attribute__((ext_vector_type(3))) int3;
+
+/*
+ * int4: Four 32 bit signed integers
+ *
+ * A vector of four ints.  These four ints are packed into a single 128 bit field
+ * with a 128 bit alignment.
+ */
+typedef int __attribute__((ext_vector_type(4))) int4;
+
+/*
+ * long2: Two 64 bit signed integers
+ *
+ * A vector of two longs.  These two longs are packed into a single 128 bit field
+ * with a 128 bit alignment.
+ */
+typedef long __attribute__((ext_vector_type(2))) long2;
+
+/*
+ * long3: Three 64 bit signed integers
+ *
+ * A vector of three longs.  These three longs are packed into a single 256 bit field
+ * with a 256 bit alignment.
+ */
+typedef long __attribute__((ext_vector_type(3))) long3;
+
+/*
+ * long4: Four 64 bit signed integers
+ *
+ * A vector of four longs.  These four longs are packed into a single 256 bit field
+ * with a 256 bit alignment.
+ */
+typedef long __attribute__((ext_vector_type(4))) long4;
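+
+/*
+ * Illustrative sketch, not part of the generated header: the packing rules
+ * described above mean a 3-component vector takes the storage and alignment
+ * of the matching 4-component vector, so in a script one would expect:
+ *
+ *   float3 v = {1.f, 2.f, 3.f};
+ *   // sizeof(float3) == sizeof(float4) == 16 bytes (a single 128 bit field
+ *   // with 128 bit alignment), while sizeof(float2) == 8 bytes.
+ */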
+
+/*
+ * rs_matrix2x2: 2x2 matrix of 32 bit floats
+ *
+ * A square 2x2 matrix of floats.  The entries are stored in the array at the
+ * location [row*2 + col].
+ *
+ * See Matrix Functions.
+ */
+typedef struct {
+    float m[4];
+} rs_matrix2x2;
+
+/*
+ * rs_matrix3x3: 3x3 matrix of 32 bit floats
+ *
+ * A square 3x3 matrix of floats.  The entries are stored in the array at the
+ * location [row*3 + col].
+ *
+ * See Matrix Functions.
+ */
+typedef struct {
+    float m[9];
+} rs_matrix3x3;
+
+/*
+ * rs_matrix4x4: 4x4 matrix of 32 bit floats
+ *
+ * A square 4x4 matrix of floats.  The entries are stored in the array at the
+ * location [row*4 + col].
+ *
+ * See Matrix Functions.
+ */
+typedef struct {
+    float m[16];
+} rs_matrix4x4;
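+
+/*
+ * Illustrative sketch, not part of the generated header: reading entry
+ * (row, col) of an rs_matrix4x4 directly from its backing array, following
+ * the row*4 + col layout described above.  The helper name is hypothetical.
+ *
+ *   static inline float get4x4(const rs_matrix4x4* m, int row, int col) {
+ *       return m->m[row * 4 + col];
+ *   }
+ */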
+
+/*
+ * rs_quaternion: Quaternion
+ *
+ * A vector of four floats that represents a quaternion.
+ *
+ * See Quaternion Functions.
+ */
+typedef float4 rs_quaternion;
+
+#endif // RENDERSCRIPT_RS_VALUE_TYPES_RSH
diff --git a/24.0.3/include/rs_vector_math.rsh b/24.0.3/include/rs_vector_math.rsh
new file mode 100644
index 0000000..2f5e8e7
--- /dev/null
+++ b/24.0.3/include/rs_vector_math.rsh
@@ -0,0 +1,453 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Don't edit this file!  It is auto-generated by frameworks/rs/api/generate.sh.
+
+/*
+ * rs_vector_math.rsh: Vector Math Functions
+ *
+ * These functions interpret the input arguments as representations of vectors in
+ * n-dimensional space.
+ *
+ * The precision of the mathematical operations on 32 bit floats is affected by the pragmas
+ * rs_fp_relaxed and rs_fp_full.  See Mathematical Constants and Functions for details.
+ *
+ * Different precision/speed tradeoffs can be achieved by using variants of the common math
+ * functions.  Functions with a name starting with
+ * - native_: May have custom hardware implementations with weaker precision.  Additionally,
+ *   subnormal values may be flushed to zero, rounding towards zero may be used, and NaN and
+ *   infinity input may not be handled correctly.
+ * - fast_: May perform internal computations using 16 bit floats.  Additionally, subnormal
+ *   values may be flushed to zero, and rounding towards zero may be used.
+ *
+ */
+
+#ifndef RENDERSCRIPT_RS_VECTOR_MATH_RSH
+#define RENDERSCRIPT_RS_VECTOR_MATH_RSH
+
+/*
+ * cross: Cross product of two vectors
+ *
+ * Computes the cross product of two vectors.
+ */
+extern float3 __attribute__((const, overloadable))
+    cross(float3 left_vector, float3 right_vector);
+
+extern float4 __attribute__((const, overloadable))
+    cross(float4 left_vector, float4 right_vector);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    cross(half3 left_vector, half3 right_vector);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    cross(half4 left_vector, half4 right_vector);
+#endif
+
+/*
+ * distance: Distance between two points
+ *
+ * Computes the distance between two points.
+ *
+ * See also fast_distance(), native_distance().
+ */
+extern float __attribute__((const, overloadable))
+    distance(float left_vector, float right_vector);
+
+extern float __attribute__((const, overloadable))
+    distance(float2 left_vector, float2 right_vector);
+
+extern float __attribute__((const, overloadable))
+    distance(float3 left_vector, float3 right_vector);
+
+extern float __attribute__((const, overloadable))
+    distance(float4 left_vector, float4 right_vector);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    distance(half left_vector, half right_vector);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    distance(half2 left_vector, half2 right_vector);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    distance(half3 left_vector, half3 right_vector);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    distance(half4 left_vector, half4 right_vector);
+#endif
+
+/*
+ * dot: Dot product of two vectors
+ *
+ * Computes the dot product of two vectors.
+ */
+extern float __attribute__((const, overloadable))
+    dot(float left_vector, float right_vector);
+
+extern float __attribute__((const, overloadable))
+    dot(float2 left_vector, float2 right_vector);
+
+extern float __attribute__((const, overloadable))
+    dot(float3 left_vector, float3 right_vector);
+
+extern float __attribute__((const, overloadable))
+    dot(float4 left_vector, float4 right_vector);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    dot(half left_vector, half right_vector);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    dot(half2 left_vector, half2 right_vector);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    dot(half3 left_vector, half3 right_vector);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    dot(half4 left_vector, half4 right_vector);
+#endif
+
+/*
+ * fast_distance: Approximate distance between two points
+ *
+ * Computes the approximate distance between two points.
+ *
+ * The precision is what would be expected from doing the computation using 16 bit floating
+ * point values.
+ *
+ * See also distance(), native_distance().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float __attribute__((const, overloadable))
+    fast_distance(float left_vector, float right_vector);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float __attribute__((const, overloadable))
+    fast_distance(float2 left_vector, float2 right_vector);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float __attribute__((const, overloadable))
+    fast_distance(float3 left_vector, float3 right_vector);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float __attribute__((const, overloadable))
+    fast_distance(float4 left_vector, float4 right_vector);
+#endif
+
+/*
+ * fast_length: Approximate length of a vector
+ *
+ * Computes the approximate length of a vector.
+ *
+ * The precision is what would be expected from doing the computation using 16 bit floating
+ * point values.
+ *
+ * See also length(), native_length().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float __attribute__((const, overloadable))
+    fast_length(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float __attribute__((const, overloadable))
+    fast_length(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float __attribute__((const, overloadable))
+    fast_length(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float __attribute__((const, overloadable))
+    fast_length(float4 v);
+#endif
+
+/*
+ * fast_normalize: Approximate normalized vector
+ *
+ * Approximately normalizes a vector.
+ *
+ * For vectors of size 1, returns -1.f for negative values, 0.f for null values, and 1.f for
+ * positive values.
+ *
+ * The precision is what would be expected from doing the computation using 16 bit floating
+ * point values.
+ *
+ * See also normalize(), native_normalize().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float __attribute__((const, overloadable))
+    fast_normalize(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float2 __attribute__((const, overloadable))
+    fast_normalize(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float3 __attribute__((const, overloadable))
+    fast_normalize(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float4 __attribute__((const, overloadable))
+    fast_normalize(float4 v);
+#endif
+
+/*
+ * length: Length of a vector
+ *
+ * Computes the length of a vector.
+ *
+ * See also fast_length(), native_length().
+ */
+extern float __attribute__((const, overloadable))
+    length(float v);
+
+extern float __attribute__((const, overloadable))
+    length(float2 v);
+
+extern float __attribute__((const, overloadable))
+    length(float3 v);
+
+extern float __attribute__((const, overloadable))
+    length(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    length(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    length(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    length(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    length(half4 v);
+#endif
+
+/*
+ * native_distance: Approximate distance between two points
+ *
+ * Computes the approximate distance between two points.
+ *
+ * See also distance(), fast_distance().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_distance(float left_vector, float right_vector);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_distance(float2 left_vector, float2 right_vector);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_distance(float3 left_vector, float3 right_vector);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_distance(float4 left_vector, float4 right_vector);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_distance(half left_vector, half right_vector);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_distance(half2 left_vector, half2 right_vector);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_distance(half3 left_vector, half3 right_vector);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_distance(half4 left_vector, half4 right_vector);
+#endif
+
+/*
+ * native_length: Approximate length of a vector
+ *
+ * Computes the approximate length of a vector.
+ *
+ * See also length(), fast_length().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_length(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_length(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_length(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_length(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_length(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_length(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_length(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_length(half4 v);
+#endif
+
+/*
+ * native_normalize: Approximately normalize a vector
+ *
+ * Approximately normalizes a vector.
+ *
+ * See also normalize(), fast_normalize().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_normalize(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_normalize(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_normalize(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_normalize(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_normalize(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_normalize(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_normalize(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_normalize(half4 v);
+#endif
+
+/*
+ * normalize: Normalize a vector
+ *
+ * Normalizes a vector.
+ *
+ * For vectors of size 1, returns -1.f for negative values, 0.f for null values, and 1.f for
+ * positive values.
+ *
+ * See also fast_normalize(), native_normalize().
+ */
+extern float __attribute__((const, overloadable))
+    normalize(float v);
+
+extern float2 __attribute__((const, overloadable))
+    normalize(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    normalize(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    normalize(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    normalize(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    normalize(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    normalize(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    normalize(half4 v);
+#endif
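+
+/*
+ * Illustrative sketch, not part of the generated header: choosing between the
+ * precision variants described in the file header.  The kernel name and the
+ * choice of fast_normalize() are assumptions for illustration only.
+ *
+ *   float3 RS_KERNEL unit(float3 in) {
+ *       // normalize() for full precision, fast_normalize() when 16 bit
+ *       // internal precision is acceptable, native_normalize() when a
+ *       // hardware approximation is acceptable.
+ *       return fast_normalize(in);
+ *   }
+ */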
+
+#endif // RENDERSCRIPT_RS_VECTOR_MATH_RSH
diff --git a/24.0.3/lib64/libLLVM.so b/24.0.3/lib64/libLLVM.so
new file mode 100755
index 0000000..54f7081
--- /dev/null
+++ b/24.0.3/lib64/libLLVM.so
Binary files differ
diff --git a/24.0.3/lib64/libc++.so b/24.0.3/lib64/libc++.so
new file mode 100755
index 0000000..fa84063
--- /dev/null
+++ b/24.0.3/lib64/libc++.so
Binary files differ
diff --git a/24.0.3/lib64/libclang.so b/24.0.3/lib64/libclang.so
new file mode 100755
index 0000000..3a59df1
--- /dev/null
+++ b/24.0.3/lib64/libclang.so
Binary files differ
diff --git a/25.0.2/bin/llvm-rs-cc b/25.0.2/bin/llvm-rs-cc
new file mode 100755
index 0000000..0550c16
--- /dev/null
+++ b/25.0.2/bin/llvm-rs-cc
Binary files differ
diff --git a/25.0.2/clang-include/CMakeLists.txt b/25.0.2/clang-include/CMakeLists.txt
new file mode 100644
index 0000000..9393f69
--- /dev/null
+++ b/25.0.2/clang-include/CMakeLists.txt
@@ -0,0 +1,115 @@
+set(files
+  adxintrin.h
+  altivec.h
+  ammintrin.h
+  arm_acle.h
+  avx2intrin.h
+  avx512bwintrin.h
+  avx512cdintrin.h
+  avx512erintrin.h
+  avx512fintrin.h
+  avx512vlbwintrin.h
+  avx512vlintrin.h
+  avx512dqintrin.h
+  avx512vldqintrin.h
+  avxintrin.h
+  bmi2intrin.h
+  bmiintrin.h
+  __clang_cuda_runtime_wrapper.h
+  cpuid.h
+  cuda_builtin_vars.h
+  emmintrin.h
+  f16cintrin.h
+  float.h
+  fma4intrin.h
+  fmaintrin.h
+  fxsrintrin.h
+  htmintrin.h
+  htmxlintrin.h
+  ia32intrin.h
+  immintrin.h
+  Intrin.h
+  inttypes.h
+  iso646.h
+  limits.h
+  lzcntintrin.h
+  mm3dnow.h
+  mmintrin.h
+  mm_malloc.h
+  module.modulemap
+  nmmintrin.h
+  pmmintrin.h
+  popcntintrin.h
+  prfchwintrin.h
+  rdseedintrin.h
+  rtmintrin.h
+  s390intrin.h
+  shaintrin.h
+  smmintrin.h
+  stdalign.h
+  stdarg.h
+  stdatomic.h
+  stdbool.h
+  stddef.h
+  __stddef_max_align_t.h
+  stdint.h
+  stdnoreturn.h
+  tbmintrin.h
+  tgmath.h
+  tmmintrin.h
+  unwind.h
+  vadefs.h
+  varargs.h
+  vecintrin.h
+  __wmmintrin_aes.h
+  wmmintrin.h
+  __wmmintrin_pclmul.h
+  x86intrin.h
+  xmmintrin.h
+  xopintrin.h
+  xsaveintrin.h
+  xsaveoptintrin.h
+  xsavecintrin.h
+  xsavesintrin.h
+  xtestintrin.h
+  )
+
+set(output_dir ${LLVM_LIBRARY_OUTPUT_INTDIR}/clang/${CLANG_VERSION}/include)
+
+# Generate arm_neon.h
+clang_tablegen(arm_neon.h -gen-arm-neon
+  SOURCE ${CLANG_SOURCE_DIR}/include/clang/Basic/arm_neon.td)
+
+set(out_files)
+foreach( f ${files} )
+  set( src ${CMAKE_CURRENT_SOURCE_DIR}/${f} )
+  set( dst ${output_dir}/${f} )
+  add_custom_command(OUTPUT ${dst}
+    DEPENDS ${src}
+    COMMAND ${CMAKE_COMMAND} -E copy_if_different ${src} ${dst}
+    COMMENT "Copying clang's ${f}...")
+  list(APPEND out_files ${dst})
+endforeach( f )
+
+add_custom_command(OUTPUT ${output_dir}/arm_neon.h 
+  DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/arm_neon.h
+  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_BINARY_DIR}/arm_neon.h ${output_dir}/arm_neon.h
+  COMMENT "Copying clang's arm_neon.h...")
+list(APPEND out_files ${output_dir}/arm_neon.h)
+
+add_custom_target(clang-headers ALL DEPENDS ${out_files})
+set_target_properties(clang-headers PROPERTIES FOLDER "Misc")
+
+install(
+  FILES ${files} ${CMAKE_CURRENT_BINARY_DIR}/arm_neon.h
+  COMPONENT clang-headers
+  PERMISSIONS OWNER_READ OWNER_WRITE GROUP_READ WORLD_READ
+  DESTINATION lib${LLVM_LIBDIR_SUFFIX}/clang/${CLANG_VERSION}/include)
+
+if (NOT CMAKE_CONFIGURATION_TYPES) # don't add this for IDEs.
+  add_custom_target(install-clang-headers
+    DEPENDS
+    COMMAND "${CMAKE_COMMAND}"
+            -DCMAKE_INSTALL_COMPONENT=clang-headers
+            -P "${CMAKE_BINARY_DIR}/cmake_install.cmake")
+endif()
diff --git a/25.0.2/clang-include/Intrin.h b/25.0.2/clang-include/Intrin.h
new file mode 100644
index 0000000..6c1d0d1
--- /dev/null
+++ b/25.0.2/clang-include/Intrin.h
@@ -0,0 +1,958 @@
+/* ===-------- Intrin.h ---------------------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/* Only include this if we're compiling for the windows platform. */
+#ifndef _MSC_VER
+#include_next <Intrin.h>
+#else
+
+#ifndef __INTRIN_H
+#define __INTRIN_H
+
+/* First include the standard intrinsics. */
+#if defined(__i386__) || defined(__x86_64__)
+#include <x86intrin.h>
+#endif
+
+/* For the definition of jmp_buf. */
+#if __STDC_HOSTED__
+#include <setjmp.h>
+#endif
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(__MMX__)
+/* And the random ones that aren't in those files. */
+__m64 _m_from_float(float);
+float _m_to_float(__m64);
+#endif
+
+/* Other assorted instruction intrinsics. */
+void __addfsbyte(unsigned long, unsigned char);
+void __addfsdword(unsigned long, unsigned long);
+void __addfsword(unsigned long, unsigned short);
+void __code_seg(const char *);
+static __inline__
+void __cpuid(int[4], int);
+static __inline__
+void __cpuidex(int[4], int, int);
+void __debugbreak(void);
+__int64 __emul(int, int);
+unsigned __int64 __emulu(unsigned int, unsigned int);
+void __cdecl __fastfail(unsigned int);
+unsigned int __getcallerseflags(void);
+static __inline__
+void __halt(void);
+unsigned char __inbyte(unsigned short);
+void __inbytestring(unsigned short, unsigned char *, unsigned long);
+void __incfsbyte(unsigned long);
+void __incfsdword(unsigned long);
+void __incfsword(unsigned long);
+unsigned long __indword(unsigned short);
+void __indwordstring(unsigned short, unsigned long *, unsigned long);
+void __int2c(void);
+void __invlpg(void *);
+unsigned short __inword(unsigned short);
+void __inwordstring(unsigned short, unsigned short *, unsigned long);
+void __lidt(void *);
+unsigned __int64 __ll_lshift(unsigned __int64, int);
+__int64 __ll_rshift(__int64, int);
+void __llwpcb(void *);
+unsigned char __lwpins32(unsigned int, unsigned int, unsigned int);
+void __lwpval32(unsigned int, unsigned int, unsigned int);
+unsigned int __lzcnt(unsigned int);
+unsigned short __lzcnt16(unsigned short);
+static __inline__
+void __movsb(unsigned char *, unsigned char const *, size_t);
+static __inline__
+void __movsd(unsigned long *, unsigned long const *, size_t);
+static __inline__
+void __movsw(unsigned short *, unsigned short const *, size_t);
+void __nop(void);
+void __nvreg_restore_fence(void);
+void __nvreg_save_fence(void);
+void __outbyte(unsigned short, unsigned char);
+void __outbytestring(unsigned short, unsigned char *, unsigned long);
+void __outdword(unsigned short, unsigned long);
+void __outdwordstring(unsigned short, unsigned long *, unsigned long);
+void __outword(unsigned short, unsigned short);
+void __outwordstring(unsigned short, unsigned short *, unsigned long);
+static __inline__
+unsigned int __popcnt(unsigned int);
+static __inline__
+unsigned short __popcnt16(unsigned short);
+unsigned long __readcr0(void);
+unsigned long __readcr2(void);
+static __inline__
+unsigned long __readcr3(void);
+unsigned long __readcr4(void);
+unsigned long __readcr8(void);
+unsigned int __readdr(unsigned int);
+#ifdef __i386__
+static __inline__
+unsigned char __readfsbyte(unsigned long);
+static __inline__
+unsigned long __readfsdword(unsigned long);
+static __inline__
+unsigned __int64 __readfsqword(unsigned long);
+static __inline__
+unsigned short __readfsword(unsigned long);
+#endif
+static __inline__
+unsigned __int64 __readmsr(unsigned long);
+unsigned __int64 __readpmc(unsigned long);
+unsigned long __segmentlimit(unsigned long);
+void __sidt(void *);
+void *__slwpcb(void);
+static __inline__
+void __stosb(unsigned char *, unsigned char, size_t);
+static __inline__
+void __stosd(unsigned long *, unsigned long, size_t);
+static __inline__
+void __stosw(unsigned short *, unsigned short, size_t);
+void __svm_clgi(void);
+void __svm_invlpga(void *, int);
+void __svm_skinit(int);
+void __svm_stgi(void);
+void __svm_vmload(size_t);
+void __svm_vmrun(size_t);
+void __svm_vmsave(size_t);
+void __ud2(void);
+unsigned __int64 __ull_rshift(unsigned __int64, int);
+void __vmx_off(void);
+void __vmx_vmptrst(unsigned __int64 *);
+void __wbinvd(void);
+void __writecr0(unsigned int);
+static __inline__
+void __writecr3(unsigned int);
+void __writecr4(unsigned int);
+void __writecr8(unsigned int);
+void __writedr(unsigned int, unsigned int);
+void __writefsbyte(unsigned long, unsigned char);
+void __writefsdword(unsigned long, unsigned long);
+void __writefsqword(unsigned long, unsigned __int64);
+void __writefsword(unsigned long, unsigned short);
+void __writemsr(unsigned long, unsigned __int64);
+static __inline__
+void *_AddressOfReturnAddress(void);
+static __inline__
+unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask);
+static __inline__
+unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask);
+static __inline__
+unsigned char _bittest(long const *, long);
+static __inline__
+unsigned char _bittestandcomplement(long *, long);
+static __inline__
+unsigned char _bittestandreset(long *, long);
+static __inline__
+unsigned char _bittestandset(long *, long);
+unsigned __int64 __cdecl _byteswap_uint64(unsigned __int64);
+unsigned long __cdecl _byteswap_ulong(unsigned long);
+unsigned short __cdecl _byteswap_ushort(unsigned short);
+void __cdecl _disable(void);
+void __cdecl _enable(void);
+long _InterlockedAddLargeStatistic(__int64 volatile *_Addend, long _Value);
+static __inline__
+long _InterlockedAnd(long volatile *_Value, long _Mask);
+static __inline__
+short _InterlockedAnd16(short volatile *_Value, short _Mask);
+static __inline__
+char _InterlockedAnd8(char volatile *_Value, char _Mask);
+unsigned char _interlockedbittestandreset(long volatile *, long);
+static __inline__
+unsigned char _interlockedbittestandset(long volatile *, long);
+static __inline__
+long __cdecl _InterlockedCompareExchange(long volatile *_Destination,
+                                         long _Exchange, long _Comparand);
+long _InterlockedCompareExchange_HLEAcquire(long volatile *, long, long);
+long _InterlockedCompareExchange_HLERelease(long volatile *, long, long);
+static __inline__
+short _InterlockedCompareExchange16(short volatile *_Destination,
+                                    short _Exchange, short _Comparand);
+static __inline__
+__int64 _InterlockedCompareExchange64(__int64 volatile *_Destination,
+                                      __int64 _Exchange, __int64 _Comparand);
+__int64 _InterlockedCompareExchange64_HLEAcquire(__int64 volatile *, __int64,
+                                                 __int64);
+__int64 _InterlockedCompareExchange64_HLERelease(__int64 volatile *, __int64,
+                                                 __int64);
+static __inline__
+char _InterlockedCompareExchange8(char volatile *_Destination, char _Exchange,
+                                  char _Comparand);
+void *_InterlockedCompareExchangePointer_HLEAcquire(void *volatile *, void *,
+                                                    void *);
+void *_InterlockedCompareExchangePointer_HLERelease(void *volatile *, void *,
+                                                    void *);
+static __inline__
+long __cdecl _InterlockedDecrement(long volatile *_Addend);
+static __inline__
+short _InterlockedDecrement16(short volatile *_Addend);
+long _InterlockedExchange(long volatile *_Target, long _Value);
+static __inline__
+short _InterlockedExchange16(short volatile *_Target, short _Value);
+static __inline__
+char _InterlockedExchange8(char volatile *_Target, char _Value);
+static __inline__
+long __cdecl _InterlockedExchangeAdd(long volatile *_Addend, long _Value);
+long _InterlockedExchangeAdd_HLEAcquire(long volatile *, long);
+long _InterlockedExchangeAdd_HLERelease(long volatile *, long);
+static __inline__
+short _InterlockedExchangeAdd16(short volatile *_Addend, short _Value);
+__int64 _InterlockedExchangeAdd64_HLEAcquire(__int64 volatile *, __int64);
+__int64 _InterlockedExchangeAdd64_HLERelease(__int64 volatile *, __int64);
+static __inline__
+char _InterlockedExchangeAdd8(char volatile *_Addend, char _Value);
+static __inline__
+long __cdecl _InterlockedIncrement(long volatile *_Addend);
+static __inline__
+short _InterlockedIncrement16(short volatile *_Addend);
+static __inline__
+long _InterlockedOr(long volatile *_Value, long _Mask);
+static __inline__
+short _InterlockedOr16(short volatile *_Value, short _Mask);
+static __inline__
+char _InterlockedOr8(char volatile *_Value, char _Mask);
+static __inline__
+long _InterlockedXor(long volatile *_Value, long _Mask);
+static __inline__
+short _InterlockedXor16(short volatile *_Value, short _Mask);
+static __inline__
+char _InterlockedXor8(char volatile *_Value, char _Mask);
+void __cdecl _invpcid(unsigned int, void *);
+static __inline__
+unsigned long __cdecl _lrotl(unsigned long, int);
+static __inline__
+unsigned long __cdecl _lrotr(unsigned long, int);
+static __inline__
+void _ReadBarrier(void);
+static __inline__
+void _ReadWriteBarrier(void);
+static __inline__
+void *_ReturnAddress(void);
+unsigned int _rorx_u32(unsigned int, const unsigned int);
+static __inline__
+unsigned int __cdecl _rotl(unsigned int _Value, int _Shift);
+static __inline__
+unsigned short _rotl16(unsigned short _Value, unsigned char _Shift);
+static __inline__
+unsigned __int64 __cdecl _rotl64(unsigned __int64 _Value, int _Shift);
+static __inline__
+unsigned char _rotl8(unsigned char _Value, unsigned char _Shift);
+static __inline__
+unsigned int __cdecl _rotr(unsigned int _Value, int _Shift);
+static __inline__
+unsigned short _rotr16(unsigned short _Value, unsigned char _Shift);
+static __inline__
+unsigned __int64 __cdecl _rotr64(unsigned __int64 _Value, int _Shift);
+static __inline__
+unsigned char _rotr8(unsigned char _Value, unsigned char _Shift);
+int _sarx_i32(int, unsigned int);
+#if __STDC_HOSTED__
+int __cdecl _setjmp(jmp_buf);
+#endif
+unsigned int _shlx_u32(unsigned int, unsigned int);
+unsigned int _shrx_u32(unsigned int, unsigned int);
+void _Store_HLERelease(long volatile *, long);
+void _Store64_HLERelease(__int64 volatile *, __int64);
+void _StorePointer_HLERelease(void *volatile *, void *);
+static __inline__
+void _WriteBarrier(void);
+unsigned __int32 _xbegin(void);
+void _xend(void);
+#define _XCR_XFEATURE_ENABLED_MASK 0
+static __inline__
+unsigned __int64 __cdecl _xgetbv(unsigned int);
+void __cdecl _xsetbv(unsigned int, unsigned __int64);
+
+/* These additional intrinsics are turned on in x64/amd64/x86_64 mode. */
+#ifdef __x86_64__
+void __addgsbyte(unsigned long, unsigned char);
+void __addgsdword(unsigned long, unsigned long);
+void __addgsqword(unsigned long, unsigned __int64);
+void __addgsword(unsigned long, unsigned short);
+static __inline__
+void __faststorefence(void);
+void __incgsbyte(unsigned long);
+void __incgsdword(unsigned long);
+void __incgsqword(unsigned long);
+void __incgsword(unsigned long);
+unsigned char __lwpins64(unsigned __int64, unsigned int, unsigned int);
+void __lwpval64(unsigned __int64, unsigned int, unsigned int);
+unsigned __int64 __lzcnt64(unsigned __int64);
+static __inline__
+void __movsq(unsigned long long *, unsigned long long const *, size_t);
+__int64 __mulh(__int64, __int64);
+static __inline__
+unsigned __int64 __popcnt64(unsigned __int64);
+static __inline__
+unsigned char __readgsbyte(unsigned long);
+static __inline__
+unsigned long __readgsdword(unsigned long);
+static __inline__
+unsigned __int64 __readgsqword(unsigned long);
+unsigned short __readgsword(unsigned long);
+unsigned __int64 __shiftleft128(unsigned __int64 _LowPart,
+                                unsigned __int64 _HighPart,
+                                unsigned char _Shift);
+unsigned __int64 __shiftright128(unsigned __int64 _LowPart,
+                                 unsigned __int64 _HighPart,
+                                 unsigned char _Shift);
+static __inline__
+void __stosq(unsigned __int64 *, unsigned __int64, size_t);
+unsigned char __vmx_on(unsigned __int64 *);
+unsigned char __vmx_vmclear(unsigned __int64 *);
+unsigned char __vmx_vmlaunch(void);
+unsigned char __vmx_vmptrld(unsigned __int64 *);
+unsigned char __vmx_vmread(size_t, size_t *);
+unsigned char __vmx_vmresume(void);
+unsigned char __vmx_vmwrite(size_t, size_t);
+void __writegsbyte(unsigned long, unsigned char);
+void __writegsdword(unsigned long, unsigned long);
+void __writegsqword(unsigned long, unsigned __int64);
+void __writegsword(unsigned long, unsigned short);
+static __inline__
+unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask);
+static __inline__
+unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask);
+static __inline__
+unsigned char _bittest64(__int64 const *, __int64);
+static __inline__
+unsigned char _bittestandcomplement64(__int64 *, __int64);
+static __inline__
+unsigned char _bittestandreset64(__int64 *, __int64);
+static __inline__
+unsigned char _bittestandset64(__int64 *, __int64);
+unsigned __int64 __cdecl _byteswap_uint64(unsigned __int64);
+long _InterlockedAnd_np(long volatile *_Value, long _Mask);
+short _InterlockedAnd16_np(short volatile *_Value, short _Mask);
+__int64 _InterlockedAnd64_np(__int64 volatile *_Value, __int64 _Mask);
+char _InterlockedAnd8_np(char volatile *_Value, char _Mask);
+unsigned char _interlockedbittestandreset64(__int64 volatile *, __int64);
+static __inline__
+unsigned char _interlockedbittestandset64(__int64 volatile *, __int64);
+long _InterlockedCompareExchange_np(long volatile *_Destination, long _Exchange,
+                                    long _Comparand);
+unsigned char _InterlockedCompareExchange128(__int64 volatile *_Destination,
+                                             __int64 _ExchangeHigh,
+                                             __int64 _ExchangeLow,
+                                             __int64 *_CompareandResult);
+unsigned char _InterlockedCompareExchange128_np(__int64 volatile *_Destination,
+                                                __int64 _ExchangeHigh,
+                                                __int64 _ExchangeLow,
+                                                __int64 *_ComparandResult);
+short _InterlockedCompareExchange16_np(short volatile *_Destination,
+                                       short _Exchange, short _Comparand);
+__int64 _InterlockedCompareExchange64_HLEAcquire(__int64 volatile *, __int64,
+                                                 __int64);
+__int64 _InterlockedCompareExchange64_HLERelease(__int64 volatile *, __int64,
+                                                 __int64);
+__int64 _InterlockedCompareExchange64_np(__int64 volatile *_Destination,
+                                         __int64 _Exchange, __int64 _Comparand);
+void *_InterlockedCompareExchangePointer(void *volatile *_Destination,
+                                         void *_Exchange, void *_Comparand);
+void *_InterlockedCompareExchangePointer_np(void *volatile *_Destination,
+                                            void *_Exchange, void *_Comparand);
+static __inline__
+__int64 _InterlockedDecrement64(__int64 volatile *_Addend);
+static __inline__
+__int64 _InterlockedExchange64(__int64 volatile *_Target, __int64 _Value);
+static __inline__
+__int64 _InterlockedExchangeAdd64(__int64 volatile *_Addend, __int64 _Value);
+void *_InterlockedExchangePointer(void *volatile *_Target, void *_Value);
+static __inline__
+__int64 _InterlockedIncrement64(__int64 volatile *_Addend);
+long _InterlockedOr_np(long volatile *_Value, long _Mask);
+short _InterlockedOr16_np(short volatile *_Value, short _Mask);
+static __inline__
+__int64 _InterlockedOr64(__int64 volatile *_Value, __int64 _Mask);
+__int64 _InterlockedOr64_np(__int64 volatile *_Value, __int64 _Mask);
+char _InterlockedOr8_np(char volatile *_Value, char _Mask);
+long _InterlockedXor_np(long volatile *_Value, long _Mask);
+short _InterlockedXor16_np(short volatile *_Value, short _Mask);
+static __inline__
+__int64 _InterlockedXor64(__int64 volatile *_Value, __int64 _Mask);
+__int64 _InterlockedXor64_np(__int64 volatile *_Value, __int64 _Mask);
+char _InterlockedXor8_np(char volatile *_Value, char _Mask);
+static __inline__
+__int64 _mul128(__int64 _Multiplier, __int64 _Multiplicand,
+                __int64 *_HighProduct);
+unsigned __int64 _rorx_u64(unsigned __int64, const unsigned int);
+__int64 _sarx_i64(__int64, unsigned int);
+#if __STDC_HOSTED__
+int __cdecl _setjmpex(jmp_buf);
+#endif
+unsigned __int64 _shlx_u64(unsigned __int64, unsigned int);
+unsigned __int64 _shrx_u64(unsigned __int64, unsigned int);
+/*
+ * Multiply two 64-bit integers and obtain a 128-bit result.
+ * The low half is returned directly and the high half is in an out parameter.
+ */
+static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS
+_umul128(unsigned __int64 _Multiplier, unsigned __int64 _Multiplicand,
+         unsigned __int64 *_HighProduct) {
+  unsigned __int128 _FullProduct =
+      (unsigned __int128)_Multiplier * (unsigned __int128)_Multiplicand;
+  *_HighProduct = _FullProduct >> 64;
+  return _FullProduct;
+}
+static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS
+__umulh(unsigned __int64 _Multiplier, unsigned __int64 _Multiplicand) {
+  unsigned __int128 _FullProduct =
+      (unsigned __int128)_Multiplier * (unsigned __int128)_Multiplicand;
+  return _FullProduct >> 64;
+}
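+
+/*
+ * Illustrative sketch, not part of the original header: recombining the two
+ * halves produced by _umul128 recovers the full 128 bit product.  The operands
+ * a and b are arbitrary 64 bit values, assumed for illustration.
+ *
+ *   unsigned __int64 lo, hi;
+ *   lo = _umul128(a, b, &hi);
+ *   // ((unsigned __int128)hi << 64 | lo) == (unsigned __int128)a * b
+ */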
+
+#endif /* __x86_64__ */
+
+/*----------------------------------------------------------------------------*\
+|* Multiplication
+\*----------------------------------------------------------------------------*/
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+__emul(int __in1, int __in2) {
+  return (__int64)__in1 * (__int64)__in2;
+}
+static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS
+__emulu(unsigned int __in1, unsigned int __in2) {
+  return (unsigned __int64)__in1 * (unsigned __int64)__in2;
+}
+/*----------------------------------------------------------------------------*\
+|* Bit Twiddling
+\*----------------------------------------------------------------------------*/
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_rotl8(unsigned char _Value, unsigned char _Shift) {
+  _Shift &= 0x7;
+  return _Shift ? (_Value << _Shift) | (_Value >> (8 - _Shift)) : _Value;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_rotr8(unsigned char _Value, unsigned char _Shift) {
+  _Shift &= 0x7;
+  return _Shift ? (_Value >> _Shift) | (_Value << (8 - _Shift)) : _Value;
+}
+static __inline__ unsigned short __DEFAULT_FN_ATTRS
+_rotl16(unsigned short _Value, unsigned char _Shift) {
+  _Shift &= 0xf;
+  return _Shift ? (_Value << _Shift) | (_Value >> (16 - _Shift)) : _Value;
+}
+static __inline__ unsigned short __DEFAULT_FN_ATTRS
+_rotr16(unsigned short _Value, unsigned char _Shift) {
+  _Shift &= 0xf;
+  return _Shift ? (_Value >> _Shift) | (_Value << (16 - _Shift)) : _Value;
+}
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_rotl(unsigned int _Value, int _Shift) {
+  _Shift &= 0x1f;
+  return _Shift ? (_Value << _Shift) | (_Value >> (32 - _Shift)) : _Value;
+}
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_rotr(unsigned int _Value, int _Shift) {
+  _Shift &= 0x1f;
+  return _Shift ? (_Value >> _Shift) | (_Value << (32 - _Shift)) : _Value;
+}
+static __inline__ unsigned long __DEFAULT_FN_ATTRS
+_lrotl(unsigned long _Value, int _Shift) {
+  _Shift &= 0x1f;
+  return _Shift ? (_Value << _Shift) | (_Value >> (32 - _Shift)) : _Value;
+}
+static __inline__ unsigned long __DEFAULT_FN_ATTRS
+_lrotr(unsigned long _Value, int _Shift) {
+  _Shift &= 0x1f;
+  return _Shift ? (_Value >> _Shift) | (_Value << (32 - _Shift)) : _Value;
+}
+static
+__inline__ unsigned __int64 __DEFAULT_FN_ATTRS
+_rotl64(unsigned __int64 _Value, int _Shift) {
+  _Shift &= 0x3f;
+  return _Shift ? (_Value << _Shift) | (_Value >> (64 - _Shift)) : _Value;
+}
+static
+__inline__ unsigned __int64 __DEFAULT_FN_ATTRS
+_rotr64(unsigned __int64 _Value, int _Shift) {
+  _Shift &= 0x3f;
+  return _Shift ? (_Value >> _Shift) | (_Value << (64 - _Shift)) : _Value;
+}
+/*----------------------------------------------------------------------------*\
+|* Bit Counting and Testing
+\*----------------------------------------------------------------------------*/
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_BitScanForward(unsigned long *_Index, unsigned long _Mask) {
+  if (!_Mask)
+    return 0;
+  *_Index = __builtin_ctzl(_Mask);
+  return 1;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_BitScanReverse(unsigned long *_Index, unsigned long _Mask) {
+  if (!_Mask)
+    return 0;
+  *_Index = 31 - __builtin_clzl(_Mask);
+  return 1;
+}
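+/*
+ * Illustrative usage sketch, not part of the original header: the return value
+ * signals whether any bit was set, and the index of the lowest set bit comes
+ * back through the out parameter.
+ *
+ *   unsigned long idx;
+ *   if (_BitScanForward(&idx, 0x50ul))
+ *       ;  // idx == 4, the lowest set bit of 0x50
+ */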
+static __inline__ unsigned short __DEFAULT_FN_ATTRS
+__popcnt16(unsigned short _Value) {
+  return __builtin_popcount((int)_Value);
+}
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__popcnt(unsigned int _Value) {
+  return __builtin_popcount(_Value);
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_bittest(long const *_BitBase, long _BitPos) {
+  return (*_BitBase >> _BitPos) & 1;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_bittestandcomplement(long *_BitBase, long _BitPos) {
+  unsigned char _Res = (*_BitBase >> _BitPos) & 1;
+  *_BitBase = *_BitBase ^ (1 << _BitPos);
+  return _Res;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_bittestandreset(long *_BitBase, long _BitPos) {
+  unsigned char _Res = (*_BitBase >> _BitPos) & 1;
+  *_BitBase = *_BitBase & ~(1 << _BitPos);
+  return _Res;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_bittestandset(long *_BitBase, long _BitPos) {
+  unsigned char _Res = (*_BitBase >> _BitPos) & 1;
+  *_BitBase = *_BitBase | (1 << _BitPos);
+  return _Res;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_interlockedbittestandset(long volatile *_BitBase, long _BitPos) {
+  long _PrevVal = __atomic_fetch_or(_BitBase, 1l << _BitPos, __ATOMIC_SEQ_CST);
+  return (_PrevVal >> _BitPos) & 1;
+}
+#ifdef __x86_64__
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask) {
+  if (!_Mask)
+    return 0;
+  *_Index = __builtin_ctzll(_Mask);
+  return 1;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask) {
+  if (!_Mask)
+    return 0;
+  *_Index = 63 - __builtin_clzll(_Mask);
+  return 1;
+}
+static __inline__
+unsigned __int64 __DEFAULT_FN_ATTRS
+__popcnt64(unsigned __int64 _Value) {
+  return __builtin_popcountll(_Value);
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_bittest64(__int64 const *_BitBase, __int64 _BitPos) {
+  return (*_BitBase >> _BitPos) & 1;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_bittestandcomplement64(__int64 *_BitBase, __int64 _BitPos) {
+  unsigned char _Res = (*_BitBase >> _BitPos) & 1;
+  *_BitBase = *_BitBase ^ (1ll << _BitPos);
+  return _Res;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_bittestandreset64(__int64 *_BitBase, __int64 _BitPos) {
+  unsigned char _Res = (*_BitBase >> _BitPos) & 1;
+  *_BitBase = *_BitBase & ~(1ll << _BitPos);
+  return _Res;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_bittestandset64(__int64 *_BitBase, __int64 _BitPos) {
+  unsigned char _Res = (*_BitBase >> _BitPos) & 1;
+  *_BitBase = *_BitBase | (1ll << _BitPos);
+  return _Res;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_interlockedbittestandset64(__int64 volatile *_BitBase, __int64 _BitPos) {
+  long long _PrevVal =
+      __atomic_fetch_or(_BitBase, 1ll << _BitPos, __ATOMIC_SEQ_CST);
+  return (_PrevVal >> _BitPos) & 1;
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Exchange Add
+\*----------------------------------------------------------------------------*/
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd8(char volatile *_Addend, char _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_SEQ_CST);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd16(short volatile *_Addend, short _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_SEQ_CST);
+}
+#ifdef __x86_64__
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd64(__int64 volatile *_Addend, __int64 _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_SEQ_CST);
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Exchange Sub
+\*----------------------------------------------------------------------------*/
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedExchangeSub8(char volatile *_Subend, char _Value) {
+  return __atomic_fetch_sub(_Subend, _Value, __ATOMIC_SEQ_CST);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedExchangeSub16(short volatile *_Subend, short _Value) {
+  return __atomic_fetch_sub(_Subend, _Value, __ATOMIC_SEQ_CST);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedExchangeSub(long volatile *_Subend, long _Value) {
+  return __atomic_fetch_sub(_Subend, _Value, __ATOMIC_SEQ_CST);
+}
+#ifdef __x86_64__
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedExchangeSub64(__int64 volatile *_Subend, __int64 _Value) {
+  return __atomic_fetch_sub(_Subend, _Value, __ATOMIC_SEQ_CST);
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Increment
+\*----------------------------------------------------------------------------*/
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedIncrement16(short volatile *_Value) {
+  return __atomic_add_fetch(_Value, 1, __ATOMIC_SEQ_CST);
+}
+#ifdef __x86_64__
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedIncrement64(__int64 volatile *_Value) {
+  return __atomic_add_fetch(_Value, 1, __ATOMIC_SEQ_CST);
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Decrement
+\*----------------------------------------------------------------------------*/
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedDecrement16(short volatile *_Value) {
+  return __atomic_sub_fetch(_Value, 1, __ATOMIC_SEQ_CST);
+}
+#ifdef __x86_64__
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedDecrement64(__int64 volatile *_Value) {
+  return __atomic_sub_fetch(_Value, 1, __ATOMIC_SEQ_CST);
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked And
+\*----------------------------------------------------------------------------*/
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedAnd8(char volatile *_Value, char _Mask) {
+  return __atomic_and_fetch(_Value, _Mask, __ATOMIC_SEQ_CST);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedAnd16(short volatile *_Value, short _Mask) {
+  return __atomic_and_fetch(_Value, _Mask, __ATOMIC_SEQ_CST);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedAnd(long volatile *_Value, long _Mask) {
+  return __atomic_and_fetch(_Value, _Mask, __ATOMIC_SEQ_CST);
+}
+#ifdef __x86_64__
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedAnd64(__int64 volatile *_Value, __int64 _Mask) {
+  return __atomic_and_fetch(_Value, _Mask, __ATOMIC_SEQ_CST);
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Or
+\*----------------------------------------------------------------------------*/
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedOr8(char volatile *_Value, char _Mask) {
+  return __atomic_or_fetch(_Value, _Mask, __ATOMIC_SEQ_CST);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedOr16(short volatile *_Value, short _Mask) {
+  return __atomic_or_fetch(_Value, _Mask, __ATOMIC_SEQ_CST);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedOr(long volatile *_Value, long _Mask) {
+  return __atomic_or_fetch(_Value, _Mask, __ATOMIC_SEQ_CST);
+}
+#ifdef __x86_64__
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedOr64(__int64 volatile *_Value, __int64 _Mask) {
+  return __atomic_or_fetch(_Value, _Mask, __ATOMIC_SEQ_CST);
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Xor
+\*----------------------------------------------------------------------------*/
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedXor8(char volatile *_Value, char _Mask) {
+  return __atomic_xor_fetch(_Value, _Mask, __ATOMIC_SEQ_CST);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedXor16(short volatile *_Value, short _Mask) {
+  return __atomic_xor_fetch(_Value, _Mask, __ATOMIC_SEQ_CST);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedXor(long volatile *_Value, long _Mask) {
+  return __atomic_xor_fetch(_Value, _Mask, __ATOMIC_SEQ_CST);
+}
+#ifdef __x86_64__
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedXor64(__int64 volatile *_Value, __int64 _Mask) {
+  return __atomic_xor_fetch(_Value, _Mask, __ATOMIC_SEQ_CST);
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Exchange
+\*----------------------------------------------------------------------------*/
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedExchange8(char volatile *_Target, char _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_SEQ_CST);
+  return _Value;
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedExchange16(short volatile *_Target, short _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_SEQ_CST);
+  return _Value;
+}
+#ifdef __x86_64__
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedExchange64(__int64 volatile *_Target, __int64 _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_SEQ_CST);
+  return _Value;
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Compare Exchange
+\*----------------------------------------------------------------------------*/
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange8(char volatile *_Destination,
+                             char _Exchange, char _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
+  return _Comparand;
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange16(short volatile *_Destination,
+                              short _Exchange, short _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
+  return _Comparand;
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange64(__int64 volatile *_Destination,
+                              __int64 _Exchange, __int64 _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
+  return _Comparand;
+}
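+/*
+ * Illustrative usage sketch, not part of the original header: these intrinsics
+ * return the previous contents, so a successful exchange is detected by
+ * comparing the result with the comparand.  The variable `flag` is assumed.
+ *
+ *   short expected = 0;
+ *   if (_InterlockedCompareExchange16(&flag, 1, expected) == expected) {
+ *       // we observed 0 and stored 1
+ *   }
+ */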
+/*----------------------------------------------------------------------------*\
+|* Barriers
+\*----------------------------------------------------------------------------*/
+static __inline__ void __DEFAULT_FN_ATTRS
+__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead")))
+_ReadWriteBarrier(void) {
+  __atomic_signal_fence(__ATOMIC_SEQ_CST);
+}
+static __inline__ void __DEFAULT_FN_ATTRS
+__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead")))
+_ReadBarrier(void) {
+  __atomic_signal_fence(__ATOMIC_SEQ_CST);
+}
+static __inline__ void __DEFAULT_FN_ATTRS
+__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead")))
+_WriteBarrier(void) {
+  __atomic_signal_fence(__ATOMIC_SEQ_CST);
+}
+#ifdef __x86_64__
+static __inline__ void __DEFAULT_FN_ATTRS
+__faststorefence(void) {
+  __atomic_thread_fence(__ATOMIC_SEQ_CST);
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* readfs, readgs
+|* (Pointers in address space #256 and #257 are relative to the GS and FS
+|* segment registers, respectively.)
+\*----------------------------------------------------------------------------*/
+#define __ptr_to_addr_space(__addr_space_nbr, __type, __offset)              \
+    ((volatile __type __attribute__((__address_space__(__addr_space_nbr)))*) \
+    (__offset))
+
+#ifdef __i386__
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+__readfsbyte(unsigned long __offset) {
+  return *__ptr_to_addr_space(257, unsigned char, __offset);
+}
+static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS
+__readfsqword(unsigned long __offset) {
+  return *__ptr_to_addr_space(257, unsigned __int64, __offset);
+}
+static __inline__ unsigned short __DEFAULT_FN_ATTRS
+__readfsword(unsigned long __offset) {
+  return *__ptr_to_addr_space(257, unsigned short, __offset);
+}
+#endif
+#ifdef __x86_64__
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+__readgsbyte(unsigned long __offset) {
+  return *__ptr_to_addr_space(256, unsigned char, __offset);
+}
+static __inline__ unsigned long __DEFAULT_FN_ATTRS
+__readgsdword(unsigned long __offset) {
+  return *__ptr_to_addr_space(256, unsigned long, __offset);
+}
+static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS
+__readgsqword(unsigned long __offset) {
+  return *__ptr_to_addr_space(256, unsigned __int64, __offset);
+}
+static __inline__ unsigned short __DEFAULT_FN_ATTRS
+__readgsword(unsigned long __offset) {
+  return *__ptr_to_addr_space(256, unsigned short, __offset);
+}
+#endif
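+/* Usage sketch (illustrative): on x86_64 Windows the TEB self-pointer lives at
+ * gs:[0x30], so the current thread's TEB address can be read as
+ *
+ *   unsigned __int64 teb = __readgsqword(0x30);
+ */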
+#undef __ptr_to_addr_space
+/*----------------------------------------------------------------------------*\
+|* movs, stos
+\*----------------------------------------------------------------------------*/
+#if defined(__i386__) || defined(__x86_64__)
+static __inline__ void __DEFAULT_FN_ATTRS
+__movsb(unsigned char *__dst, unsigned char const *__src, size_t __n) {
+  __asm__("rep movsb" : : "D"(__dst), "S"(__src), "c"(__n)
+                        : "%edi", "%esi", "%ecx");
+}
+static __inline__ void __DEFAULT_FN_ATTRS
+__movsd(unsigned long *__dst, unsigned long const *__src, size_t __n) {
+  __asm__("rep movsl" : : "D"(__dst), "S"(__src), "c"(__n)
+                        : "%edi", "%esi", "%ecx");
+}
+static __inline__ void __DEFAULT_FN_ATTRS
+__movsw(unsigned short *__dst, unsigned short const *__src, size_t __n) {
+  __asm__("rep movsw" : : "D"(__dst), "S"(__src), "c"(__n)
+                        : "%edi", "%esi", "%ecx");
+}
+static __inline__ void __DEFAULT_FN_ATTRS
+__stosb(unsigned char *__dst, unsigned char __x, size_t __n) {
+  __asm__("rep stosb" : : "D"(__dst), "a"(__x), "c"(__n)
+                        : "%edi", "%ecx");
+}
+static __inline__ void __DEFAULT_FN_ATTRS
+__stosd(unsigned long *__dst, unsigned long __x, size_t __n) {
+  __asm__("rep stosl" : : "D"(__dst), "a"(__x), "c"(__n)
+                        : "%edi", "%ecx");
+}
+static __inline__ void __DEFAULT_FN_ATTRS
+__stosw(unsigned short *__dst, unsigned short __x, size_t __n) {
+  __asm__("rep stosw" : : "D"(__dst), "a"(__x), "c"(__n)
+                        : "%edi", "%ecx");
+}
+#endif
+#ifdef __x86_64__
+static __inline__ void __DEFAULT_FN_ATTRS
+__movsq(unsigned long long *__dst, unsigned long long const *__src, size_t __n) {
+  __asm__("rep movsq" : : "D"(__dst), "S"(__src), "c"(__n)
+                        : "%edi", "%esi", "%ecx");
+}
+static __inline__ void __DEFAULT_FN_ATTRS
+__stosq(unsigned __int64 *__dst, unsigned __int64 __x, size_t __n) {
+  __asm__("rep stosq" : : "D"(__dst), "a"(__x), "c"(__n)
+                        : "%edi", "%ecx");
+}
+#endif
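+/* Usage sketch (illustrative, dst/src/n are hypothetical): these wrappers map
+ * directly onto the REP string instructions, e.g.
+ *
+ *   __stosb(dst, 0, n);      // like memset(dst, 0, n)
+ *   __movsb(dst, src, n);    // like memcpy(dst, src, n) for non-overlapping buffers
+ */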
+
+/*----------------------------------------------------------------------------*\
+|* Misc
+\*----------------------------------------------------------------------------*/
+static __inline__ void * __DEFAULT_FN_ATTRS
+_AddressOfReturnAddress(void) {
+  return (void*)((char*)__builtin_frame_address(0) + sizeof(void*));
+}
+static __inline__ void * __DEFAULT_FN_ATTRS
+_ReturnAddress(void) {
+  return __builtin_return_address(0);
+}
+#if defined(__i386__) || defined(__x86_64__)
+static __inline__ void __DEFAULT_FN_ATTRS
+__cpuid(int __info[4], int __level) {
+  __asm__ ("cpuid" : "=a"(__info[0]), "=b" (__info[1]), "=c"(__info[2]), "=d"(__info[3])
+                   : "a"(__level));
+}
+static __inline__ void __DEFAULT_FN_ATTRS
+__cpuidex(int __info[4], int __level, int __ecx) {
+  __asm__ ("cpuid" : "=a"(__info[0]), "=b" (__info[1]), "=c"(__info[2]), "=d"(__info[3])
+                   : "a"(__level), "c"(__ecx));
+}
+static __inline__ unsigned __int64 __cdecl __DEFAULT_FN_ATTRS
+_xgetbv(unsigned int __xcr_no) {
+  unsigned int __eax, __edx;
+  __asm__ ("xgetbv" : "=a" (__eax), "=d" (__edx) : "c" (__xcr_no));
+  return ((unsigned __int64)__edx << 32) | __eax;
+}
+static __inline__ void __DEFAULT_FN_ATTRS
+__halt(void) {
+  __asm__ volatile ("hlt");
+}
+#endif
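+/* Usage sketch (illustrative): leaf 0 reports the highest supported leaf in
+ * __info[0] and the 12-byte vendor string in EBX, EDX, ECX order
+ * (__info[1], __info[3], __info[2]):
+ *
+ *   int info[4];
+ *   __cpuid(info, 0);
+ */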
+
+/*----------------------------------------------------------------------------*\
+|* Privileged intrinsics
+\*----------------------------------------------------------------------------*/
+#if defined(__i386__) || defined(__x86_64__)
+static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS
+__readmsr(unsigned long __register) {
+  // Loads the contents of a 64-bit model specific register (MSR) specified in
+  // the ECX register into registers EDX:EAX. The EDX register is loaded with
+  // the high-order 32 bits of the MSR and the EAX register is loaded with the
+  // low-order 32 bits. If less than 64 bits are implemented in the MSR being
+  // read, the values returned to EDX:EAX in unimplemented bit locations are
+  // undefined.
+  unsigned long __edx;
+  unsigned long __eax;
+  __asm__ ("rdmsr" : "=d"(__edx), "=a"(__eax) : "c"(__register));
+  return (((unsigned __int64)__edx) << 32) | (unsigned __int64)__eax;
+}
+
+static __inline__ unsigned long __DEFAULT_FN_ATTRS
+__readcr3(void) {
+  unsigned long __cr3_val;
+  __asm__ __volatile__ ("mov %%cr3, %0" : "=q"(__cr3_val) : : "memory");
+  return __cr3_val;
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+__writecr3(unsigned int __cr3_val) {
+  __asm__ ("mov %0, %%cr3" : : "q"(__cr3_val) : "memory");
+}
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __INTRIN_H */
+#endif /* _MSC_VER */
diff --git a/25.0.2/clang-include/LICENSE.TXT b/25.0.2/clang-include/LICENSE.TXT
new file mode 100644
index 0000000..fc4afae
--- /dev/null
+++ b/25.0.2/clang-include/LICENSE.TXT
@@ -0,0 +1,63 @@
+==============================================================================
+LLVM Release License
+==============================================================================
+University of Illinois/NCSA
+Open Source License
+
+Copyright (c) 2007-2015 University of Illinois at Urbana-Champaign.
+All rights reserved.
+
+Developed by:
+
+    LLVM Team
+
+    University of Illinois at Urbana-Champaign
+
+    http://llvm.org
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal with
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimers.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+      this list of conditions and the following disclaimers in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the names of the LLVM Team, University of Illinois at
+      Urbana-Champaign, nor the names of its contributors may be used to
+      endorse or promote products derived from this Software without specific
+      prior written permission.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+SOFTWARE.
+
+==============================================================================
+The LLVM software contains code written by third parties.  Such software will
+have its own individual LICENSE.TXT file in the directory in which it appears.
+This file will describe the copyrights, license, and restrictions which apply
+to that code.
+
+The disclaimer of warranty in the University of Illinois Open Source License
+applies to all code in the LLVM Distribution, and nothing in any of the
+other licenses gives permission to use the names of the LLVM Team or the
+University of Illinois to endorse or promote products derived from this
+Software.
+
+The following pieces of software have additional or alternate copyrights,
+licenses, and/or restrictions:
+
+Program             Directory
+-------             ---------
+<none yet>
+
diff --git a/25.0.2/clang-include/__clang_cuda_runtime_wrapper.h b/25.0.2/clang-include/__clang_cuda_runtime_wrapper.h
new file mode 100644
index 0000000..8e5f033
--- /dev/null
+++ b/25.0.2/clang-include/__clang_cuda_runtime_wrapper.h
@@ -0,0 +1,216 @@
+/*===---- __clang_cuda_runtime_wrapper.h - CUDA runtime support -------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/*
+ * WARNING: This header is intended to be directly -include'd by
+ * the compiler and is not supposed to be included by users.
+ *
+ * CUDA headers are implemented in a way that currently makes it
+ * impossible for user code to #include them directly when compiling with
+ * Clang. They present a different view of CUDA-supplied functions
+ * depending on where in NVCC's compilation pipeline the headers are
+ * included. Neither of these modes provides function definitions with
+ * correct attributes, so we use the preprocessor to force the headers
+ * into a form that Clang can use.
+ *
+ * Similarly to NVCC, which -include's cuda_runtime.h, Clang -include's
+ * this file during every CUDA compilation.
+ */
+
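+// Illustrative invocation (file name and GPU arch are placeholders): Clang
+// pre-includes this wrapper automatically when compiling CUDA sources, e.g.
+//   clang++ -x cuda --cuda-gpu-arch=sm_35 -c foo.cu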
+#ifndef __CLANG_CUDA_RUNTIME_WRAPPER_H__
+#define __CLANG_CUDA_RUNTIME_WRAPPER_H__
+
+#if defined(__CUDA__) && defined(__clang__)
+
+// Include some standard headers to avoid CUDA headers including them
+// while some required macros (like __THROW) are in a weird state.
+#include <stdlib.h>
+#include <cmath>
+
+// Preserve common macros that will be changed below by us or by CUDA
+// headers.
+#pragma push_macro("__THROW")
+#pragma push_macro("__CUDA_ARCH__")
+
+// WARNING: Preprocessor hacks below are based on specific details of
+// CUDA-7.x headers and are not expected to work with any other
+// version of CUDA headers.
+#include "cuda.h"
+#if !defined(CUDA_VERSION)
+#error "cuda.h did not define CUDA_VERSION"
+#elif CUDA_VERSION < 7000 || CUDA_VERSION > 7050
+#error "Unsupported CUDA version!"
+#endif
+
+// Make largest subset of device functions available during host
+// compilation -- SM_35 for the time being.
+#ifndef __CUDA_ARCH__
+#define __CUDA_ARCH__ 350
+#endif
+
+#include "cuda_builtin_vars.h"
+
+// No need for device_launch_parameters.h as cuda_builtin_vars.h above
+// has taken care of builtin variables declared in the file.
+#define __DEVICE_LAUNCH_PARAMETERS_H__
+
+// {math,device}_functions.h only have declarations of the
+// functions. We don't need them as we're going to pull in their
+// definitions from .hpp files.
+#define __DEVICE_FUNCTIONS_H__
+#define __MATH_FUNCTIONS_H__
+
+#undef __CUDACC__
+#define __CUDABE__
+// Disables definitions of device-side runtime support stubs in
+// cuda_device_runtime_api.h
+#define __CUDADEVRT_INTERNAL__
+#include "host_config.h"
+#include "host_defines.h"
+#include "driver_types.h"
+#include "common_functions.h"
+#undef __CUDADEVRT_INTERNAL__
+
+#undef __CUDABE__
+#define __CUDACC__
+#include "cuda_runtime.h"
+
+#undef __CUDACC__
+#define __CUDABE__
+
+// CUDA headers use __nvvm_memcpy and __nvvm_memset which Clang does
+// not have at the moment. Emulate them with a builtin memcpy/memset.
+#define __nvvm_memcpy(s,d,n,a) __builtin_memcpy(s,d,n)
+#define __nvvm_memset(d,c,n,a) __builtin_memset(d,c,n)
+
+#include "crt/host_runtime.h"
+#include "crt/device_runtime.h"
+// device_runtime.h defines __cxa_* macros that will conflict with
+// cxxabi.h.
+// FIXME: redefine these as __device__ functions.
+#undef __cxa_vec_ctor
+#undef __cxa_vec_cctor
+#undef __cxa_vec_dtor
+#undef __cxa_vec_new2
+#undef __cxa_vec_new3
+#undef __cxa_vec_delete2
+#undef __cxa_vec_delete
+#undef __cxa_vec_delete3
+#undef __cxa_pure_virtual
+
+// We need decls for functions in CUDA's libdevice with the __device__
+// attribute only. Alas, they come either as __host__ __device__ or
+// with no attributes at all. To work around that, define __CUDACC_RTC__,
+// which produces the HD variant, and undef __host__, which gives us the
+// desired decls with the __device__ attribute.
+#pragma push_macro("__host__")
+#define __host__
+#define __CUDACC_RTC__
+#include "device_functions_decls.h"
+#undef __CUDACC_RTC__
+
+// Temporarily poison __host__ macro to ensure it's not used by any of
+// the headers we're about to include.
+#define __host__ UNEXPECTED_HOST_ATTRIBUTE
+
+// device_functions.hpp and math_functions*.hpp use 'static
+// __forceinline__' (with no __device__) for definitions of device
+// functions. Temporarily redefine __forceinline__ to include
+// __device__.
+#pragma push_macro("__forceinline__")
+#define __forceinline__ __device__ __inline__ __attribute__((always_inline))
+#include "device_functions.hpp"
+#include "math_functions.hpp"
+#include "math_functions_dbl_ptx3.hpp"
+#pragma pop_macro("__forceinline__")
+
+// Pull in host-only functions that are only available when neither
+// __CUDACC__ nor __CUDABE__ are defined.
+#undef __MATH_FUNCTIONS_HPP__
+#undef __CUDABE__
+#include "math_functions.hpp"
+// Alas, additional overloads for these functions are hard to get to.
+// Considering that we only need these overloads for a few functions,
+// we can provide them here.
+static inline float rsqrt(float a) { return rsqrtf(a); }
+static inline float rcbrt(float a) { return rcbrtf(a); }
+static inline float sinpi(float a) { return sinpif(a); }
+static inline float cospi(float a) { return cospif(a); }
+static inline void sincospi(float a, float *b, float *c) {
+  return sincospif(a, b, c);
+}
+static inline float erfcinv(float a) { return erfcinvf(a); }
+static inline float normcdfinv(float a) { return normcdfinvf(a); }
+static inline float normcdf(float a) { return normcdff(a); }
+static inline float erfcx(float a) { return erfcxf(a); }
+
+// For some reason the single-argument variant is not always declared by
+// the CUDA headers. Alas, device_functions.hpp, included below, needs it.
+static inline __device__ void __brkpt(int c) { __brkpt(); }
+
+// Now include *.hpp with definitions of various GPU functions.  Alas,
+// a lot of things get declared/defined with the __host__ attribute, which
+// we don't want, so we have to define it out. We also have to include
+// {device,math}_functions.hpp again in order to extract the other
+// branch of the #if/else inside.
+
+#define __host__
+#undef __CUDABE__
+#define __CUDACC__
+#undef __DEVICE_FUNCTIONS_HPP__
+#include "device_functions.hpp"
+#include "device_atomic_functions.hpp"
+#include "sm_20_atomic_functions.hpp"
+#include "sm_32_atomic_functions.hpp"
+#include "sm_20_intrinsics.hpp"
+// sm_30_intrinsics.h has declarations that use a default argument, so
+// we have to include it, and it will in turn include the .hpp file.
+#include "sm_30_intrinsics.h"
+#include "sm_32_intrinsics.hpp"
+#undef __MATH_FUNCTIONS_HPP__
+#include "math_functions.hpp"
+#pragma pop_macro("__host__")
+
+#include "texture_indirect_functions.h"
+
+// Restore state of __CUDA_ARCH__ and __THROW we had on entry.
+#pragma pop_macro("__CUDA_ARCH__")
+#pragma pop_macro("__THROW")
+
+// Set up compiler macros expected to be seen during compilation.
+#undef __CUDABE__
+#define __CUDACC__
+#define __NVCC__
+
+#if defined(__CUDA_ARCH__)
+// We need to emit IR declaration for non-existing __nvvm_reflect() to
+// let backend know that it should be treated as const nothrow
+// function which is what NVVMReflect pass expects to see.
+extern "C" __device__ __attribute__((const)) int __nvvm_reflect(const void *);
+static __device__ __attribute__((used)) int __nvvm_reflect_anchor() {
+  return __nvvm_reflect("NONE");
+}
+#endif
+
+#endif // __CUDA__
+#endif // __CLANG_CUDA_RUNTIME_WRAPPER_H__
diff --git a/25.0.2/clang-include/__stddef_max_align_t.h b/25.0.2/clang-include/__stddef_max_align_t.h
new file mode 100644
index 0000000..1e10ca9
--- /dev/null
+++ b/25.0.2/clang-include/__stddef_max_align_t.h
@@ -0,0 +1,43 @@
+/*===---- __stddef_max_align_t.h - Definition of max_align_t for modules ---===
+ *
+ * Copyright (c) 2014 Chandler Carruth
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CLANG_MAX_ALIGN_T_DEFINED
+#define __CLANG_MAX_ALIGN_T_DEFINED
+
+#if defined(_MSC_VER)
+typedef double max_align_t;
+#elif defined(__APPLE__)
+typedef long double max_align_t;
+#else
+// Define 'max_align_t' to match the GCC definition.
+typedef struct {
+  long long __clang_max_align_nonce1
+      __attribute__((__aligned__(__alignof__(long long))));
+  long double __clang_max_align_nonce2
+      __attribute__((__aligned__(__alignof__(long double))));
+} max_align_t;
+#endif
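+
+// Illustrative property of the GCC-compatible definition above: the struct
+// carries a member aligned to long double, so the (hypothetical) check
+//   _Static_assert(_Alignof(max_align_t) >= _Alignof(long double), "");
+// holds.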
+
+#endif
diff --git a/25.0.2/clang-include/__wmmintrin_aes.h b/25.0.2/clang-include/__wmmintrin_aes.h
new file mode 100644
index 0000000..100799e
--- /dev/null
+++ b/25.0.2/clang-include/__wmmintrin_aes.h
@@ -0,0 +1,66 @@
+/*===---- __wmmintrin_aes.h - AES intrinsics -------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef _WMMINTRIN_AES_H
+#define _WMMINTRIN_AES_H
+
+#include <emmintrin.h>
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("aes")))
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_aesenc_si128(__m128i __V, __m128i __R)
+{
+  return (__m128i)__builtin_ia32_aesenc128(__V, __R);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_aesenclast_si128(__m128i __V, __m128i __R)
+{
+  return (__m128i)__builtin_ia32_aesenclast128(__V, __R);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_aesdec_si128(__m128i __V, __m128i __R)
+{
+  return (__m128i)__builtin_ia32_aesdec128(__V, __R);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_aesdeclast_si128(__m128i __V, __m128i __R)
+{
+  return (__m128i)__builtin_ia32_aesdeclast128(__V, __R);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_aesimc_si128(__m128i __V)
+{
+  return (__m128i)__builtin_ia32_aesimc128(__V);
+}
+
+#define _mm_aeskeygenassist_si128(C, R) \
+  (__m128i)__builtin_ia32_aeskeygenassist128((__v2di)(__m128i)(C), (int)(R))
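+
+/* Usage sketch (illustrative, __key[] is a hypothetical array of 11 expanded
+ * round keys): an AES-128 block encryption is an initial whitening XOR, nine
+ * _mm_aesenc_si128 rounds and a final _mm_aesenclast_si128 round:
+ *
+ *   __m128i state = _mm_xor_si128(block, __key[0]);
+ *   for (int i = 1; i < 10; ++i)
+ *     state = _mm_aesenc_si128(state, __key[i]);
+ *   state = _mm_aesenclast_si128(state, __key[10]);
+ */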
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif  /* _WMMINTRIN_AES_H */
diff --git a/25.0.2/clang-include/__wmmintrin_pclmul.h b/25.0.2/clang-include/__wmmintrin_pclmul.h
new file mode 100644
index 0000000..68e944e
--- /dev/null
+++ b/25.0.2/clang-include/__wmmintrin_pclmul.h
@@ -0,0 +1,30 @@
+/*===---- __wmmintrin_pclmul.h - PCLMUL intrinsics --------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef _WMMINTRIN_PCLMUL_H
+#define _WMMINTRIN_PCLMUL_H
+
+#define _mm_clmulepi64_si128(__X, __Y, __I) \
+  ((__m128i)__builtin_ia32_pclmulqdq128((__v2di)(__m128i)(__X), \
+                                        (__v2di)(__m128i)(__Y), (char)(__I)))
+
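+/* Usage sketch (illustrative): bit 0 of the immediate selects the 64-bit half
+ * of __X and bit 4 selects the half of __Y, so the carry-less product of the
+ * two low halves is
+ *
+ *   __m128i prod = _mm_clmulepi64_si128(x, y, 0x00);
+ */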
+#endif /* _WMMINTRIN_PCLMUL_H */
diff --git a/25.0.2/clang-include/adxintrin.h b/25.0.2/clang-include/adxintrin.h
new file mode 100644
index 0000000..ee34728
--- /dev/null
+++ b/25.0.2/clang-include/adxintrin.h
@@ -0,0 +1,86 @@
+/*===---- adxintrin.h - ADX intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <adxintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __ADXINTRIN_H
+#define __ADXINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
+
+/* Intrinsics that are available only if __ADX__ is defined */
+static __inline unsigned char __attribute__((__always_inline__, __nodebug__, __target__("adx")))
+_addcarryx_u32(unsigned char __cf, unsigned int __x, unsigned int __y,
+               unsigned int *__p)
+{
+  return __builtin_ia32_addcarryx_u32(__cf, __x, __y, __p);
+}
+
+#ifdef __x86_64__
+static __inline unsigned char __attribute__((__always_inline__, __nodebug__, __target__("adx")))
+_addcarryx_u64(unsigned char __cf, unsigned long long __x,
+               unsigned long long __y, unsigned long long  *__p)
+{
+  return __builtin_ia32_addcarryx_u64(__cf, __x, __y, __p);
+}
+#endif
+
+/* Intrinsics that are also available if __ADX__ is undefined */
+static __inline unsigned char __DEFAULT_FN_ATTRS
+_addcarry_u32(unsigned char __cf, unsigned int __x, unsigned int __y,
+              unsigned int *__p)
+{
+  return __builtin_ia32_addcarry_u32(__cf, __x, __y, __p);
+}
+
+#ifdef __x86_64__
+static __inline unsigned char __DEFAULT_FN_ATTRS
+_addcarry_u64(unsigned char __cf, unsigned long long __x,
+              unsigned long long __y, unsigned long long  *__p)
+{
+  return __builtin_ia32_addcarry_u64(__cf, __x, __y, __p);
+}
+#endif
+
+static __inline unsigned char __DEFAULT_FN_ATTRS
+_subborrow_u32(unsigned char __cf, unsigned int __x, unsigned int __y,
+              unsigned int *__p)
+{
+  return __builtin_ia32_subborrow_u32(__cf, __x, __y, __p);
+}
+
+#ifdef __x86_64__
+static __inline unsigned char __DEFAULT_FN_ATTRS
+_subborrow_u64(unsigned char __cf, unsigned long long __x,
+               unsigned long long __y, unsigned long long  *__p)
+{
+  return __builtin_ia32_subborrow_u64(__cf, __x, __y, __p);
+}
+#endif
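+
+/* Usage sketch (illustrative, a_lo/a_hi/b_lo/b_hi are hypothetical 64-bit
+ * limbs): chaining the carry flag through successive limbs gives a
+ * multi-precision addition, e.g. a 128-bit add:
+ *
+ *   unsigned long long lo, hi;
+ *   unsigned char c = _addcarry_u64(0, a_lo, b_lo, &lo);
+ *   _addcarry_u64(c, a_hi, b_hi, &hi);
+ */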
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __ADXINTRIN_H */
diff --git a/25.0.2/clang-include/altivec.h b/25.0.2/clang-include/altivec.h
new file mode 100644
index 0000000..c39156e
--- /dev/null
+++ b/25.0.2/clang-include/altivec.h
@@ -0,0 +1,13919 @@
+/*===---- altivec.h - Standard header for AltiVec intrinsics --------------===*\
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+\*===----------------------------------------------------------------------===*/
+
+#ifndef __ALTIVEC_H
+#define __ALTIVEC_H
+
+#ifndef __ALTIVEC__
+#error "AltiVec support not enabled"
+#endif
+
+/* constants for mapping CR6 bits to predicate result. */
+
+#define __CR6_EQ 0
+#define __CR6_EQ_REV 1
+#define __CR6_LT 2
+#define __CR6_LT_REV 3
+
+#define __ATTRS_o_ai __attribute__((__overloadable__, __always_inline__))
+
+static vector signed char __ATTRS_o_ai vec_perm(vector signed char __a,
+                                                vector signed char __b,
+                                                vector unsigned char __c);
+
+static vector unsigned char __ATTRS_o_ai vec_perm(vector unsigned char __a,
+                                                  vector unsigned char __b,
+                                                  vector unsigned char __c);
+
+static vector bool char __ATTRS_o_ai vec_perm(vector bool char __a,
+                                              vector bool char __b,
+                                              vector unsigned char __c);
+
+static vector short __ATTRS_o_ai vec_perm(vector signed short __a,
+                                          vector signed short __b,
+                                          vector unsigned char __c);
+
+static vector unsigned short __ATTRS_o_ai vec_perm(vector unsigned short __a,
+                                                   vector unsigned short __b,
+                                                   vector unsigned char __c);
+
+static vector bool short __ATTRS_o_ai vec_perm(vector bool short __a,
+                                               vector bool short __b,
+                                               vector unsigned char __c);
+
+static vector pixel __ATTRS_o_ai vec_perm(vector pixel __a, vector pixel __b,
+                                          vector unsigned char __c);
+
+static vector int __ATTRS_o_ai vec_perm(vector signed int __a,
+                                        vector signed int __b,
+                                        vector unsigned char __c);
+
+static vector unsigned int __ATTRS_o_ai vec_perm(vector unsigned int __a,
+                                                 vector unsigned int __b,
+                                                 vector unsigned char __c);
+
+static vector bool int __ATTRS_o_ai vec_perm(vector bool int __a,
+                                             vector bool int __b,
+                                             vector unsigned char __c);
+
+static vector float __ATTRS_o_ai vec_perm(vector float __a, vector float __b,
+                                          vector unsigned char __c);
+
+#ifdef __VSX__
+static vector long long __ATTRS_o_ai vec_perm(vector signed long long __a,
+                                              vector signed long long __b,
+                                              vector unsigned char __c);
+
+static vector unsigned long long __ATTRS_o_ai
+vec_perm(vector unsigned long long __a, vector unsigned long long __b,
+         vector unsigned char __c);
+
+static vector bool long long __ATTRS_o_ai
+vec_perm(vector bool long long __a, vector bool long long __b,
+         vector unsigned char __c);
+
+static vector double __ATTRS_o_ai vec_perm(vector double __a, vector double __b,
+                                           vector unsigned char __c);
+#endif
+
+static vector unsigned char __ATTRS_o_ai vec_xor(vector unsigned char __a,
+                                                 vector unsigned char __b);
+
+/* vec_abs */
+
+#define __builtin_altivec_abs_v16qi vec_abs
+#define __builtin_altivec_abs_v8hi vec_abs
+#define __builtin_altivec_abs_v4si vec_abs
+
+static vector signed char __ATTRS_o_ai vec_abs(vector signed char __a) {
+  return __builtin_altivec_vmaxsb(__a, -__a);
+}
+
+static vector signed short __ATTRS_o_ai vec_abs(vector signed short __a) {
+  return __builtin_altivec_vmaxsh(__a, -__a);
+}
+
+static vector signed int __ATTRS_o_ai vec_abs(vector signed int __a) {
+  return __builtin_altivec_vmaxsw(__a, -__a);
+}
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static vector signed long long __ATTRS_o_ai
+vec_abs(vector signed long long __a) {
+  return __builtin_altivec_vmaxsd(__a, -__a);
+}
+#endif
+
+static vector float __ATTRS_o_ai vec_abs(vector float __a) {
+  vector unsigned int __res =
+      (vector unsigned int)__a & (vector unsigned int)(0x7FFFFFFF);
+  return (vector float)__res;
+}
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static vector double __ATTRS_o_ai vec_abs(vector double __a) {
+  vector unsigned long long __res = { 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF };
+  __res &= (vector unsigned long long)__a;
+  return (vector double)__res;
+}
+#endif
+
+/* vec_abss */
+#define __builtin_altivec_abss_v16qi vec_abss
+#define __builtin_altivec_abss_v8hi vec_abss
+#define __builtin_altivec_abss_v4si vec_abss
+
+static vector signed char __ATTRS_o_ai vec_abss(vector signed char __a) {
+  return __builtin_altivec_vmaxsb(
+      __a, __builtin_altivec_vsubsbs((vector signed char)(0), __a));
+}
+
+static vector signed short __ATTRS_o_ai vec_abss(vector signed short __a) {
+  return __builtin_altivec_vmaxsh(
+      __a, __builtin_altivec_vsubshs((vector signed short)(0), __a));
+}
+
+static vector signed int __ATTRS_o_ai vec_abss(vector signed int __a) {
+  return __builtin_altivec_vmaxsw(
+      __a, __builtin_altivec_vsubsws((vector signed int)(0), __a));
+}
+
+/* vec_add */
+
+static vector signed char __ATTRS_o_ai vec_add(vector signed char __a,
+                                               vector signed char __b) {
+  return __a + __b;
+}
+
+static vector signed char __ATTRS_o_ai vec_add(vector bool char __a,
+                                               vector signed char __b) {
+  return (vector signed char)__a + __b;
+}
+
+static vector signed char __ATTRS_o_ai vec_add(vector signed char __a,
+                                               vector bool char __b) {
+  return __a + (vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_add(vector unsigned char __a,
+                                                 vector unsigned char __b) {
+  return __a + __b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_add(vector bool char __a,
+                                                 vector unsigned char __b) {
+  return (vector unsigned char)__a + __b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_add(vector unsigned char __a,
+                                                 vector bool char __b) {
+  return __a + (vector unsigned char)__b;
+}
+
+static vector short __ATTRS_o_ai vec_add(vector short __a, vector short __b) {
+  return __a + __b;
+}
+
+static vector short __ATTRS_o_ai vec_add(vector bool short __a,
+                                         vector short __b) {
+  return (vector short)__a + __b;
+}
+
+static vector short __ATTRS_o_ai vec_add(vector short __a,
+                                         vector bool short __b) {
+  return __a + (vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_add(vector unsigned short __a,
+                                                  vector unsigned short __b) {
+  return __a + __b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_add(vector bool short __a,
+                                                  vector unsigned short __b) {
+  return (vector unsigned short)__a + __b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_add(vector unsigned short __a,
+                                                  vector bool short __b) {
+  return __a + (vector unsigned short)__b;
+}
+
+static vector int __ATTRS_o_ai vec_add(vector int __a, vector int __b) {
+  return __a + __b;
+}
+
+static vector int __ATTRS_o_ai vec_add(vector bool int __a, vector int __b) {
+  return (vector int)__a + __b;
+}
+
+static vector int __ATTRS_o_ai vec_add(vector int __a, vector bool int __b) {
+  return __a + (vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_add(vector unsigned int __a,
+                                                vector unsigned int __b) {
+  return __a + __b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_add(vector bool int __a,
+                                                vector unsigned int __b) {
+  return (vector unsigned int)__a + __b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_add(vector unsigned int __a,
+                                                vector bool int __b) {
+  return __a + (vector unsigned int)__b;
+}
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static vector signed long long __ATTRS_o_ai
+vec_add(vector signed long long __a, vector signed long long __b) {
+  return __a + __b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_add(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a + __b;
+}
+
+static vector signed __int128 __ATTRS_o_ai vec_add(vector signed __int128 __a,
+                                                   vector signed __int128 __b) {
+  return __a + __b;
+}
+
+static vector unsigned __int128 __ATTRS_o_ai
+vec_add(vector unsigned __int128 __a, vector unsigned __int128 __b) {
+  return __a + __b;
+}
+#endif // defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+
+static vector float __ATTRS_o_ai vec_add(vector float __a, vector float __b) {
+  return __a + __b;
+}
+
+#ifdef __VSX__
+static vector double __ATTRS_o_ai
+vec_add(vector double __a, vector double __b) {
+  return __a + __b;
+}
+#endif // __VSX__
+
+/* vec_adde */
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static vector signed __int128 __ATTRS_o_ai
+vec_adde(vector signed __int128 __a, vector signed __int128 __b,
+         vector signed __int128 __c) {
+  return __builtin_altivec_vaddeuqm(__a, __b, __c);
+}
+
+static vector unsigned __int128 __ATTRS_o_ai
+vec_adde(vector unsigned __int128 __a, vector unsigned __int128 __b,
+         vector unsigned __int128 __c) {
+  return __builtin_altivec_vaddeuqm(__a, __b, __c);
+}
+#endif
+
+/* vec_addec */
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static vector signed __int128 __ATTRS_o_ai
+vec_addec(vector signed __int128 __a, vector signed __int128 __b,
+          vector signed __int128 __c) {
+  return __builtin_altivec_vaddecuq(__a, __b, __c);
+}
+
+static vector unsigned __int128 __ATTRS_o_ai
+vec_addec(vector unsigned __int128 __a, vector unsigned __int128 __b,
+          vector unsigned __int128 __c) {
+  return __builtin_altivec_vaddecuq(__a, __b, __c);
+}
+#endif
+
+/* vec_vaddubm */
+
+#define __builtin_altivec_vaddubm vec_vaddubm
+
+static vector signed char __ATTRS_o_ai vec_vaddubm(vector signed char __a,
+                                                   vector signed char __b) {
+  return __a + __b;
+}
+
+static vector signed char __ATTRS_o_ai vec_vaddubm(vector bool char __a,
+                                                   vector signed char __b) {
+  return (vector signed char)__a + __b;
+}
+
+static vector signed char __ATTRS_o_ai vec_vaddubm(vector signed char __a,
+                                                   vector bool char __b) {
+  return __a + (vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vaddubm(vector unsigned char __a,
+                                                     vector unsigned char __b) {
+  return __a + __b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vaddubm(vector bool char __a,
+                                                     vector unsigned char __b) {
+  return (vector unsigned char)__a + __b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vaddubm(vector unsigned char __a,
+                                                     vector bool char __b) {
+  return __a + (vector unsigned char)__b;
+}
+
+/* vec_vadduhm */
+
+#define __builtin_altivec_vadduhm vec_vadduhm
+
+static vector short __ATTRS_o_ai vec_vadduhm(vector short __a,
+                                             vector short __b) {
+  return __a + __b;
+}
+
+static vector short __ATTRS_o_ai vec_vadduhm(vector bool short __a,
+                                             vector short __b) {
+  return (vector short)__a + __b;
+}
+
+static vector short __ATTRS_o_ai vec_vadduhm(vector short __a,
+                                             vector bool short __b) {
+  return __a + (vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vadduhm(vector unsigned short __a, vector unsigned short __b) {
+  return __a + __b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vadduhm(vector bool short __a, vector unsigned short __b) {
+  return (vector unsigned short)__a + __b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vadduhm(vector unsigned short __a,
+                                                      vector bool short __b) {
+  return __a + (vector unsigned short)__b;
+}
+
+/* vec_vadduwm */
+
+#define __builtin_altivec_vadduwm vec_vadduwm
+
+static vector int __ATTRS_o_ai vec_vadduwm(vector int __a, vector int __b) {
+  return __a + __b;
+}
+
+static vector int __ATTRS_o_ai vec_vadduwm(vector bool int __a,
+                                           vector int __b) {
+  return (vector int)__a + __b;
+}
+
+static vector int __ATTRS_o_ai vec_vadduwm(vector int __a,
+                                           vector bool int __b) {
+  return __a + (vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vadduwm(vector unsigned int __a,
+                                                    vector unsigned int __b) {
+  return __a + __b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vadduwm(vector bool int __a,
+                                                    vector unsigned int __b) {
+  return (vector unsigned int)__a + __b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vadduwm(vector unsigned int __a,
+                                                    vector bool int __b) {
+  return __a + (vector unsigned int)__b;
+}
+
+/* vec_vaddfp */
+
+#define __builtin_altivec_vaddfp vec_vaddfp
+
+static vector float __attribute__((__always_inline__))
+vec_vaddfp(vector float __a, vector float __b) {
+  return __a + __b;
+}
+
+/* vec_addc */
+
+static vector signed int __ATTRS_o_ai vec_addc(vector signed int __a,
+                                               vector signed int __b) {
+  return (vector signed int)__builtin_altivec_vaddcuw((vector unsigned int)__a,
+                                                      (vector unsigned int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_addc(vector unsigned int __a,
+                                                 vector unsigned int __b) {
+  return __builtin_altivec_vaddcuw(__a, __b);
+}
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static vector signed __int128 __ATTRS_o_ai
+vec_addc(vector signed __int128 __a, vector signed __int128 __b) {
+  return (vector signed __int128)__builtin_altivec_vaddcuq(
+    (vector unsigned __int128)__a,
+    (vector unsigned __int128)__b);
+}
+
+static vector unsigned __int128 __ATTRS_o_ai
+vec_addc(vector unsigned __int128 __a, vector unsigned __int128 __b) {
+  return __builtin_altivec_vaddcuq(__a, __b);
+}
+#endif // defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+
+/* vec_vaddcuw */
+
+static vector unsigned int __attribute__((__always_inline__))
+vec_vaddcuw(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_altivec_vaddcuw(__a, __b);
+}
+
+/* vec_adds */
+
+static vector signed char __ATTRS_o_ai vec_adds(vector signed char __a,
+                                                vector signed char __b) {
+  return __builtin_altivec_vaddsbs(__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai vec_adds(vector bool char __a,
+                                                vector signed char __b) {
+  return __builtin_altivec_vaddsbs((vector signed char)__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai vec_adds(vector signed char __a,
+                                                vector bool char __b) {
+  return __builtin_altivec_vaddsbs(__a, (vector signed char)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_adds(vector unsigned char __a,
+                                                  vector unsigned char __b) {
+  return __builtin_altivec_vaddubs(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_adds(vector bool char __a,
+                                                  vector unsigned char __b) {
+  return __builtin_altivec_vaddubs((vector unsigned char)__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_adds(vector unsigned char __a,
+                                                  vector bool char __b) {
+  return __builtin_altivec_vaddubs(__a, (vector unsigned char)__b);
+}
+
+static vector short __ATTRS_o_ai vec_adds(vector short __a, vector short __b) {
+  return __builtin_altivec_vaddshs(__a, __b);
+}
+
+static vector short __ATTRS_o_ai vec_adds(vector bool short __a,
+                                          vector short __b) {
+  return __builtin_altivec_vaddshs((vector short)__a, __b);
+}
+
+static vector short __ATTRS_o_ai vec_adds(vector short __a,
+                                          vector bool short __b) {
+  return __builtin_altivec_vaddshs(__a, (vector short)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_adds(vector unsigned short __a,
+                                                   vector unsigned short __b) {
+  return __builtin_altivec_vadduhs(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_adds(vector bool short __a,
+                                                   vector unsigned short __b) {
+  return __builtin_altivec_vadduhs((vector unsigned short)__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_adds(vector unsigned short __a,
+                                                   vector bool short __b) {
+  return __builtin_altivec_vadduhs(__a, (vector unsigned short)__b);
+}
+
+static vector int __ATTRS_o_ai vec_adds(vector int __a, vector int __b) {
+  return __builtin_altivec_vaddsws(__a, __b);
+}
+
+static vector int __ATTRS_o_ai vec_adds(vector bool int __a, vector int __b) {
+  return __builtin_altivec_vaddsws((vector int)__a, __b);
+}
+
+static vector int __ATTRS_o_ai vec_adds(vector int __a, vector bool int __b) {
+  return __builtin_altivec_vaddsws(__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_adds(vector unsigned int __a,
+                                                 vector unsigned int __b) {
+  return __builtin_altivec_vadduws(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_adds(vector bool int __a,
+                                                 vector unsigned int __b) {
+  return __builtin_altivec_vadduws((vector unsigned int)__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_adds(vector unsigned int __a,
+                                                 vector bool int __b) {
+  return __builtin_altivec_vadduws(__a, (vector unsigned int)__b);
+}
+
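+/* Usage sketch (illustrative): unlike vec_add, vec_adds saturates instead of
+ * wrapping, e.g. for signed chars 0x7F + 1 stays at 0x7F:
+ *
+ *   vector signed char x = (vector signed char)(0x7F);
+ *   vector signed char one = (vector signed char)(1);
+ *   vector signed char r = vec_adds(x, one);   // every element is 0x7F
+ */
+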
+/* vec_vaddsbs */
+
+static vector signed char __ATTRS_o_ai vec_vaddsbs(vector signed char __a,
+                                                   vector signed char __b) {
+  return __builtin_altivec_vaddsbs(__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai vec_vaddsbs(vector bool char __a,
+                                                   vector signed char __b) {
+  return __builtin_altivec_vaddsbs((vector signed char)__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai vec_vaddsbs(vector signed char __a,
+                                                   vector bool char __b) {
+  return __builtin_altivec_vaddsbs(__a, (vector signed char)__b);
+}
+
+/* vec_vaddubs */
+
+static vector unsigned char __ATTRS_o_ai vec_vaddubs(vector unsigned char __a,
+                                                     vector unsigned char __b) {
+  return __builtin_altivec_vaddubs(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vaddubs(vector bool char __a,
+                                                     vector unsigned char __b) {
+  return __builtin_altivec_vaddubs((vector unsigned char)__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vaddubs(vector unsigned char __a,
+                                                     vector bool char __b) {
+  return __builtin_altivec_vaddubs(__a, (vector unsigned char)__b);
+}
+
+/* vec_vaddshs */
+
+static vector short __ATTRS_o_ai vec_vaddshs(vector short __a,
+                                             vector short __b) {
+  return __builtin_altivec_vaddshs(__a, __b);
+}
+
+static vector short __ATTRS_o_ai vec_vaddshs(vector bool short __a,
+                                             vector short __b) {
+  return __builtin_altivec_vaddshs((vector short)__a, __b);
+}
+
+static vector short __ATTRS_o_ai vec_vaddshs(vector short __a,
+                                             vector bool short __b) {
+  return __builtin_altivec_vaddshs(__a, (vector short)__b);
+}
+
+/* vec_vadduhs */
+
+static vector unsigned short __ATTRS_o_ai
+vec_vadduhs(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_altivec_vadduhs(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vadduhs(vector bool short __a, vector unsigned short __b) {
+  return __builtin_altivec_vadduhs((vector unsigned short)__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vadduhs(vector unsigned short __a,
+                                                      vector bool short __b) {
+  return __builtin_altivec_vadduhs(__a, (vector unsigned short)__b);
+}
+
+/* vec_vaddsws */
+
+static vector int __ATTRS_o_ai vec_vaddsws(vector int __a, vector int __b) {
+  return __builtin_altivec_vaddsws(__a, __b);
+}
+
+static vector int __ATTRS_o_ai vec_vaddsws(vector bool int __a,
+                                           vector int __b) {
+  return __builtin_altivec_vaddsws((vector int)__a, __b);
+}
+
+static vector int __ATTRS_o_ai vec_vaddsws(vector int __a,
+                                           vector bool int __b) {
+  return __builtin_altivec_vaddsws(__a, (vector int)__b);
+}
+
+/* vec_vadduws */
+
+static vector unsigned int __ATTRS_o_ai vec_vadduws(vector unsigned int __a,
+                                                    vector unsigned int __b) {
+  return __builtin_altivec_vadduws(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vadduws(vector bool int __a,
+                                                    vector unsigned int __b) {
+  return __builtin_altivec_vadduws((vector unsigned int)__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vadduws(vector unsigned int __a,
+                                                    vector bool int __b) {
+  return __builtin_altivec_vadduws(__a, (vector unsigned int)__b);
+}
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+/* vec_vadduqm */
+
+static vector signed __int128 __ATTRS_o_ai
+vec_vadduqm(vector signed __int128 __a, vector signed __int128 __b) {
+  return __a + __b;
+}
+
+static vector unsigned __int128 __ATTRS_o_ai
+vec_vadduqm(vector unsigned __int128 __a, vector unsigned __int128 __b) {
+  return __a + __b;
+}
+
+/* vec_vaddeuqm */
+
+static vector signed __int128 __ATTRS_o_ai
+vec_vaddeuqm(vector signed __int128 __a, vector signed __int128 __b,
+             vector signed __int128 __c) {
+  return __builtin_altivec_vaddeuqm(__a, __b, __c);
+}
+
+static vector unsigned __int128 __ATTRS_o_ai
+vec_vaddeuqm(vector unsigned __int128 __a, vector unsigned __int128 __b,
+             vector unsigned __int128 __c) {
+  return __builtin_altivec_vaddeuqm(__a, __b, __c);
+}
+
+/* vec_vaddcuq */
+
+static vector signed __int128 __ATTRS_o_ai
+vec_vaddcuq(vector signed __int128 __a, vector signed __int128 __b) {
+  return __builtin_altivec_vaddcuq(__a, __b);
+}
+
+static vector unsigned __int128 __ATTRS_o_ai
+vec_vaddcuq(vector unsigned __int128 __a, vector unsigned __int128 __b) {
+  return __builtin_altivec_vaddcuq(__a, __b);
+}
+
+/* vec_vaddecuq */
+
+static vector signed __int128 __ATTRS_o_ai
+vec_vaddecuq(vector signed __int128 __a, vector signed __int128 __b,
+             vector signed __int128 __c) {
+  return __builtin_altivec_vaddecuq(__a, __b, __c);
+}
+
+static vector unsigned __int128 __ATTRS_o_ai
+vec_vaddecuq(vector unsigned __int128 __a, vector unsigned __int128 __b,
+             vector unsigned __int128 __c) {
+  return __builtin_altivec_vaddecuq(__a, __b, __c);
+}
+#endif // defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+
+/* vec_and */
+
+#define __builtin_altivec_vand vec_and
+
+static vector signed char __ATTRS_o_ai vec_and(vector signed char __a,
+                                               vector signed char __b) {
+  return __a & __b;
+}
+
+static vector signed char __ATTRS_o_ai vec_and(vector bool char __a,
+                                               vector signed char __b) {
+  return (vector signed char)__a & __b;
+}
+
+static vector signed char __ATTRS_o_ai vec_and(vector signed char __a,
+                                               vector bool char __b) {
+  return __a & (vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_and(vector unsigned char __a,
+                                                 vector unsigned char __b) {
+  return __a & __b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_and(vector bool char __a,
+                                                 vector unsigned char __b) {
+  return (vector unsigned char)__a & __b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_and(vector unsigned char __a,
+                                                 vector bool char __b) {
+  return __a & (vector unsigned char)__b;
+}
+
+static vector bool char __ATTRS_o_ai vec_and(vector bool char __a,
+                                             vector bool char __b) {
+  return __a & __b;
+}
+
+static vector short __ATTRS_o_ai vec_and(vector short __a, vector short __b) {
+  return __a & __b;
+}
+
+static vector short __ATTRS_o_ai vec_and(vector bool short __a,
+                                         vector short __b) {
+  return (vector short)__a & __b;
+}
+
+static vector short __ATTRS_o_ai vec_and(vector short __a,
+                                         vector bool short __b) {
+  return __a & (vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_and(vector unsigned short __a,
+                                                  vector unsigned short __b) {
+  return __a & __b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_and(vector bool short __a,
+                                                  vector unsigned short __b) {
+  return (vector unsigned short)__a & __b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_and(vector unsigned short __a,
+                                                  vector bool short __b) {
+  return __a & (vector unsigned short)__b;
+}
+
+static vector bool short __ATTRS_o_ai vec_and(vector bool short __a,
+                                              vector bool short __b) {
+  return __a & __b;
+}
+
+static vector int __ATTRS_o_ai vec_and(vector int __a, vector int __b) {
+  return __a & __b;
+}
+
+static vector int __ATTRS_o_ai vec_and(vector bool int __a, vector int __b) {
+  return (vector int)__a & __b;
+}
+
+static vector int __ATTRS_o_ai vec_and(vector int __a, vector bool int __b) {
+  return __a & (vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_and(vector unsigned int __a,
+                                                vector unsigned int __b) {
+  return __a & __b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_and(vector bool int __a,
+                                                vector unsigned int __b) {
+  return (vector unsigned int)__a & __b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_and(vector unsigned int __a,
+                                                vector bool int __b) {
+  return __a & (vector unsigned int)__b;
+}
+
+static vector bool int __ATTRS_o_ai vec_and(vector bool int __a,
+                                            vector bool int __b) {
+  return __a & __b;
+}
+
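+/* The float overloads below perform a bitwise AND: & is not defined for float
+   vectors, so each operand is reinterpreted as vector unsigned int, ANDed, and
+   the result cast back to vector float. */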
+static vector float __ATTRS_o_ai vec_and(vector float __a, vector float __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a & (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai vec_and(vector bool int __a,
+                                         vector float __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a & (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai vec_and(vector float __a,
+                                         vector bool int __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a & (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+#ifdef __VSX__
+static vector double __ATTRS_o_ai vec_and(vector bool long long __a,
+                                           vector double __b) {
+  vector unsigned long long __res =
+      (vector unsigned long long)__a & (vector unsigned long long)__b;
+  return (vector double)__res;
+}
+
+static vector double __ATTRS_o_ai vec_and(vector double __a,
+                                           vector bool long long __b) {
+  vector unsigned long long __res =
+      (vector unsigned long long)__a & (vector unsigned long long)__b;
+  return (vector double)__res;
+}
+
+static vector double __ATTRS_o_ai vec_and(vector double __a,
+                                           vector double __b) {
+  vector unsigned long long __res =
+      (vector unsigned long long)__a & (vector unsigned long long)__b;
+  return (vector double)__res;
+}
+
+static vector signed long long __ATTRS_o_ai
+vec_and(vector signed long long __a, vector signed long long __b) {
+  return __a & __b;
+}
+
+static vector signed long long __ATTRS_o_ai
+vec_and(vector bool long long __a, vector signed long long __b) {
+  return (vector signed long long)__a & __b;
+}
+
+static vector signed long long __ATTRS_o_ai vec_and(vector signed long long __a,
+                                                    vector bool long long __b) {
+  return __a & (vector signed long long)__b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_and(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a & __b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_and(vector bool long long __a, vector unsigned long long __b) {
+  return (vector unsigned long long)__a & __b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_and(vector unsigned long long __a, vector bool long long __b) {
+  return __a & (vector unsigned long long)__b;
+}
+
+static vector bool long long __ATTRS_o_ai vec_and(vector bool long long __a,
+                                                  vector bool long long __b) {
+  return __a & __b;
+}
+#endif
+
+/* vec_vand */
+
+static vector signed char __ATTRS_o_ai vec_vand(vector signed char __a,
+                                                vector signed char __b) {
+  return __a & __b;
+}
+
+static vector signed char __ATTRS_o_ai vec_vand(vector bool char __a,
+                                                vector signed char __b) {
+  return (vector signed char)__a & __b;
+}
+
+static vector signed char __ATTRS_o_ai vec_vand(vector signed char __a,
+                                                vector bool char __b) {
+  return __a & (vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vand(vector unsigned char __a,
+                                                  vector unsigned char __b) {
+  return __a & __b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vand(vector bool char __a,
+                                                  vector unsigned char __b) {
+  return (vector unsigned char)__a & __b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vand(vector unsigned char __a,
+                                                  vector bool char __b) {
+  return __a & (vector unsigned char)__b;
+}
+
+static vector bool char __ATTRS_o_ai vec_vand(vector bool char __a,
+                                              vector bool char __b) {
+  return __a & __b;
+}
+
+static vector short __ATTRS_o_ai vec_vand(vector short __a, vector short __b) {
+  return __a & __b;
+}
+
+static vector short __ATTRS_o_ai vec_vand(vector bool short __a,
+                                          vector short __b) {
+  return (vector short)__a & __b;
+}
+
+static vector short __ATTRS_o_ai vec_vand(vector short __a,
+                                          vector bool short __b) {
+  return __a & (vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vand(vector unsigned short __a,
+                                                   vector unsigned short __b) {
+  return __a & __b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vand(vector bool short __a,
+                                                   vector unsigned short __b) {
+  return (vector unsigned short)__a & __b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vand(vector unsigned short __a,
+                                                   vector bool short __b) {
+  return __a & (vector unsigned short)__b;
+}
+
+static vector bool short __ATTRS_o_ai vec_vand(vector bool short __a,
+                                               vector bool short __b) {
+  return __a & __b;
+}
+
+static vector int __ATTRS_o_ai vec_vand(vector int __a, vector int __b) {
+  return __a & __b;
+}
+
+static vector int __ATTRS_o_ai vec_vand(vector bool int __a, vector int __b) {
+  return (vector int)__a & __b;
+}
+
+static vector int __ATTRS_o_ai vec_vand(vector int __a, vector bool int __b) {
+  return __a & (vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vand(vector unsigned int __a,
+                                                 vector unsigned int __b) {
+  return __a & __b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vand(vector bool int __a,
+                                                 vector unsigned int __b) {
+  return (vector unsigned int)__a & __b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vand(vector unsigned int __a,
+                                                 vector bool int __b) {
+  return __a & (vector unsigned int)__b;
+}
+
+static vector bool int __ATTRS_o_ai vec_vand(vector bool int __a,
+                                             vector bool int __b) {
+  return __a & __b;
+}
+
+static vector float __ATTRS_o_ai vec_vand(vector float __a, vector float __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a & (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai vec_vand(vector bool int __a,
+                                          vector float __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a & (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai vec_vand(vector float __a,
+                                          vector bool int __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a & (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+#ifdef __VSX__
+static vector signed long long __ATTRS_o_ai
+vec_vand(vector signed long long __a, vector signed long long __b) {
+  return __a & __b;
+}
+
+static vector signed long long __ATTRS_o_ai
+vec_vand(vector bool long long __a, vector signed long long __b) {
+  return (vector signed long long)__a & __b;
+}
+
+static vector signed long long __ATTRS_o_ai
+vec_vand(vector signed long long __a, vector bool long long __b) {
+  return __a & (vector signed long long)__b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_vand(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a & __b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_vand(vector bool long long __a, vector unsigned long long __b) {
+  return (vector unsigned long long)__a & __b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_vand(vector unsigned long long __a, vector bool long long __b) {
+  return __a & (vector unsigned long long)__b;
+}
+
+static vector bool long long __ATTRS_o_ai vec_vand(vector bool long long __a,
+                                                   vector bool long long __b) {
+  return __a & __b;
+}
+#endif
+
+/* vec_andc */
+
+#define __builtin_altivec_vandc vec_andc
+
+static vector signed char __ATTRS_o_ai vec_andc(vector signed char __a,
+                                                vector signed char __b) {
+  return __a & ~__b;
+}
+
+static vector signed char __ATTRS_o_ai vec_andc(vector bool char __a,
+                                                vector signed char __b) {
+  return (vector signed char)__a & ~__b;
+}
+
+static vector signed char __ATTRS_o_ai vec_andc(vector signed char __a,
+                                                vector bool char __b) {
+  return __a & ~(vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_andc(vector unsigned char __a,
+                                                  vector unsigned char __b) {
+  return __a & ~__b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_andc(vector bool char __a,
+                                                  vector unsigned char __b) {
+  return (vector unsigned char)__a & ~__b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_andc(vector unsigned char __a,
+                                                  vector bool char __b) {
+  return __a & ~(vector unsigned char)__b;
+}
+
+static vector bool char __ATTRS_o_ai vec_andc(vector bool char __a,
+                                              vector bool char __b) {
+  return __a & ~__b;
+}
+
+static vector short __ATTRS_o_ai vec_andc(vector short __a, vector short __b) {
+  return __a & ~__b;
+}
+
+static vector short __ATTRS_o_ai vec_andc(vector bool short __a,
+                                          vector short __b) {
+  return (vector short)__a & ~__b;
+}
+
+static vector short __ATTRS_o_ai vec_andc(vector short __a,
+                                          vector bool short __b) {
+  return __a & ~(vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_andc(vector unsigned short __a,
+                                                   vector unsigned short __b) {
+  return __a & ~__b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_andc(vector bool short __a,
+                                                   vector unsigned short __b) {
+  return (vector unsigned short)__a & ~__b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_andc(vector unsigned short __a,
+                                                   vector bool short __b) {
+  return __a & ~(vector unsigned short)__b;
+}
+
+static vector bool short __ATTRS_o_ai vec_andc(vector bool short __a,
+                                               vector bool short __b) {
+  return __a & ~__b;
+}
+
+static vector int __ATTRS_o_ai vec_andc(vector int __a, vector int __b) {
+  return __a & ~__b;
+}
+
+static vector int __ATTRS_o_ai vec_andc(vector bool int __a, vector int __b) {
+  return (vector int)__a & ~__b;
+}
+
+static vector int __ATTRS_o_ai vec_andc(vector int __a, vector bool int __b) {
+  return __a & ~(vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_andc(vector unsigned int __a,
+                                                 vector unsigned int __b) {
+  return __a & ~__b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_andc(vector bool int __a,
+                                                 vector unsigned int __b) {
+  return (vector unsigned int)__a & ~__b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_andc(vector unsigned int __a,
+                                                 vector bool int __b) {
+  return __a & ~(vector unsigned int)__b;
+}
+
+static vector bool int __ATTRS_o_ai vec_andc(vector bool int __a,
+                                             vector bool int __b) {
+  return __a & ~__b;
+}
+
+static vector float __ATTRS_o_ai vec_andc(vector float __a, vector float __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a & ~(vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai vec_andc(vector bool int __a,
+                                          vector float __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a & ~(vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai vec_andc(vector float __a,
+                                          vector bool int __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a & ~(vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+#ifdef __VSX__
+static vector double __ATTRS_o_ai
+vec_andc(vector bool long long __a, vector double __b) {
+  vector unsigned long long __res =
+      (vector unsigned long long)__a & ~(vector unsigned long long)__b;
+  return (vector double)__res;
+}
+
+static vector double __ATTRS_o_ai
+vec_andc(vector double __a, vector bool long long __b) {
+  vector unsigned long long __res =
+      (vector unsigned long long)__a & ~(vector unsigned long long)__b;
+  return (vector double)__res;
+}
+
+static vector double __ATTRS_o_ai vec_andc(vector double __a,
+                                            vector double __b) {
+  vector unsigned long long __res =
+      (vector unsigned long long)__a & ~(vector unsigned long long)__b;
+  return (vector double)__res;
+}
+
+static vector signed long long __ATTRS_o_ai
+vec_andc(vector signed long long __a, vector signed long long __b) {
+  return __a & ~__b;
+}
+
+static vector signed long long __ATTRS_o_ai
+vec_andc(vector bool long long __a, vector signed long long __b) {
+  return (vector signed long long)__a & ~__b;
+}
+
+static vector signed long long __ATTRS_o_ai
+vec_andc(vector signed long long __a, vector bool long long __b) {
+  return __a & ~(vector signed long long)__b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_andc(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a & ~__b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_andc(vector bool long long __a, vector unsigned long long __b) {
+  return (vector unsigned long long)__a & ~__b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_andc(vector unsigned long long __a, vector bool long long __b) {
+  return __a & ~(vector unsigned long long)__b;
+}
+
+static vector bool long long __ATTRS_o_ai vec_andc(vector bool long long __a,
+                                                   vector bool long long __b) {
+  return __a & ~__b;
+}
+#endif
+
+/* vec_vandc */
+
+static vector signed char __ATTRS_o_ai vec_vandc(vector signed char __a,
+                                                 vector signed char __b) {
+  return __a & ~__b;
+}
+
+static vector signed char __ATTRS_o_ai vec_vandc(vector bool char __a,
+                                                 vector signed char __b) {
+  return (vector signed char)__a & ~__b;
+}
+
+static vector signed char __ATTRS_o_ai vec_vandc(vector signed char __a,
+                                                 vector bool char __b) {
+  return __a & ~(vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vandc(vector unsigned char __a,
+                                                   vector unsigned char __b) {
+  return __a & ~__b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vandc(vector bool char __a,
+                                                   vector unsigned char __b) {
+  return (vector unsigned char)__a & ~__b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vandc(vector unsigned char __a,
+                                                   vector bool char __b) {
+  return __a & ~(vector unsigned char)__b;
+}
+
+static vector bool char __ATTRS_o_ai vec_vandc(vector bool char __a,
+                                               vector bool char __b) {
+  return __a & ~__b;
+}
+
+static vector short __ATTRS_o_ai vec_vandc(vector short __a, vector short __b) {
+  return __a & ~__b;
+}
+
+static vector short __ATTRS_o_ai vec_vandc(vector bool short __a,
+                                           vector short __b) {
+  return (vector short)__a & ~__b;
+}
+
+static vector short __ATTRS_o_ai vec_vandc(vector short __a,
+                                           vector bool short __b) {
+  return __a & ~(vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vandc(vector unsigned short __a,
+                                                    vector unsigned short __b) {
+  return __a & ~__b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vandc(vector bool short __a,
+                                                    vector unsigned short __b) {
+  return (vector unsigned short)__a & ~__b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vandc(vector unsigned short __a,
+                                                    vector bool short __b) {
+  return __a & ~(vector unsigned short)__b;
+}
+
+static vector bool short __ATTRS_o_ai vec_vandc(vector bool short __a,
+                                                vector bool short __b) {
+  return __a & ~__b;
+}
+
+static vector int __ATTRS_o_ai vec_vandc(vector int __a, vector int __b) {
+  return __a & ~__b;
+}
+
+static vector int __ATTRS_o_ai vec_vandc(vector bool int __a, vector int __b) {
+  return (vector int)__a & ~__b;
+}
+
+static vector int __ATTRS_o_ai vec_vandc(vector int __a, vector bool int __b) {
+  return __a & ~(vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vandc(vector unsigned int __a,
+                                                  vector unsigned int __b) {
+  return __a & ~__b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vandc(vector bool int __a,
+                                                  vector unsigned int __b) {
+  return (vector unsigned int)__a & ~__b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vandc(vector unsigned int __a,
+                                                  vector bool int __b) {
+  return __a & ~(vector unsigned int)__b;
+}
+
+static vector bool int __ATTRS_o_ai vec_vandc(vector bool int __a,
+                                              vector bool int __b) {
+  return __a & ~__b;
+}
+
+static vector float __ATTRS_o_ai vec_vandc(vector float __a, vector float __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a & ~(vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai vec_vandc(vector bool int __a,
+                                           vector float __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a & ~(vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai vec_vandc(vector float __a,
+                                           vector bool int __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a & ~(vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+#ifdef __VSX__
+static vector signed long long __ATTRS_o_ai
+vec_vandc(vector signed long long __a, vector signed long long __b) {
+  return __a & ~__b;
+}
+
+static vector signed long long __ATTRS_o_ai
+vec_vandc(vector bool long long __a, vector signed long long __b) {
+  return (vector signed long long)__a & ~__b;
+}
+
+static vector signed long long __ATTRS_o_ai
+vec_vandc(vector signed long long __a, vector bool long long __b) {
+  return __a & ~(vector signed long long)__b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_vandc(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a & ~__b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_vandc(vector bool long long __a, vector unsigned long long __b) {
+  return (vector unsigned long long)__a & ~__b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_vandc(vector unsigned long long __a, vector bool long long __b) {
+  return __a & ~(vector unsigned long long)__b;
+}
+
+static vector bool long long __ATTRS_o_ai vec_vandc(vector bool long long __a,
+                                                    vector bool long long __b) {
+  return __a & ~__b;
+}
+#endif
+
+/* vec_avg */
+
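+/* Each vavg instruction produces the rounded average of corresponding
+   elements, i.e. (__a[i] + __b[i] + 1) >> 1 computed without intermediate
+   overflow. */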
+static vector signed char __ATTRS_o_ai vec_avg(vector signed char __a,
+                                               vector signed char __b) {
+  return __builtin_altivec_vavgsb(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_avg(vector unsigned char __a,
+                                                 vector unsigned char __b) {
+  return __builtin_altivec_vavgub(__a, __b);
+}
+
+static vector short __ATTRS_o_ai vec_avg(vector short __a, vector short __b) {
+  return __builtin_altivec_vavgsh(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_avg(vector unsigned short __a,
+                                                  vector unsigned short __b) {
+  return __builtin_altivec_vavguh(__a, __b);
+}
+
+static vector int __ATTRS_o_ai vec_avg(vector int __a, vector int __b) {
+  return __builtin_altivec_vavgsw(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_avg(vector unsigned int __a,
+                                                vector unsigned int __b) {
+  return __builtin_altivec_vavguw(__a, __b);
+}
+
+/* vec_vavgsb */
+
+static vector signed char __attribute__((__always_inline__))
+vec_vavgsb(vector signed char __a, vector signed char __b) {
+  return __builtin_altivec_vavgsb(__a, __b);
+}
+
+/* vec_vavgub */
+
+static vector unsigned char __attribute__((__always_inline__))
+vec_vavgub(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_altivec_vavgub(__a, __b);
+}
+
+/* vec_vavgsh */
+
+static vector short __attribute__((__always_inline__))
+vec_vavgsh(vector short __a, vector short __b) {
+  return __builtin_altivec_vavgsh(__a, __b);
+}
+
+/* vec_vavguh */
+
+static vector unsigned short __attribute__((__always_inline__))
+vec_vavguh(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_altivec_vavguh(__a, __b);
+}
+
+/* vec_vavgsw */
+
+static vector int __attribute__((__always_inline__))
+vec_vavgsw(vector int __a, vector int __b) {
+  return __builtin_altivec_vavgsw(__a, __b);
+}
+
+/* vec_vavguw */
+
+static vector unsigned int __attribute__((__always_inline__))
+vec_vavguw(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_altivec_vavguw(__a, __b);
+}
+
+/* vec_ceil */
+
+static vector float __ATTRS_o_ai vec_ceil(vector float __a) {
+#ifdef __VSX__
+  return __builtin_vsx_xvrspip(__a);
+#else
+  return __builtin_altivec_vrfip(__a);
+#endif
+}
+
+#ifdef __VSX__
+static vector double __ATTRS_o_ai vec_ceil(vector double __a) {
+  return __builtin_vsx_xvrdpip(__a);
+}
+#endif
+
+/* vec_vrfip */
+
+static vector float __attribute__((__always_inline__))
+vec_vrfip(vector float __a) {
+  return __builtin_altivec_vrfip(__a);
+}
+
+/* vec_cmpb */
+
+static vector int __attribute__((__always_inline__))
+vec_cmpb(vector float __a, vector float __b) {
+  return __builtin_altivec_vcmpbfp(__a, __b);
+}
+
+/* vec_vcmpbfp */
+
+static vector int __attribute__((__always_inline__))
+vec_vcmpbfp(vector float __a, vector float __b) {
+  return __builtin_altivec_vcmpbfp(__a, __b);
+}
+
+/* vec_cmpeq */
+
+static vector bool char __ATTRS_o_ai vec_cmpeq(vector signed char __a,
+                                               vector signed char __b) {
+  return (vector bool char)__builtin_altivec_vcmpequb((vector char)__a,
+                                                      (vector char)__b);
+}
+
+static vector bool char __ATTRS_o_ai vec_cmpeq(vector unsigned char __a,
+                                               vector unsigned char __b) {
+  return (vector bool char)__builtin_altivec_vcmpequb((vector char)__a,
+                                                      (vector char)__b);
+}
+
+static vector bool short __ATTRS_o_ai vec_cmpeq(vector short __a,
+                                                vector short __b) {
+  return (vector bool short)__builtin_altivec_vcmpequh(__a, __b);
+}
+
+static vector bool short __ATTRS_o_ai vec_cmpeq(vector unsigned short __a,
+                                                vector unsigned short __b) {
+  return (vector bool short)__builtin_altivec_vcmpequh((vector short)__a,
+                                                       (vector short)__b);
+}
+
+static vector bool int __ATTRS_o_ai vec_cmpeq(vector int __a, vector int __b) {
+  return (vector bool int)__builtin_altivec_vcmpequw(__a, __b);
+}
+
+static vector bool int __ATTRS_o_ai vec_cmpeq(vector unsigned int __a,
+                                              vector unsigned int __b) {
+  return (vector bool int)__builtin_altivec_vcmpequw((vector int)__a,
+                                                     (vector int)__b);
+}
+
+#ifdef __POWER8_VECTOR__
+static vector bool long long __ATTRS_o_ai
+vec_cmpeq(vector signed long long __a, vector signed long long __b) {
+  return (vector bool long long)__builtin_altivec_vcmpequd(__a, __b);
+}
+
+static vector bool long long __ATTRS_o_ai
+vec_cmpeq(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector bool long long)__builtin_altivec_vcmpequd(
+      (vector long long)__a, (vector long long)__b);
+}
+#endif
+
+static vector bool int __ATTRS_o_ai vec_cmpeq(vector float __a,
+                                              vector float __b) {
+#ifdef __VSX__
+  return (vector bool int)__builtin_vsx_xvcmpeqsp(__a, __b);
+#else
+  return (vector bool int)__builtin_altivec_vcmpeqfp(__a, __b);
+#endif
+}
+
+#ifdef __VSX__
+static vector bool long long __ATTRS_o_ai
+vec_cmpeq(vector double __a, vector double __b) {
+  return (vector bool long long)__builtin_vsx_xvcmpeqdp(__a, __b);
+}
+#endif
+
+/* vec_cmpgt */
+
+static vector bool char __ATTRS_o_ai vec_cmpgt(vector signed char __a,
+                                               vector signed char __b) {
+  return (vector bool char)__builtin_altivec_vcmpgtsb(__a, __b);
+}
+
+static vector bool char __ATTRS_o_ai vec_cmpgt(vector unsigned char __a,
+                                               vector unsigned char __b) {
+  return (vector bool char)__builtin_altivec_vcmpgtub(__a, __b);
+}
+
+static vector bool short __ATTRS_o_ai vec_cmpgt(vector short __a,
+                                                vector short __b) {
+  return (vector bool short)__builtin_altivec_vcmpgtsh(__a, __b);
+}
+
+static vector bool short __ATTRS_o_ai vec_cmpgt(vector unsigned short __a,
+                                                vector unsigned short __b) {
+  return (vector bool short)__builtin_altivec_vcmpgtuh(__a, __b);
+}
+
+static vector bool int __ATTRS_o_ai vec_cmpgt(vector int __a, vector int __b) {
+  return (vector bool int)__builtin_altivec_vcmpgtsw(__a, __b);
+}
+
+static vector bool int __ATTRS_o_ai vec_cmpgt(vector unsigned int __a,
+                                              vector unsigned int __b) {
+  return (vector bool int)__builtin_altivec_vcmpgtuw(__a, __b);
+}
+
+#ifdef __POWER8_VECTOR__
+static vector bool long long __ATTRS_o_ai
+vec_cmpgt(vector signed long long __a, vector signed long long __b) {
+  return (vector bool long long)__builtin_altivec_vcmpgtsd(__a, __b);
+}
+
+static vector bool long long __ATTRS_o_ai
+vec_cmpgt(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector bool long long)__builtin_altivec_vcmpgtud(__a, __b);
+}
+#endif
+
+static vector bool int __ATTRS_o_ai vec_cmpgt(vector float __a,
+                                              vector float __b) {
+#ifdef __VSX__
+  return (vector bool int)__builtin_vsx_xvcmpgtsp(__a, __b);
+#else
+  return (vector bool int)__builtin_altivec_vcmpgtfp(__a, __b);
+#endif
+}
+
+#ifdef __VSX__
+static vector bool long long __ATTRS_o_ai
+vec_cmpgt(vector double __a, vector double __b) {
+  return (vector bool long long)__builtin_vsx_xvcmpgtdp(__a, __b);
+}
+#endif
+
+/* vec_cmpge */
+
+static vector bool char __ATTRS_o_ai
+vec_cmpge(vector signed char __a, vector signed char __b) {
+  return ~(vec_cmpgt(__b, __a));
+}
+
+static vector bool char __ATTRS_o_ai
+vec_cmpge(vector unsigned char __a, vector unsigned char __b) {
+  return ~(vec_cmpgt(__b, __a));
+}
+
+static vector bool short __ATTRS_o_ai
+vec_cmpge(vector signed short __a, vector signed short __b) {
+  return ~(vec_cmpgt(__b, __a));
+}
+
+static vector bool short __ATTRS_o_ai
+vec_cmpge(vector unsigned short __a, vector unsigned short __b) {
+  return ~(vec_cmpgt(__b, __a));
+}
+
+static vector bool int __ATTRS_o_ai
+vec_cmpge(vector signed int __a, vector signed int __b) {
+  return ~(vec_cmpgt(__b, __a));
+}
+
+static vector bool int __ATTRS_o_ai
+vec_cmpge(vector unsigned int __a, vector unsigned int __b) {
+  return ~(vec_cmpgt(__b, __a));
+}
+
+static vector bool int __ATTRS_o_ai
+vec_cmpge(vector float __a, vector float __b) {
+#ifdef __VSX__
+  return (vector bool int)__builtin_vsx_xvcmpgesp(__a, __b);
+#else
+  return (vector bool int)__builtin_altivec_vcmpgefp(__a, __b);
+#endif
+}
+
+#ifdef __VSX__
+static vector bool long long __ATTRS_o_ai
+vec_cmpge(vector double __a, vector double __b) {
+  return (vector bool long long)__builtin_vsx_xvcmpgedp(__a, __b);
+}
+#endif
+
+#ifdef __POWER8_VECTOR__
+static vector bool long long __ATTRS_o_ai
+vec_cmpge(vector signed long long __a, vector signed long long __b) {
+  return ~(vec_cmpgt(__b, __a));
+}
+
+static vector bool long long __ATTRS_o_ai
+vec_cmpge(vector unsigned long long __a, vector unsigned long long __b) {
+  return ~(vec_cmpgt(__b, __a));
+}
+#endif
+
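+/* Illustrative sketch (made-up values): the integer vec_cmpge overloads above
+   use the identity (a >= b) == !(b > a), so with
+
+     vector signed int __x = {1, 2, 3, 4};
+     vector signed int __y = {1, 3, 2, 4};
+
+   vec_cmpge(__x, __y) is all-ones in lanes 0, 2 and 3 and all-zeros in lane 1,
+   i.e. the bitwise complement of vec_cmpgt(__y, __x). */
+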
+/* vec_vcmpgefp */
+
+static vector bool int __attribute__((__always_inline__))
+vec_vcmpgefp(vector float __a, vector float __b) {
+  return (vector bool int)__builtin_altivec_vcmpgefp(__a, __b);
+}
+
+/* vec_vcmpgtsb */
+
+static vector bool char __attribute__((__always_inline__))
+vec_vcmpgtsb(vector signed char __a, vector signed char __b) {
+  return (vector bool char)__builtin_altivec_vcmpgtsb(__a, __b);
+}
+
+/* vec_vcmpgtub */
+
+static vector bool char __attribute__((__always_inline__))
+vec_vcmpgtub(vector unsigned char __a, vector unsigned char __b) {
+  return (vector bool char)__builtin_altivec_vcmpgtub(__a, __b);
+}
+
+/* vec_vcmpgtsh */
+
+static vector bool short __attribute__((__always_inline__))
+vec_vcmpgtsh(vector short __a, vector short __b) {
+  return (vector bool short)__builtin_altivec_vcmpgtsh(__a, __b);
+}
+
+/* vec_vcmpgtuh */
+
+static vector bool short __attribute__((__always_inline__))
+vec_vcmpgtuh(vector unsigned short __a, vector unsigned short __b) {
+  return (vector bool short)__builtin_altivec_vcmpgtuh(__a, __b);
+}
+
+/* vec_vcmpgtsw */
+
+static vector bool int __attribute__((__always_inline__))
+vec_vcmpgtsw(vector int __a, vector int __b) {
+  return (vector bool int)__builtin_altivec_vcmpgtsw(__a, __b);
+}
+
+/* vec_vcmpgtuw */
+
+static vector bool int __attribute__((__always_inline__))
+vec_vcmpgtuw(vector unsigned int __a, vector unsigned int __b) {
+  return (vector bool int)__builtin_altivec_vcmpgtuw(__a, __b);
+}
+
+/* vec_vcmpgtfp */
+
+static vector bool int __attribute__((__always_inline__))
+vec_vcmpgtfp(vector float __a, vector float __b) {
+  return (vector bool int)__builtin_altivec_vcmpgtfp(__a, __b);
+}
+
+/* vec_cmple */
+
+static vector bool char __ATTRS_o_ai
+vec_cmple(vector signed char __a, vector signed char __b) {
+  return vec_cmpge(__b, __a);
+}
+
+static vector bool char __ATTRS_o_ai
+vec_cmple(vector unsigned char __a, vector unsigned char __b) {
+  return vec_cmpge(__b, __a);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_cmple(vector signed short __a, vector signed short __b) {
+  return vec_cmpge(__b, __a);
+}
+
+static vector bool short __ATTRS_o_ai
+vec_cmple(vector unsigned short __a, vector unsigned short __b) {
+  return vec_cmpge(__b, __a);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_cmple(vector signed int __a, vector signed int __b) {
+  return vec_cmpge(__b, __a);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_cmple(vector unsigned int __a, vector unsigned int __b) {
+  return vec_cmpge(__b, __a);
+}
+
+static vector bool int __ATTRS_o_ai
+vec_cmple(vector float __a, vector float __b) {
+  return vec_cmpge(__b, __a);
+}
+
+#ifdef __VSX__
+static vector bool long long __ATTRS_o_ai
+vec_cmple(vector double __a, vector double __b) {
+  return vec_cmpge(__b, __a);
+}
+#endif
+
+#ifdef __POWER8_VECTOR__
+static vector bool long long __ATTRS_o_ai
+vec_cmple(vector signed long long __a, vector signed long long __b) {
+  return vec_cmpge(__b, __a);
+}
+
+static vector bool long long __ATTRS_o_ai
+vec_cmple(vector unsigned long long __a, vector unsigned long long __b) {
+  return vec_cmpge(__b, __a);
+}
+#endif
+
+/* vec_cmplt */
+
+static vector bool char __ATTRS_o_ai vec_cmplt(vector signed char __a,
+                                               vector signed char __b) {
+  return vec_cmpgt(__b, __a);
+}
+
+static vector bool char __ATTRS_o_ai vec_cmplt(vector unsigned char __a,
+                                               vector unsigned char __b) {
+  return vec_cmpgt(__b, __a);
+}
+
+static vector bool short __ATTRS_o_ai vec_cmplt(vector short __a,
+                                                vector short __b) {
+  return vec_cmpgt(__b, __a);
+}
+
+static vector bool short __ATTRS_o_ai vec_cmplt(vector unsigned short __a,
+                                                vector unsigned short __b) {
+  return vec_cmpgt(__b, __a);
+}
+
+static vector bool int __ATTRS_o_ai vec_cmplt(vector int __a, vector int __b) {
+  return vec_cmpgt(__b, __a);
+}
+
+static vector bool int __ATTRS_o_ai vec_cmplt(vector unsigned int __a,
+                                              vector unsigned int __b) {
+  return vec_cmpgt(__b, __a);
+}
+
+static vector bool int __ATTRS_o_ai vec_cmplt(vector float __a,
+                                              vector float __b) {
+  return vec_cmpgt(__b, __a);
+}
+
+#ifdef __VSX__
+static vector bool long long __ATTRS_o_ai
+vec_cmplt(vector double __a, vector double __b) {
+  return vec_cmpgt(__b, __a);
+}
+#endif
+
+#ifdef __POWER8_VECTOR__
+static vector bool long long __ATTRS_o_ai
+vec_cmplt(vector signed long long __a, vector signed long long __b) {
+  return vec_cmpgt(__b, __a);
+}
+
+static vector bool long long __ATTRS_o_ai
+vec_cmplt(vector unsigned long long __a, vector unsigned long long __b) {
+  return vec_cmpgt(__b, __a);
+}
+
+/* vec_cntlz */
+
+static vector signed char __ATTRS_o_ai vec_cntlz(vector signed char __a) {
+  return __builtin_altivec_vclzb(__a);
+}
+static vector unsigned char __ATTRS_o_ai vec_cntlz(vector unsigned char __a) {
+  return __builtin_altivec_vclzb(__a);
+}
+static vector signed short __ATTRS_o_ai vec_cntlz(vector signed short __a) {
+  return __builtin_altivec_vclzh(__a);
+}
+static vector unsigned short __ATTRS_o_ai vec_cntlz(vector unsigned short __a) {
+  return __builtin_altivec_vclzh(__a);
+}
+static vector signed int __ATTRS_o_ai vec_cntlz(vector signed int __a) {
+  return __builtin_altivec_vclzw(__a);
+}
+static vector unsigned int __ATTRS_o_ai vec_cntlz(vector unsigned int __a) {
+  return __builtin_altivec_vclzw(__a);
+}
+static vector signed long long __ATTRS_o_ai
+vec_cntlz(vector signed long long __a) {
+  return __builtin_altivec_vclzd(__a);
+}
+static vector unsigned long long __ATTRS_o_ai
+vec_cntlz(vector unsigned long long __a) {
+  return __builtin_altivec_vclzd(__a);
+}
+#endif
+
+/* vec_cpsgn */
+
+#ifdef __VSX__
+static vector float __ATTRS_o_ai vec_cpsgn(vector float __a, vector float __b) {
+  return __builtin_vsx_xvcpsgnsp(__a, __b);
+}
+
+static vector double __ATTRS_o_ai vec_cpsgn(vector double __a,
+                                            vector double __b) {
+  return __builtin_vsx_xvcpsgndp(__a, __b);
+}
+#endif
+
+/* vec_ctf */
+
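+/* __b is a compile-time scale factor: each fixed-point element is converted to
+   float and divided by 2**__b, so __b == 0 gives a plain int-to-float
+   conversion. */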
+static vector float __ATTRS_o_ai vec_ctf(vector int __a, int __b) {
+  return __builtin_altivec_vcfsx(__a, __b);
+}
+
+static vector float __ATTRS_o_ai vec_ctf(vector unsigned int __a, int __b) {
+  return __builtin_altivec_vcfux((vector int)__a, __b);
+}
+
+/* vec_vcfsx */
+
+static vector float __attribute__((__always_inline__))
+vec_vcfsx(vector int __a, int __b) {
+  return __builtin_altivec_vcfsx(__a, __b);
+}
+
+/* vec_vcfux */
+
+static vector float __attribute__((__always_inline__))
+vec_vcfux(vector unsigned int __a, int __b) {
+  return __builtin_altivec_vcfux((vector int)__a, __b);
+}
+
+/* vec_cts */
+
+static vector int __attribute__((__always_inline__))
+vec_cts(vector float __a, int __b) {
+  return __builtin_altivec_vctsxs(__a, __b);
+}
+
+/* vec_vctsxs */
+
+static vector int __attribute__((__always_inline__))
+vec_vctsxs(vector float __a, int __b) {
+  return __builtin_altivec_vctsxs(__a, __b);
+}
+
+/* vec_ctu */
+
+static vector unsigned int __attribute__((__always_inline__))
+vec_ctu(vector float __a, int __b) {
+  return __builtin_altivec_vctuxs(__a, __b);
+}
+
+/* vec_vctuxs */
+
+static vector unsigned int __attribute__((__always_inline__))
+vec_vctuxs(vector float __a, int __b) {
+  return __builtin_altivec_vctuxs(__a, __b);
+}
+
+/* vec_double */
+
+#ifdef __VSX__
+static vector double __ATTRS_o_ai vec_double(vector signed long long __a) {
+  vector double __ret = { __a[0], __a[1] };
+  return __ret;
+}
+
+static vector double __ATTRS_o_ai vec_double(vector unsigned long long __a) {
+  vector double __ret = { __a[0], __a[1] };
+  return __ret;
+}
+#endif
+
+/* vec_div */
+
+/* Integer vector divides: the vectors are scalarized, the elements are
+   divided, and the results are reassembled into a vector. */
+static vector signed char __ATTRS_o_ai vec_div(vector signed char __a,
+                                               vector signed char __b) {
+  return __a / __b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_div(vector unsigned char __a,
+                                                 vector unsigned char __b) {
+  return __a / __b;
+}
+
+static vector signed short __ATTRS_o_ai vec_div(vector signed short __a,
+                                                vector signed short __b) {
+  return __a / __b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_div(vector unsigned short __a,
+                                                  vector unsigned short __b) {
+  return __a / __b;
+}
+
+static vector signed int __ATTRS_o_ai vec_div(vector signed int __a,
+                                              vector signed int __b) {
+  return __a / __b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_div(vector unsigned int __a,
+                                                vector unsigned int __b) {
+  return __a / __b;
+}
+
+#ifdef __VSX__
+static vector signed long long __ATTRS_o_ai
+vec_div(vector signed long long __a, vector signed long long __b) {
+  return __a / __b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_div(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a / __b;
+}
+
+static vector float __ATTRS_o_ai vec_div(vector float __a, vector float __b) {
+  return __a / __b;
+}
+
+static vector double __ATTRS_o_ai vec_div(vector double __a,
+                                          vector double __b) {
+  return __a / __b;
+}
+#endif
+
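+/* Illustrative usage (made-up values): each lane is divided independently, so
+
+     vector signed int __n = {10, 21, 30, 44};
+     vector signed int __d = { 2,  7,  5, 11};
+
+   vec_div(__n, __d) yields {5, 3, 6, 4}, with the scalarization noted above
+   handled by the compiler. */
+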
+/* vec_dss */
+
+static void __attribute__((__always_inline__)) vec_dss(int __a) {
+  __builtin_altivec_dss(__a);
+}
+
+/* vec_dssall */
+
+static void __attribute__((__always_inline__)) vec_dssall(void) {
+  __builtin_altivec_dssall();
+}
+
+/* vec_dst */
+
+static void __attribute__((__always_inline__))
+vec_dst(const void *__a, int __b, int __c) {
+  __builtin_altivec_dst(__a, __b, __c);
+}
+
+/* vec_dstst */
+
+static void __attribute__((__always_inline__))
+vec_dstst(const void *__a, int __b, int __c) {
+  __builtin_altivec_dstst(__a, __b, __c);
+}
+
+/* vec_dststt */
+
+static void __attribute__((__always_inline__))
+vec_dststt(const void *__a, int __b, int __c) {
+  __builtin_altivec_dststt(__a, __b, __c);
+}
+
+/* vec_dstt */
+
+static void __attribute__((__always_inline__))
+vec_dstt(const void *__a, int __b, int __c) {
+  __builtin_altivec_dstt(__a, __b, __c);
+}
+
+/* vec_eqv */
+
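+/* vec_eqv is a bitwise equivalence (XNOR): every overload returns
+   ~(__a ^ __b), implemented with the VSX xxleqv instruction. */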
+#ifdef __POWER8_VECTOR__
+static vector signed char __ATTRS_o_ai vec_eqv(vector signed char __a,
+                                               vector signed char __b) {
+  return (vector signed char)__builtin_vsx_xxleqv((vector unsigned int)__a,
+                                                  (vector unsigned int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_eqv(vector unsigned char __a,
+                                                 vector unsigned char __b) {
+  return (vector unsigned char)__builtin_vsx_xxleqv((vector unsigned int)__a,
+                                                    (vector unsigned int)__b);
+}
+
+static vector bool char __ATTRS_o_ai vec_eqv(vector bool char __a,
+                                             vector bool char __b) {
+  return (vector bool char)__builtin_vsx_xxleqv((vector unsigned int)__a,
+                                                (vector unsigned int)__b);
+}
+
+static vector signed short __ATTRS_o_ai vec_eqv(vector signed short __a,
+                                                vector signed short __b) {
+  return (vector signed short)__builtin_vsx_xxleqv((vector unsigned int)__a,
+                                                   (vector unsigned int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_eqv(vector unsigned short __a,
+                                                  vector unsigned short __b) {
+  return (vector unsigned short)__builtin_vsx_xxleqv((vector unsigned int)__a,
+                                                     (vector unsigned int)__b);
+}
+
+static vector bool short __ATTRS_o_ai vec_eqv(vector bool short __a,
+                                              vector bool short __b) {
+  return (vector bool short)__builtin_vsx_xxleqv((vector unsigned int)__a,
+                                                 (vector unsigned int)__b);
+}
+
+static vector signed int __ATTRS_o_ai vec_eqv(vector signed int __a,
+                                              vector signed int __b) {
+  return (vector signed int)__builtin_vsx_xxleqv((vector unsigned int)__a,
+                                                 (vector unsigned int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_eqv(vector unsigned int __a,
+                                                vector unsigned int __b) {
+  return __builtin_vsx_xxleqv(__a, __b);
+}
+
+static vector bool int __ATTRS_o_ai vec_eqv(vector bool int __a,
+                                            vector bool int __b) {
+  return (vector bool int)__builtin_vsx_xxleqv((vector unsigned int)__a,
+                                               (vector unsigned int)__b);
+}
+
+static vector signed long long __ATTRS_o_ai
+vec_eqv(vector signed long long __a, vector signed long long __b) {
+  return (vector signed long long)
+    __builtin_vsx_xxleqv((vector unsigned int)__a, (vector unsigned int)__b);
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_eqv(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector unsigned long long)
+    __builtin_vsx_xxleqv((vector unsigned int)__a, (vector unsigned int)__b);
+}
+
+static vector bool long long __ATTRS_o_ai
+vec_eqv(vector bool long long __a, vector bool long long __b) {
+  return (vector bool long long)
+    __builtin_vsx_xxleqv((vector unsigned int)__a, (vector unsigned int)__b);
+}
+
+static vector float __ATTRS_o_ai vec_eqv(vector float __a, vector float __b) {
+  return (vector float)__builtin_vsx_xxleqv((vector unsigned int)__a,
+                                            (vector unsigned int)__b);
+}
+
+static vector double __ATTRS_o_ai vec_eqv(vector double __a,
+                                          vector double __b) {
+  return (vector double)__builtin_vsx_xxleqv((vector unsigned int)__a,
+                                             (vector unsigned int)__b);
+}
+#endif
+
+/* vec_expte */
+
+static vector float __attribute__((__always_inline__))
+vec_expte(vector float __a) {
+  return __builtin_altivec_vexptefp(__a);
+}
+
+/* vec_vexptefp */
+
+static vector float __attribute__((__always_inline__))
+vec_vexptefp(vector float __a) {
+  return __builtin_altivec_vexptefp(__a);
+}
+
+/* vec_floor */
+
+static vector float __ATTRS_o_ai vec_floor(vector float __a) {
+#ifdef __VSX__
+  return __builtin_vsx_xvrspim(__a);
+#else
+  return __builtin_altivec_vrfim(__a);
+#endif
+}
+
+#ifdef __VSX__
+static vector double __ATTRS_o_ai vec_floor(vector double __a) {
+  return __builtin_vsx_xvrdpim(__a);
+}
+#endif
+
+/* vec_vrfim */
+
+static vector float __attribute__((__always_inline__))
+vec_vrfim(vector float __a) {
+  return __builtin_altivec_vrfim(__a);
+}
+
+/* vec_ld */
+
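+/* vec_ld wraps the lvx instruction, which loads the 16-byte block containing
+   the effective address __b + __a; the low four address bits are ignored, so
+   the address should normally be 16-byte aligned. */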
+static vector signed char __ATTRS_o_ai vec_ld(int __a,
+                                              const vector signed char *__b) {
+  return (vector signed char)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai vec_ld(int __a, const signed char *__b) {
+  return (vector signed char)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_ld(int __a, const vector unsigned char *__b) {
+  return (vector unsigned char)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_ld(int __a,
+                                                const unsigned char *__b) {
+  return (vector unsigned char)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector bool char __ATTRS_o_ai vec_ld(int __a,
+                                            const vector bool char *__b) {
+  return (vector bool char)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector short __ATTRS_o_ai vec_ld(int __a, const vector short *__b) {
+  return (vector short)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector short __ATTRS_o_ai vec_ld(int __a, const short *__b) {
+  return (vector short)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_ld(int __a, const vector unsigned short *__b) {
+  return (vector unsigned short)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_ld(int __a,
+                                                 const unsigned short *__b) {
+  return (vector unsigned short)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector bool short __ATTRS_o_ai vec_ld(int __a,
+                                             const vector bool short *__b) {
+  return (vector bool short)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector pixel __ATTRS_o_ai vec_ld(int __a, const vector pixel *__b) {
+  return (vector pixel)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector int __ATTRS_o_ai vec_ld(int __a, const vector int *__b) {
+  return (vector int)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector int __ATTRS_o_ai vec_ld(int __a, const int *__b) {
+  return (vector int)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_ld(int __a,
+                                               const vector unsigned int *__b) {
+  return (vector unsigned int)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_ld(int __a,
+                                               const unsigned int *__b) {
+  return (vector unsigned int)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector bool int __ATTRS_o_ai vec_ld(int __a,
+                                           const vector bool int *__b) {
+  return (vector bool int)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector float __ATTRS_o_ai vec_ld(int __a, const vector float *__b) {
+  return (vector float)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector float __ATTRS_o_ai vec_ld(int __a, const float *__b) {
+  return (vector float)__builtin_altivec_lvx(__a, __b);
+}
+
+/* vec_lvx */
+
+static vector signed char __ATTRS_o_ai vec_lvx(int __a,
+                                               const vector signed char *__b) {
+  return (vector signed char)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai vec_lvx(int __a,
+                                               const signed char *__b) {
+  return (vector signed char)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvx(int __a, const vector unsigned char *__b) {
+  return (vector unsigned char)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_lvx(int __a,
+                                                 const unsigned char *__b) {
+  return (vector unsigned char)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector bool char __ATTRS_o_ai vec_lvx(int __a,
+                                             const vector bool char *__b) {
+  return (vector bool char)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector short __ATTRS_o_ai vec_lvx(int __a, const vector short *__b) {
+  return (vector short)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector short __ATTRS_o_ai vec_lvx(int __a, const short *__b) {
+  return (vector short)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_lvx(int __a, const vector unsigned short *__b) {
+  return (vector unsigned short)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_lvx(int __a,
+                                                  const unsigned short *__b) {
+  return (vector unsigned short)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector bool short __ATTRS_o_ai vec_lvx(int __a,
+                                              const vector bool short *__b) {
+  return (vector bool short)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector pixel __ATTRS_o_ai vec_lvx(int __a, const vector pixel *__b) {
+  return (vector pixel)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector int __ATTRS_o_ai vec_lvx(int __a, const vector int *__b) {
+  return (vector int)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector int __ATTRS_o_ai vec_lvx(int __a, const int *__b) {
+  return (vector int)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_lvx(int __a, const vector unsigned int *__b) {
+  return (vector unsigned int)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_lvx(int __a,
+                                                const unsigned int *__b) {
+  return (vector unsigned int)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector bool int __ATTRS_o_ai vec_lvx(int __a,
+                                            const vector bool int *__b) {
+  return (vector bool int)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector float __ATTRS_o_ai vec_lvx(int __a, const vector float *__b) {
+  return (vector float)__builtin_altivec_lvx(__a, __b);
+}
+
+static vector float __ATTRS_o_ai vec_lvx(int __a, const float *__b) {
+  return (vector float)__builtin_altivec_lvx(__a, __b);
+}
+
+/* vec_lde */
+
+static vector signed char __ATTRS_o_ai vec_lde(int __a,
+                                               const signed char *__b) {
+  return (vector signed char)__builtin_altivec_lvebx(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_lde(int __a,
+                                                 const unsigned char *__b) {
+  return (vector unsigned char)__builtin_altivec_lvebx(__a, __b);
+}
+
+static vector short __ATTRS_o_ai vec_lde(int __a, const short *__b) {
+  return (vector short)__builtin_altivec_lvehx(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_lde(int __a,
+                                                  const unsigned short *__b) {
+  return (vector unsigned short)__builtin_altivec_lvehx(__a, __b);
+}
+
+static vector int __ATTRS_o_ai vec_lde(int __a, const int *__b) {
+  return (vector int)__builtin_altivec_lvewx(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_lde(int __a,
+                                                const unsigned int *__b) {
+  return (vector unsigned int)__builtin_altivec_lvewx(__a, __b);
+}
+
+static vector float __ATTRS_o_ai vec_lde(int __a, const float *__b) {
+  return (vector float)__builtin_altivec_lvewx(__a, __b);
+}
+
+/* vec_lvebx */
+
+static vector signed char __ATTRS_o_ai vec_lvebx(int __a,
+                                                 const signed char *__b) {
+  return (vector signed char)__builtin_altivec_lvebx(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_lvebx(int __a,
+                                                   const unsigned char *__b) {
+  return (vector unsigned char)__builtin_altivec_lvebx(__a, __b);
+}
+
+/* vec_lvehx */
+
+static vector short __ATTRS_o_ai vec_lvehx(int __a, const short *__b) {
+  return (vector short)__builtin_altivec_lvehx(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_lvehx(int __a,
+                                                    const unsigned short *__b) {
+  return (vector unsigned short)__builtin_altivec_lvehx(__a, __b);
+}
+
+/* vec_lvewx */
+
+static vector int __ATTRS_o_ai vec_lvewx(int __a, const int *__b) {
+  return (vector int)__builtin_altivec_lvewx(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_lvewx(int __a,
+                                                  const unsigned int *__b) {
+  return (vector unsigned int)__builtin_altivec_lvewx(__a, __b);
+}
+
+static vector float __ATTRS_o_ai vec_lvewx(int __a, const float *__b) {
+  return (vector float)__builtin_altivec_lvewx(__a, __b);
+}
+
+/* vec_ldl */
+
+static vector signed char __ATTRS_o_ai vec_ldl(int __a,
+                                               const vector signed char *__b) {
+  return (vector signed char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai vec_ldl(int __a,
+                                               const signed char *__b) {
+  return (vector signed char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_ldl(int __a, const vector unsigned char *__b) {
+  return (vector unsigned char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_ldl(int __a,
+                                                 const unsigned char *__b) {
+  return (vector unsigned char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector bool char __ATTRS_o_ai vec_ldl(int __a,
+                                             const vector bool char *__b) {
+  return (vector bool char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector short __ATTRS_o_ai vec_ldl(int __a, const vector short *__b) {
+  return (vector short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector short __ATTRS_o_ai vec_ldl(int __a, const short *__b) {
+  return (vector short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_ldl(int __a, const vector unsigned short *__b) {
+  return (vector unsigned short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_ldl(int __a,
+                                                  const unsigned short *__b) {
+  return (vector unsigned short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector bool short __ATTRS_o_ai vec_ldl(int __a,
+                                              const vector bool short *__b) {
+  return (vector bool short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector pixel __ATTRS_o_ai vec_ldl(int __a, const vector pixel *__b) {
+  return (vector pixel)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector int __ATTRS_o_ai vec_ldl(int __a, const vector int *__b) {
+  return (vector int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector int __ATTRS_o_ai vec_ldl(int __a, const int *__b) {
+  return (vector int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_ldl(int __a, const vector unsigned int *__b) {
+  return (vector unsigned int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_ldl(int __a,
+                                                const unsigned int *__b) {
+  return (vector unsigned int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector bool int __ATTRS_o_ai vec_ldl(int __a,
+                                            const vector bool int *__b) {
+  return (vector bool int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector float __ATTRS_o_ai vec_ldl(int __a, const vector float *__b) {
+  return (vector float)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector float __ATTRS_o_ai vec_ldl(int __a, const float *__b) {
+  return (vector float)__builtin_altivec_lvxl(__a, __b);
+}
+
+/* vec_lvxl */
+
+static vector signed char __ATTRS_o_ai vec_lvxl(int __a,
+                                                const vector signed char *__b) {
+  return (vector signed char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai vec_lvxl(int __a,
+                                                const signed char *__b) {
+  return (vector signed char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvxl(int __a, const vector unsigned char *__b) {
+  return (vector unsigned char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_lvxl(int __a,
+                                                  const unsigned char *__b) {
+  return (vector unsigned char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector bool char __ATTRS_o_ai vec_lvxl(int __a,
+                                              const vector bool char *__b) {
+  return (vector bool char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector short __ATTRS_o_ai vec_lvxl(int __a, const vector short *__b) {
+  return (vector short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector short __ATTRS_o_ai vec_lvxl(int __a, const short *__b) {
+  return (vector short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_lvxl(int __a, const vector unsigned short *__b) {
+  return (vector unsigned short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_lvxl(int __a,
+                                                   const unsigned short *__b) {
+  return (vector unsigned short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector bool short __ATTRS_o_ai vec_lvxl(int __a,
+                                               const vector bool short *__b) {
+  return (vector bool short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector pixel __ATTRS_o_ai vec_lvxl(int __a, const vector pixel *__b) {
+  return (vector pixel)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector int __ATTRS_o_ai vec_lvxl(int __a, const vector int *__b) {
+  return (vector int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector int __ATTRS_o_ai vec_lvxl(int __a, const int *__b) {
+  return (vector int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_lvxl(int __a, const vector unsigned int *__b) {
+  return (vector unsigned int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_lvxl(int __a,
+                                                 const unsigned int *__b) {
+  return (vector unsigned int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector bool int __ATTRS_o_ai vec_lvxl(int __a,
+                                             const vector bool int *__b) {
+  return (vector bool int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector float __ATTRS_o_ai vec_lvxl(int __a, const vector float *__b) {
+  return (vector float)__builtin_altivec_lvxl(__a, __b);
+}
+
+static vector float __ATTRS_o_ai vec_lvxl(int __a, const float *__b) {
+  return (vector float)__builtin_altivec_lvxl(__a, __b);
+}
+
+/* vec_loge */
+
+static vector float __attribute__((__always_inline__))
+vec_loge(vector float __a) {
+  return __builtin_altivec_vlogefp(__a);
+}
+
+/* vec_vlogefp */
+
+static vector float __attribute__((__always_inline__))
+vec_vlogefp(vector float __a) {
+  return __builtin_altivec_vlogefp(__a);
+}
+
+/* vec_lvsl */
+
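+/* On little-endian targets the shift-permute control produced by lvsl is
+ * byte-reversed with vec_perm before being returned, and these overloads are
+ * marked deprecated in favor of plain unaligned assignment. */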
+#ifdef __LITTLE_ENDIAN__
+static vector unsigned char __ATTRS_o_ai
+    __attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores"))) vec_lvsl(int __a, const signed char *__b) {
+  vector unsigned char mask =
+      (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+  vector unsigned char reverse = {15, 14, 13, 12, 11, 10, 9, 8,
+                                  7,  6,  5,  4,  3,  2,  1, 0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static vector unsigned char __ATTRS_o_ai vec_lvsl(int __a,
+                                                  const signed char *__b) {
+  return (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+static vector unsigned char __ATTRS_o_ai
+    __attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores"))) vec_lvsl(int __a, const unsigned char *__b) {
+  vector unsigned char mask =
+      (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+  vector unsigned char reverse = {15, 14, 13, 12, 11, 10, 9, 8,
+                                  7,  6,  5,  4,  3,  2,  1, 0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static vector unsigned char __ATTRS_o_ai vec_lvsl(int __a,
+                                                  const unsigned char *__b) {
+  return (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+static vector unsigned char __ATTRS_o_ai
+    __attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores"))) vec_lvsl(int __a, const short *__b) {
+  vector unsigned char mask =
+      (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+  vector unsigned char reverse = {15, 14, 13, 12, 11, 10, 9, 8,
+                                  7,  6,  5,  4,  3,  2,  1, 0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static vector unsigned char __ATTRS_o_ai vec_lvsl(int __a, const short *__b) {
+  return (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+static vector unsigned char __ATTRS_o_ai
+    __attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores"))) vec_lvsl(int __a, const unsigned short *__b) {
+  vector unsigned char mask =
+      (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+  vector unsigned char reverse = {15, 14, 13, 12, 11, 10, 9, 8,
+                                  7,  6,  5,  4,  3,  2,  1, 0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static vector unsigned char __ATTRS_o_ai vec_lvsl(int __a,
+                                                  const unsigned short *__b) {
+  return (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+static vector unsigned char __ATTRS_o_ai
+    __attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores"))) vec_lvsl(int __a, const int *__b) {
+  vector unsigned char mask =
+      (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+  vector unsigned char reverse = {15, 14, 13, 12, 11, 10, 9, 8,
+                                  7,  6,  5,  4,  3,  2,  1, 0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static vector unsigned char __ATTRS_o_ai vec_lvsl(int __a, const int *__b) {
+  return (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+static vector unsigned char __ATTRS_o_ai
+    __attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores"))) vec_lvsl(int __a, const unsigned int *__b) {
+  vector unsigned char mask =
+      (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+  vector unsigned char reverse = {15, 14, 13, 12, 11, 10, 9, 8,
+                                  7,  6,  5,  4,  3,  2,  1, 0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static vector unsigned char __ATTRS_o_ai vec_lvsl(int __a,
+                                                  const unsigned int *__b) {
+  return (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+static vector unsigned char __ATTRS_o_ai
+    __attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores"))) vec_lvsl(int __a, const float *__b) {
+  vector unsigned char mask =
+      (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+  vector unsigned char reverse = {15, 14, 13, 12, 11, 10, 9, 8,
+                                  7,  6,  5,  4,  3,  2,  1, 0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static vector unsigned char __ATTRS_o_ai vec_lvsl(int __a, const float *__b) {
+  return (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+}
+#endif
+
+/* vec_lvsr */
+
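+/* As with vec_lvsl, the little-endian overloads byte-reverse the control
+ * vector produced by lvsr and are marked deprecated. */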
+#ifdef __LITTLE_ENDIAN__
+static vector unsigned char __ATTRS_o_ai
+    __attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores"))) vec_lvsr(int __a, const signed char *__b) {
+  vector unsigned char mask =
+      (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+  vector unsigned char reverse = {15, 14, 13, 12, 11, 10, 9, 8,
+                                  7,  6,  5,  4,  3,  2,  1, 0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static vector unsigned char __ATTRS_o_ai vec_lvsr(int __a,
+                                                  const signed char *__b) {
+  return (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+static vector unsigned char __ATTRS_o_ai
+    __attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores"))) vec_lvsr(int __a, const unsigned char *__b) {
+  vector unsigned char mask =
+      (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+  vector unsigned char reverse = {15, 14, 13, 12, 11, 10, 9, 8,
+                                  7,  6,  5,  4,  3,  2,  1, 0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static vector unsigned char __ATTRS_o_ai vec_lvsr(int __a,
+                                                  const unsigned char *__b) {
+  return (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+static vector unsigned char __ATTRS_o_ai
+    __attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores"))) vec_lvsr(int __a, const short *__b) {
+  vector unsigned char mask =
+      (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+  vector unsigned char reverse = {15, 14, 13, 12, 11, 10, 9, 8,
+                                  7,  6,  5,  4,  3,  2,  1, 0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static vector unsigned char __ATTRS_o_ai vec_lvsr(int __a, const short *__b) {
+  return (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+static vector unsigned char __ATTRS_o_ai
+    __attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores"))) vec_lvsr(int __a, const unsigned short *__b) {
+  vector unsigned char mask =
+      (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+  vector unsigned char reverse = {15, 14, 13, 12, 11, 10, 9, 8,
+                                  7,  6,  5,  4,  3,  2,  1, 0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static vector unsigned char __ATTRS_o_ai vec_lvsr(int __a,
+                                                  const unsigned short *__b) {
+  return (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+static vector unsigned char __ATTRS_o_ai
+    __attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores"))) vec_lvsr(int __a, const int *__b) {
+  vector unsigned char mask =
+      (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+  vector unsigned char reverse = {15, 14, 13, 12, 11, 10, 9, 8,
+                                  7,  6,  5,  4,  3,  2,  1, 0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static vector unsigned char __ATTRS_o_ai vec_lvsr(int __a, const int *__b) {
+  return (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+static vector unsigned char __ATTRS_o_ai
+    __attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores"))) vec_lvsr(int __a, const unsigned int *__b) {
+  vector unsigned char mask =
+      (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+  vector unsigned char reverse = {15, 14, 13, 12, 11, 10, 9, 8,
+                                  7,  6,  5,  4,  3,  2,  1, 0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static vector unsigned char __ATTRS_o_ai vec_lvsr(int __a,
+                                                  const unsigned int *__b) {
+  return (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+static vector unsigned char __ATTRS_o_ai
+    __attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores"))) vec_lvsr(int __a, const float *__b) {
+  vector unsigned char mask =
+      (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+  vector unsigned char reverse = {15, 14, 13, 12, 11, 10, 9, 8,
+                                  7,  6,  5,  4,  3,  2,  1, 0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static vector unsigned char __ATTRS_o_ai vec_lvsr(int __a, const float *__b) {
+  return (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+}
+#endif
+
+/* vec_madd */
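+
+/* The integer overloads of vec_madd below are implemented in terms of
+ * vec_mladd, so its overload set is declared up front. */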
+static vector signed short __ATTRS_o_ai
+vec_mladd(vector signed short, vector signed short, vector signed short);
+static vector signed short __ATTRS_o_ai
+vec_mladd(vector signed short, vector unsigned short, vector unsigned short);
+static vector signed short __ATTRS_o_ai
+vec_mladd(vector unsigned short, vector signed short, vector signed short);
+static vector unsigned short __ATTRS_o_ai
+vec_mladd(vector unsigned short, vector unsigned short, vector unsigned short);
+
+static vector signed short __ATTRS_o_ai
+vec_madd(vector signed short __a, vector signed short __b,
+         vector signed short __c) {
+  return vec_mladd(__a, __b, __c);
+}
+
+static vector signed short __ATTRS_o_ai
+vec_madd(vector signed short __a, vector unsigned short __b,
+         vector unsigned short __c) {
+  return vec_mladd(__a, __b, __c);
+}
+
+static vector signed short __ATTRS_o_ai
+vec_madd(vector unsigned short __a, vector signed short __b,
+         vector signed short __c) {
+  return vec_mladd(__a, __b, __c);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_madd(vector unsigned short __a, vector unsigned short __b,
+         vector unsigned short __c) {
+  return vec_mladd(__a, __b, __c);
+}
+
+static vector float __ATTRS_o_ai
+vec_madd(vector float __a, vector float __b, vector float __c) {
+#ifdef __VSX__
+  return __builtin_vsx_xvmaddasp(__a, __b, __c);
+#else
+  return __builtin_altivec_vmaddfp(__a, __b, __c);
+#endif
+}
+
+#ifdef __VSX__
+static vector double __ATTRS_o_ai
+vec_madd(vector double __a, vector double __b, vector double __c) {
+  return __builtin_vsx_xvmaddadp(__a, __b, __c);
+}
+#endif
+
+/* vec_vmaddfp */
+
+static vector float __attribute__((__always_inline__))
+vec_vmaddfp(vector float __a, vector float __b, vector float __c) {
+  return __builtin_altivec_vmaddfp(__a, __b, __c);
+}
+
+/* vec_madds */
+
+static vector signed short __attribute__((__always_inline__))
+vec_madds(vector signed short __a, vector signed short __b,
+          vector signed short __c) {
+  return __builtin_altivec_vmhaddshs(__a, __b, __c);
+}
+
+/* vec_vmhaddshs */
+static vector signed short __attribute__((__always_inline__))
+vec_vmhaddshs(vector signed short __a, vector signed short __b,
+              vector signed short __c) {
+  return __builtin_altivec_vmhaddshs(__a, __b, __c);
+}
+
+/* vec_msub */
+
+#ifdef __VSX__
+static vector float __ATTRS_o_ai
+vec_msub(vector float __a, vector float __b, vector float __c) {
+  return __builtin_vsx_xvmsubasp(__a, __b, __c);
+}
+
+static vector double __ATTRS_o_ai
+vec_msub(vector double __a, vector double __b, vector double __c) {
+  return __builtin_vsx_xvmsubadp(__a, __b, __c);
+}
+#endif
+
+/* vec_max */
+
+static vector signed char __ATTRS_o_ai vec_max(vector signed char __a,
+                                               vector signed char __b) {
+  return __builtin_altivec_vmaxsb(__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai vec_max(vector bool char __a,
+                                               vector signed char __b) {
+  return __builtin_altivec_vmaxsb((vector signed char)__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai vec_max(vector signed char __a,
+                                               vector bool char __b) {
+  return __builtin_altivec_vmaxsb(__a, (vector signed char)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_max(vector unsigned char __a,
+                                                 vector unsigned char __b) {
+  return __builtin_altivec_vmaxub(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_max(vector bool char __a,
+                                                 vector unsigned char __b) {
+  return __builtin_altivec_vmaxub((vector unsigned char)__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_max(vector unsigned char __a,
+                                                 vector bool char __b) {
+  return __builtin_altivec_vmaxub(__a, (vector unsigned char)__b);
+}
+
+static vector short __ATTRS_o_ai vec_max(vector short __a, vector short __b) {
+  return __builtin_altivec_vmaxsh(__a, __b);
+}
+
+static vector short __ATTRS_o_ai vec_max(vector bool short __a,
+                                         vector short __b) {
+  return __builtin_altivec_vmaxsh((vector short)__a, __b);
+}
+
+static vector short __ATTRS_o_ai vec_max(vector short __a,
+                                         vector bool short __b) {
+  return __builtin_altivec_vmaxsh(__a, (vector short)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_max(vector unsigned short __a,
+                                                  vector unsigned short __b) {
+  return __builtin_altivec_vmaxuh(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_max(vector bool short __a,
+                                                  vector unsigned short __b) {
+  return __builtin_altivec_vmaxuh((vector unsigned short)__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_max(vector unsigned short __a,
+                                                  vector bool short __b) {
+  return __builtin_altivec_vmaxuh(__a, (vector unsigned short)__b);
+}
+
+static vector int __ATTRS_o_ai vec_max(vector int __a, vector int __b) {
+  return __builtin_altivec_vmaxsw(__a, __b);
+}
+
+static vector int __ATTRS_o_ai vec_max(vector bool int __a, vector int __b) {
+  return __builtin_altivec_vmaxsw((vector int)__a, __b);
+}
+
+static vector int __ATTRS_o_ai vec_max(vector int __a, vector bool int __b) {
+  return __builtin_altivec_vmaxsw(__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_max(vector unsigned int __a,
+                                                vector unsigned int __b) {
+  return __builtin_altivec_vmaxuw(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_max(vector bool int __a,
+                                                vector unsigned int __b) {
+  return __builtin_altivec_vmaxuw((vector unsigned int)__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_max(vector unsigned int __a,
+                                                vector bool int __b) {
+  return __builtin_altivec_vmaxuw(__a, (vector unsigned int)__b);
+}
+
+#ifdef __POWER8_VECTOR__
+static vector signed long long __ATTRS_o_ai
+vec_max(vector signed long long __a, vector signed long long __b) {
+  return __builtin_altivec_vmaxsd(__a, __b);
+}
+
+static vector signed long long __ATTRS_o_ai
+vec_max(vector bool long long __a, vector signed long long __b) {
+  return __builtin_altivec_vmaxsd((vector signed long long)__a, __b);
+}
+
+static vector signed long long __ATTRS_o_ai vec_max(vector signed long long __a,
+                                                    vector bool long long __b) {
+  return __builtin_altivec_vmaxsd(__a, (vector signed long long)__b);
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_max(vector unsigned long long __a, vector unsigned long long __b) {
+  return __builtin_altivec_vmaxud(__a, __b);
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_max(vector bool long long __a, vector unsigned long long __b) {
+  return __builtin_altivec_vmaxud((vector unsigned long long)__a, __b);
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_max(vector unsigned long long __a, vector bool long long __b) {
+  return __builtin_altivec_vmaxud(__a, (vector unsigned long long)__b);
+}
+#endif
+
+static vector float __ATTRS_o_ai vec_max(vector float __a, vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvmaxsp(__a, __b);
+#else
+  return __builtin_altivec_vmaxfp(__a, __b);
+#endif
+}
+
+#ifdef __VSX__
+static vector double __ATTRS_o_ai vec_max(vector double __a,
+                                          vector double __b) {
+  return __builtin_vsx_xvmaxdp(__a, __b);
+}
+#endif
+
+/* vec_vmaxsb */
+
+static vector signed char __ATTRS_o_ai vec_vmaxsb(vector signed char __a,
+                                                  vector signed char __b) {
+  return __builtin_altivec_vmaxsb(__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai vec_vmaxsb(vector bool char __a,
+                                                  vector signed char __b) {
+  return __builtin_altivec_vmaxsb((vector signed char)__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai vec_vmaxsb(vector signed char __a,
+                                                  vector bool char __b) {
+  return __builtin_altivec_vmaxsb(__a, (vector signed char)__b);
+}
+
+/* vec_vmaxub */
+
+static vector unsigned char __ATTRS_o_ai vec_vmaxub(vector unsigned char __a,
+                                                    vector unsigned char __b) {
+  return __builtin_altivec_vmaxub(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vmaxub(vector bool char __a,
+                                                    vector unsigned char __b) {
+  return __builtin_altivec_vmaxub((vector unsigned char)__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vmaxub(vector unsigned char __a,
+                                                    vector bool char __b) {
+  return __builtin_altivec_vmaxub(__a, (vector unsigned char)__b);
+}
+
+/* vec_vmaxsh */
+
+static vector short __ATTRS_o_ai vec_vmaxsh(vector short __a,
+                                            vector short __b) {
+  return __builtin_altivec_vmaxsh(__a, __b);
+}
+
+static vector short __ATTRS_o_ai vec_vmaxsh(vector bool short __a,
+                                            vector short __b) {
+  return __builtin_altivec_vmaxsh((vector short)__a, __b);
+}
+
+static vector short __ATTRS_o_ai vec_vmaxsh(vector short __a,
+                                            vector bool short __b) {
+  return __builtin_altivec_vmaxsh(__a, (vector short)__b);
+}
+
+/* vec_vmaxuh */
+
+static vector unsigned short __ATTRS_o_ai
+vec_vmaxuh(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_altivec_vmaxuh(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vmaxuh(vector bool short __a, vector unsigned short __b) {
+  return __builtin_altivec_vmaxuh((vector unsigned short)__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vmaxuh(vector unsigned short __a,
+                                                     vector bool short __b) {
+  return __builtin_altivec_vmaxuh(__a, (vector unsigned short)__b);
+}
+
+/* vec_vmaxsw */
+
+static vector int __ATTRS_o_ai vec_vmaxsw(vector int __a, vector int __b) {
+  return __builtin_altivec_vmaxsw(__a, __b);
+}
+
+static vector int __ATTRS_o_ai vec_vmaxsw(vector bool int __a, vector int __b) {
+  return __builtin_altivec_vmaxsw((vector int)__a, __b);
+}
+
+static vector int __ATTRS_o_ai vec_vmaxsw(vector int __a, vector bool int __b) {
+  return __builtin_altivec_vmaxsw(__a, (vector int)__b);
+}
+
+/* vec_vmaxuw */
+
+static vector unsigned int __ATTRS_o_ai vec_vmaxuw(vector unsigned int __a,
+                                                   vector unsigned int __b) {
+  return __builtin_altivec_vmaxuw(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vmaxuw(vector bool int __a,
+                                                   vector unsigned int __b) {
+  return __builtin_altivec_vmaxuw((vector unsigned int)__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vmaxuw(vector unsigned int __a,
+                                                   vector bool int __b) {
+  return __builtin_altivec_vmaxuw(__a, (vector unsigned int)__b);
+}
+
+/* vec_vmaxfp */
+
+static vector float __attribute__((__always_inline__))
+vec_vmaxfp(vector float __a, vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvmaxsp(__a, __b);
+#else
+  return __builtin_altivec_vmaxfp(__a, __b);
+#endif
+}
+
+/* vec_mergeh */
+
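+/* In the vec_perm control vectors below, selector bytes 0x00-0x0F take bytes
+ * from __a and 0x10-0x1F take bytes from __b; vec_mergeh interleaves elements
+ * from the high halves of the two operands. */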
+static vector signed char __ATTRS_o_ai vec_mergeh(vector signed char __a,
+                                                  vector signed char __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x10, 0x01, 0x11, 0x02, 0x12,
+                                         0x03, 0x13, 0x04, 0x14, 0x05, 0x15,
+                                         0x06, 0x16, 0x07, 0x17));
+}
+
+static vector unsigned char __ATTRS_o_ai vec_mergeh(vector unsigned char __a,
+                                                    vector unsigned char __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x10, 0x01, 0x11, 0x02, 0x12,
+                                         0x03, 0x13, 0x04, 0x14, 0x05, 0x15,
+                                         0x06, 0x16, 0x07, 0x17));
+}
+
+static vector bool char __ATTRS_o_ai vec_mergeh(vector bool char __a,
+                                                vector bool char __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x10, 0x01, 0x11, 0x02, 0x12,
+                                         0x03, 0x13, 0x04, 0x14, 0x05, 0x15,
+                                         0x06, 0x16, 0x07, 0x17));
+}
+
+static vector short __ATTRS_o_ai vec_mergeh(vector short __a,
+                                            vector short __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x10, 0x11, 0x02, 0x03,
+                                         0x12, 0x13, 0x04, 0x05, 0x14, 0x15,
+                                         0x06, 0x07, 0x16, 0x17));
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_mergeh(vector unsigned short __a, vector unsigned short __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x10, 0x11, 0x02, 0x03,
+                                         0x12, 0x13, 0x04, 0x05, 0x14, 0x15,
+                                         0x06, 0x07, 0x16, 0x17));
+}
+
+static vector bool short __ATTRS_o_ai vec_mergeh(vector bool short __a,
+                                                 vector bool short __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x10, 0x11, 0x02, 0x03,
+                                         0x12, 0x13, 0x04, 0x05, 0x14, 0x15,
+                                         0x06, 0x07, 0x16, 0x17));
+}
+
+static vector pixel __ATTRS_o_ai vec_mergeh(vector pixel __a,
+                                            vector pixel __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x10, 0x11, 0x02, 0x03,
+                                         0x12, 0x13, 0x04, 0x05, 0x14, 0x15,
+                                         0x06, 0x07, 0x16, 0x17));
+}
+
+static vector int __ATTRS_o_ai vec_mergeh(vector int __a, vector int __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x10, 0x11,
+                                         0x12, 0x13, 0x04, 0x05, 0x06, 0x07,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+
+static vector unsigned int __ATTRS_o_ai vec_mergeh(vector unsigned int __a,
+                                                   vector unsigned int __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x10, 0x11,
+                                         0x12, 0x13, 0x04, 0x05, 0x06, 0x07,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+
+static vector bool int __ATTRS_o_ai vec_mergeh(vector bool int __a,
+                                               vector bool int __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x10, 0x11,
+                                         0x12, 0x13, 0x04, 0x05, 0x06, 0x07,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+
+static vector float __ATTRS_o_ai vec_mergeh(vector float __a,
+                                            vector float __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x10, 0x11,
+                                         0x12, 0x13, 0x04, 0x05, 0x06, 0x07,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+
+#ifdef __VSX__
+static vector signed long long __ATTRS_o_ai
+vec_mergeh(vector signed long long __a, vector signed long long __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03,
+                                         0x04, 0x05, 0x06, 0x07,
+                                         0x10, 0x11, 0x12, 0x13,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+
+static vector signed long long __ATTRS_o_ai
+vec_mergeh(vector signed long long __a, vector bool long long __b) {
+  return vec_perm(__a, (vector signed long long)__b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03,
+                                         0x04, 0x05, 0x06, 0x07,
+                                         0x10, 0x11, 0x12, 0x13,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+
+static vector signed long long __ATTRS_o_ai
+vec_mergeh(vector bool long long __a, vector signed long long __b) {
+  return vec_perm((vector signed long long)__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03,
+                                         0x04, 0x05, 0x06, 0x07,
+                                         0x10, 0x11, 0x12, 0x13,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_mergeh(vector unsigned long long __a, vector unsigned long long __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03,
+                                         0x04, 0x05, 0x06, 0x07,
+                                         0x10, 0x11, 0x12, 0x13,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_mergeh(vector unsigned long long __a, vector bool long long __b) {
+  return vec_perm(__a, (vector unsigned long long)__b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03,
+                                         0x04, 0x05, 0x06, 0x07,
+                                         0x10, 0x11, 0x12, 0x13,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_mergeh(vector bool long long __a, vector unsigned long long __b) {
+  return vec_perm((vector unsigned long long)__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03,
+                                         0x04, 0x05, 0x06, 0x07,
+                                         0x10, 0x11, 0x12, 0x13,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+
+static vector bool long long __ATTRS_o_ai
+vec_mergeh(vector bool long long __a, vector bool long long __b) {
+  return vec_perm(__a, __b,
+                 (vector unsigned char)(0x00, 0x01, 0x02, 0x03,
+                                        0x04, 0x05, 0x06, 0x07,
+                                        0x10, 0x11, 0x12, 0x13,
+                                        0x14, 0x15, 0x16, 0x17));
+}
+
+static vector double __ATTRS_o_ai vec_mergeh(vector double __a,
+                                             vector double __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03,
+                                         0x04, 0x05, 0x06, 0x07,
+                                         0x10, 0x11, 0x12, 0x13,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+static vector double __ATTRS_o_ai vec_mergeh(vector double __a,
+                                             vector bool long long __b) {
+  return vec_perm(__a, (vector double)__b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03,
+                                         0x04, 0x05, 0x06, 0x07,
+                                         0x10, 0x11, 0x12, 0x13,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+static vector double __ATTRS_o_ai vec_mergeh(vector bool long long __a,
+                                             vector double __b) {
+  return vec_perm((vector double)__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03,
+                                         0x04, 0x05, 0x06, 0x07,
+                                         0x10, 0x11, 0x12, 0x13,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+#endif
+
+/* vec_vmrghb */
+
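+/* This #define (and the analogous ones for the other vmrgh/vmrgl builtins)
+ * redirects uses of the builtin name to the type-generic wrapper. */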
+#define __builtin_altivec_vmrghb vec_vmrghb
+
+static vector signed char __ATTRS_o_ai vec_vmrghb(vector signed char __a,
+                                                  vector signed char __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x10, 0x01, 0x11, 0x02, 0x12,
+                                         0x03, 0x13, 0x04, 0x14, 0x05, 0x15,
+                                         0x06, 0x16, 0x07, 0x17));
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vmrghb(vector unsigned char __a,
+                                                    vector unsigned char __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x10, 0x01, 0x11, 0x02, 0x12,
+                                         0x03, 0x13, 0x04, 0x14, 0x05, 0x15,
+                                         0x06, 0x16, 0x07, 0x17));
+}
+
+static vector bool char __ATTRS_o_ai vec_vmrghb(vector bool char __a,
+                                                vector bool char __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x10, 0x01, 0x11, 0x02, 0x12,
+                                         0x03, 0x13, 0x04, 0x14, 0x05, 0x15,
+                                         0x06, 0x16, 0x07, 0x17));
+}
+
+/* vec_vmrghh */
+
+#define __builtin_altivec_vmrghh vec_vmrghh
+
+static vector short __ATTRS_o_ai vec_vmrghh(vector short __a,
+                                            vector short __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x10, 0x11, 0x02, 0x03,
+                                         0x12, 0x13, 0x04, 0x05, 0x14, 0x15,
+                                         0x06, 0x07, 0x16, 0x17));
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vmrghh(vector unsigned short __a, vector unsigned short __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x10, 0x11, 0x02, 0x03,
+                                         0x12, 0x13, 0x04, 0x05, 0x14, 0x15,
+                                         0x06, 0x07, 0x16, 0x17));
+}
+
+static vector bool short __ATTRS_o_ai vec_vmrghh(vector bool short __a,
+                                                 vector bool short __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x10, 0x11, 0x02, 0x03,
+                                         0x12, 0x13, 0x04, 0x05, 0x14, 0x15,
+                                         0x06, 0x07, 0x16, 0x17));
+}
+
+static vector pixel __ATTRS_o_ai vec_vmrghh(vector pixel __a,
+                                            vector pixel __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x10, 0x11, 0x02, 0x03,
+                                         0x12, 0x13, 0x04, 0x05, 0x14, 0x15,
+                                         0x06, 0x07, 0x16, 0x17));
+}
+
+/* vec_vmrghw */
+
+#define __builtin_altivec_vmrghw vec_vmrghw
+
+static vector int __ATTRS_o_ai vec_vmrghw(vector int __a, vector int __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x10, 0x11,
+                                         0x12, 0x13, 0x04, 0x05, 0x06, 0x07,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vmrghw(vector unsigned int __a,
+                                                   vector unsigned int __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x10, 0x11,
+                                         0x12, 0x13, 0x04, 0x05, 0x06, 0x07,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+
+static vector bool int __ATTRS_o_ai vec_vmrghw(vector bool int __a,
+                                               vector bool int __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x10, 0x11,
+                                         0x12, 0x13, 0x04, 0x05, 0x06, 0x07,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+
+static vector float __ATTRS_o_ai vec_vmrghw(vector float __a,
+                                            vector float __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x10, 0x11,
+                                         0x12, 0x13, 0x04, 0x05, 0x06, 0x07,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+
+/* vec_mergel */
+
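+/* Selector bytes follow the same convention as in vec_mergeh; vec_mergel
+ * interleaves elements from the low halves of the two operands. */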
+static vector signed char __ATTRS_o_ai vec_mergel(vector signed char __a,
+                                                  vector signed char __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x18, 0x09, 0x19, 0x0A, 0x1A,
+                                         0x0B, 0x1B, 0x0C, 0x1C, 0x0D, 0x1D,
+                                         0x0E, 0x1E, 0x0F, 0x1F));
+}
+
+static vector unsigned char __ATTRS_o_ai vec_mergel(vector unsigned char __a,
+                                                    vector unsigned char __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x18, 0x09, 0x19, 0x0A, 0x1A,
+                                         0x0B, 0x1B, 0x0C, 0x1C, 0x0D, 0x1D,
+                                         0x0E, 0x1E, 0x0F, 0x1F));
+}
+
+static vector bool char __ATTRS_o_ai vec_mergel(vector bool char __a,
+                                                vector bool char __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x18, 0x09, 0x19, 0x0A, 0x1A,
+                                         0x0B, 0x1B, 0x0C, 0x1C, 0x0D, 0x1D,
+                                         0x0E, 0x1E, 0x0F, 0x1F));
+}
+
+static vector short __ATTRS_o_ai vec_mergel(vector short __a,
+                                            vector short __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x18, 0x19, 0x0A, 0x0B,
+                                         0x1A, 0x1B, 0x0C, 0x0D, 0x1C, 0x1D,
+                                         0x0E, 0x0F, 0x1E, 0x1F));
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_mergel(vector unsigned short __a, vector unsigned short __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x18, 0x19, 0x0A, 0x0B,
+                                         0x1A, 0x1B, 0x0C, 0x0D, 0x1C, 0x1D,
+                                         0x0E, 0x0F, 0x1E, 0x1F));
+}
+
+static vector bool short __ATTRS_o_ai vec_mergel(vector bool short __a,
+                                                 vector bool short __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x18, 0x19, 0x0A, 0x0B,
+                                         0x1A, 0x1B, 0x0C, 0x0D, 0x1C, 0x1D,
+                                         0x0E, 0x0F, 0x1E, 0x1F));
+}
+
+static vector pixel __ATTRS_o_ai vec_mergel(vector pixel __a,
+                                            vector pixel __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x18, 0x19, 0x0A, 0x0B,
+                                         0x1A, 0x1B, 0x0C, 0x0D, 0x1C, 0x1D,
+                                         0x0E, 0x0F, 0x1E, 0x1F));
+}
+
+static vector int __ATTRS_o_ai vec_mergel(vector int __a, vector int __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19,
+                                         0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+static vector unsigned int __ATTRS_o_ai vec_mergel(vector unsigned int __a,
+                                                   vector unsigned int __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19,
+                                         0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+static vector bool int __ATTRS_o_ai vec_mergel(vector bool int __a,
+                                               vector bool int __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19,
+                                         0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+static vector float __ATTRS_o_ai vec_mergel(vector float __a,
+                                            vector float __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19,
+                                         0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+#ifdef __VSX__
+static vector signed long long __ATTRS_o_ai
+vec_mergel(vector signed long long __a, vector signed long long __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B,
+                                         0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x18, 0x19, 0x1A, 0x1B,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+static vector signed long long __ATTRS_o_ai
+vec_mergel(vector signed long long __a, vector bool long long __b) {
+  return vec_perm(__a, (vector signed long long)__b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B,
+                                         0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x18, 0x19, 0x1A, 0x1B,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+static vector signed long long __ATTRS_o_ai
+vec_mergel(vector bool long long __a, vector signed long long __b) {
+  return vec_perm((vector signed long long)__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B,
+                                         0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x18, 0x19, 0x1A, 0x1B,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+static vector unsigned long long __ATTRS_o_ai
+vec_mergel(vector unsigned long long __a, vector unsigned long long __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B,
+                                         0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x18, 0x19, 0x1A, 0x1B,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+static vector unsigned long long __ATTRS_o_ai
+vec_mergel(vector unsigned long long __a, vector bool long long __b) {
+  return vec_perm(__a, (vector unsigned long long)__b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B,
+                                         0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x18, 0x19, 0x1A, 0x1B,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+static vector unsigned long long __ATTRS_o_ai
+vec_mergel(vector bool long long __a, vector unsigned long long __b) {
+  return vec_perm((vector unsigned long long)__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B,
+                                         0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x18, 0x19, 0x1A, 0x1B,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+static vector bool long long __ATTRS_o_ai
+vec_mergel(vector bool long long __a, vector bool long long __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B,
+                                         0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x18, 0x19, 0x1A, 0x1B,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+static vector double __ATTRS_o_ai
+vec_mergel(vector double __a, vector double __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B,
+                                         0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x18, 0x19, 0x1A, 0x1B,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+static vector double __ATTRS_o_ai
+vec_mergel(vector double __a, vector bool long long __b) {
+  return vec_perm(__a, (vector double)__b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B,
+                                         0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x18, 0x19, 0x1A, 0x1B,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+static vector double __ATTRS_o_ai
+vec_mergel(vector bool long long __a, vector double __b) {
+  return vec_perm((vector double)__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B,
+                                         0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x18, 0x19, 0x1A, 0x1B,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+#endif
+
+/* vec_vmrglb */
+
+#define __builtin_altivec_vmrglb vec_vmrglb
+
+static vector signed char __ATTRS_o_ai vec_vmrglb(vector signed char __a,
+                                                  vector signed char __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x18, 0x09, 0x19, 0x0A, 0x1A,
+                                         0x0B, 0x1B, 0x0C, 0x1C, 0x0D, 0x1D,
+                                         0x0E, 0x1E, 0x0F, 0x1F));
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vmrglb(vector unsigned char __a,
+                                                    vector unsigned char __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x18, 0x09, 0x19, 0x0A, 0x1A,
+                                         0x0B, 0x1B, 0x0C, 0x1C, 0x0D, 0x1D,
+                                         0x0E, 0x1E, 0x0F, 0x1F));
+}
+
+static vector bool char __ATTRS_o_ai vec_vmrglb(vector bool char __a,
+                                                vector bool char __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x18, 0x09, 0x19, 0x0A, 0x1A,
+                                         0x0B, 0x1B, 0x0C, 0x1C, 0x0D, 0x1D,
+                                         0x0E, 0x1E, 0x0F, 0x1F));
+}
+
+/* vec_vmrglh */
+
+#define __builtin_altivec_vmrglh vec_vmrglh
+
+static vector short __ATTRS_o_ai vec_vmrglh(vector short __a,
+                                            vector short __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x18, 0x19, 0x0A, 0x0B,
+                                         0x1A, 0x1B, 0x0C, 0x0D, 0x1C, 0x1D,
+                                         0x0E, 0x0F, 0x1E, 0x1F));
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vmrglh(vector unsigned short __a, vector unsigned short __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x18, 0x19, 0x0A, 0x0B,
+                                         0x1A, 0x1B, 0x0C, 0x0D, 0x1C, 0x1D,
+                                         0x0E, 0x0F, 0x1E, 0x1F));
+}
+
+static vector bool short __ATTRS_o_ai vec_vmrglh(vector bool short __a,
+                                                 vector bool short __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x18, 0x19, 0x0A, 0x0B,
+                                         0x1A, 0x1B, 0x0C, 0x0D, 0x1C, 0x1D,
+                                         0x0E, 0x0F, 0x1E, 0x1F));
+}
+
+static vector pixel __ATTRS_o_ai vec_vmrglh(vector pixel __a,
+                                            vector pixel __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x18, 0x19, 0x0A, 0x0B,
+                                         0x1A, 0x1B, 0x0C, 0x0D, 0x1C, 0x1D,
+                                         0x0E, 0x0F, 0x1E, 0x1F));
+}
+
+/* vec_vmrglw */
+
+#define __builtin_altivec_vmrglw vec_vmrglw
+
+static vector int __ATTRS_o_ai vec_vmrglw(vector int __a, vector int __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19,
+                                         0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vmrglw(vector unsigned int __a,
+                                                   vector unsigned int __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19,
+                                         0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+static vector bool int __ATTRS_o_ai vec_vmrglw(vector bool int __a,
+                                               vector bool int __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19,
+                                         0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+static vector float __ATTRS_o_ai vec_vmrglw(vector float __a,
+                                            vector float __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19,
+                                         0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+#ifdef __POWER8_VECTOR__
+/* vec_mergee */
+
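+/* vec_mergee gathers the even-numbered word elements of __a and __b;
+ * vec_mergeo (below) gathers the odd-numbered ones. */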
+static vector bool int __ATTRS_o_ai
+vec_mergee(vector bool int __a, vector bool int __b) {
+  return vec_perm(__a, __b, (vector unsigned char)
+                  (0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+                   0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B));
+}
+
+static vector signed int __ATTRS_o_ai
+vec_mergee(vector signed int __a, vector signed int __b) {
+  return vec_perm(__a, __b, (vector unsigned char)
+                  (0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+                   0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B));
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_mergee(vector unsigned int __a, vector unsigned int __b) {
+  return vec_perm(__a, __b, (vector unsigned char)
+                  (0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+                   0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B));
+}
+
+/* vec_mergeo */
+
+static vector bool int __ATTRS_o_ai
+vec_mergeo(vector bool int __a, vector bool int __b) {
+  return vec_perm(__a, __b, (vector unsigned char)
+                  (0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17,
+                   0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+static vector signed int __ATTRS_o_ai
+vec_mergeo(vector signed int __a, vector signed int __b) {
+  return vec_perm(__a, __b, (vector unsigned char)
+                  (0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17,
+                   0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_mergeo(vector unsigned int __a, vector unsigned int __b) {
+  return vec_perm(__a, __b, (vector unsigned char)
+                  (0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17,
+                   0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+#endif
+
+/* vec_mfvscr */
+
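+/* Reads the Vector Status and Control Register (VSCR). */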
+static vector unsigned short __attribute__((__always_inline__))
+vec_mfvscr(void) {
+  return __builtin_altivec_mfvscr();
+}
+
+/* vec_min */
+
+static vector signed char __ATTRS_o_ai vec_min(vector signed char __a,
+                                               vector signed char __b) {
+  return __builtin_altivec_vminsb(__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai vec_min(vector bool char __a,
+                                               vector signed char __b) {
+  return __builtin_altivec_vminsb((vector signed char)__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai vec_min(vector signed char __a,
+                                               vector bool char __b) {
+  return __builtin_altivec_vminsb(__a, (vector signed char)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_min(vector unsigned char __a,
+                                                 vector unsigned char __b) {
+  return __builtin_altivec_vminub(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_min(vector bool char __a,
+                                                 vector unsigned char __b) {
+  return __builtin_altivec_vminub((vector unsigned char)__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_min(vector unsigned char __a,
+                                                 vector bool char __b) {
+  return __builtin_altivec_vminub(__a, (vector unsigned char)__b);
+}
+
+static vector short __ATTRS_o_ai vec_min(vector short __a, vector short __b) {
+  return __builtin_altivec_vminsh(__a, __b);
+}
+
+static vector short __ATTRS_o_ai vec_min(vector bool short __a,
+                                         vector short __b) {
+  return __builtin_altivec_vminsh((vector short)__a, __b);
+}
+
+static vector short __ATTRS_o_ai vec_min(vector short __a,
+                                         vector bool short __b) {
+  return __builtin_altivec_vminsh(__a, (vector short)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_min(vector unsigned short __a,
+                                                  vector unsigned short __b) {
+  return __builtin_altivec_vminuh(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_min(vector bool short __a,
+                                                  vector unsigned short __b) {
+  return __builtin_altivec_vminuh((vector unsigned short)__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_min(vector unsigned short __a,
+                                                  vector bool short __b) {
+  return __builtin_altivec_vminuh(__a, (vector unsigned short)__b);
+}
+
+static vector int __ATTRS_o_ai vec_min(vector int __a, vector int __b) {
+  return __builtin_altivec_vminsw(__a, __b);
+}
+
+static vector int __ATTRS_o_ai vec_min(vector bool int __a, vector int __b) {
+  return __builtin_altivec_vminsw((vector int)__a, __b);
+}
+
+static vector int __ATTRS_o_ai vec_min(vector int __a, vector bool int __b) {
+  return __builtin_altivec_vminsw(__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_min(vector unsigned int __a,
+                                                vector unsigned int __b) {
+  return __builtin_altivec_vminuw(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_min(vector bool int __a,
+                                                vector unsigned int __b) {
+  return __builtin_altivec_vminuw((vector unsigned int)__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_min(vector unsigned int __a,
+                                                vector bool int __b) {
+  return __builtin_altivec_vminuw(__a, (vector unsigned int)__b);
+}
+
+#ifdef __POWER8_VECTOR__
+static vector signed long long __ATTRS_o_ai
+vec_min(vector signed long long __a, vector signed long long __b) {
+  return __builtin_altivec_vminsd(__a, __b);
+}
+
+static vector signed long long __ATTRS_o_ai
+vec_min(vector bool long long __a, vector signed long long __b) {
+  return __builtin_altivec_vminsd((vector signed long long)__a, __b);
+}
+
+static vector signed long long __ATTRS_o_ai vec_min(vector signed long long __a,
+                                                    vector bool long long __b) {
+  return __builtin_altivec_vminsd(__a, (vector signed long long)__b);
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_min(vector unsigned long long __a, vector unsigned long long __b) {
+  return __builtin_altivec_vminud(__a, __b);
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_min(vector bool long long __a, vector unsigned long long __b) {
+  return __builtin_altivec_vminud((vector unsigned long long)__a, __b);
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_min(vector unsigned long long __a, vector bool long long __b) {
+  return __builtin_altivec_vminud(__a, (vector unsigned long long)__b);
+}
+#endif
+
+static vector float __ATTRS_o_ai vec_min(vector float __a, vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvminsp(__a, __b);
+#else
+  return __builtin_altivec_vminfp(__a, __b);
+#endif
+}
+
+#ifdef __VSX__
+static vector double __ATTRS_o_ai vec_min(vector double __a,
+                                          vector double __b) {
+  return __builtin_vsx_xvmindp(__a, __b);
+}
+#endif
+
+/* vec_vminsb */
+
+static vector signed char __ATTRS_o_ai vec_vminsb(vector signed char __a,
+                                                  vector signed char __b) {
+  return __builtin_altivec_vminsb(__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai vec_vminsb(vector bool char __a,
+                                                  vector signed char __b) {
+  return __builtin_altivec_vminsb((vector signed char)__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai vec_vminsb(vector signed char __a,
+                                                  vector bool char __b) {
+  return __builtin_altivec_vminsb(__a, (vector signed char)__b);
+}
+
+/* vec_vminub */
+
+static vector unsigned char __ATTRS_o_ai vec_vminub(vector unsigned char __a,
+                                                    vector unsigned char __b) {
+  return __builtin_altivec_vminub(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vminub(vector bool char __a,
+                                                    vector unsigned char __b) {
+  return __builtin_altivec_vminub((vector unsigned char)__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vminub(vector unsigned char __a,
+                                                    vector bool char __b) {
+  return __builtin_altivec_vminub(__a, (vector unsigned char)__b);
+}
+
+/* vec_vminsh */
+
+static vector short __ATTRS_o_ai vec_vminsh(vector short __a,
+                                            vector short __b) {
+  return __builtin_altivec_vminsh(__a, __b);
+}
+
+static vector short __ATTRS_o_ai vec_vminsh(vector bool short __a,
+                                            vector short __b) {
+  return __builtin_altivec_vminsh((vector short)__a, __b);
+}
+
+static vector short __ATTRS_o_ai vec_vminsh(vector short __a,
+                                            vector bool short __b) {
+  return __builtin_altivec_vminsh(__a, (vector short)__b);
+}
+
+/* vec_vminuh */
+
+static vector unsigned short __ATTRS_o_ai
+vec_vminuh(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_altivec_vminuh(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vminuh(vector bool short __a, vector unsigned short __b) {
+  return __builtin_altivec_vminuh((vector unsigned short)__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vminuh(vector unsigned short __a,
+                                                     vector bool short __b) {
+  return __builtin_altivec_vminuh(__a, (vector unsigned short)__b);
+}
+
+/* vec_vminsw */
+
+static vector int __ATTRS_o_ai vec_vminsw(vector int __a, vector int __b) {
+  return __builtin_altivec_vminsw(__a, __b);
+}
+
+static vector int __ATTRS_o_ai vec_vminsw(vector bool int __a, vector int __b) {
+  return __builtin_altivec_vminsw((vector int)__a, __b);
+}
+
+static vector int __ATTRS_o_ai vec_vminsw(vector int __a, vector bool int __b) {
+  return __builtin_altivec_vminsw(__a, (vector int)__b);
+}
+
+/* vec_vminuw */
+
+static vector unsigned int __ATTRS_o_ai vec_vminuw(vector unsigned int __a,
+                                                   vector unsigned int __b) {
+  return __builtin_altivec_vminuw(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vminuw(vector bool int __a,
+                                                   vector unsigned int __b) {
+  return __builtin_altivec_vminuw((vector unsigned int)__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vminuw(vector unsigned int __a,
+                                                   vector bool int __b) {
+  return __builtin_altivec_vminuw(__a, (vector unsigned int)__b);
+}
+
+/* vec_vminfp */
+
+static vector float __attribute__((__always_inline__))
+vec_vminfp(vector float __a, vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvminsp(__a, __b);
+#else
+  return __builtin_altivec_vminfp(__a, __b);
+#endif
+}
+
+/* vec_mladd */
+
+#define __builtin_altivec_vmladduhm vec_mladd
+
+static vector short __ATTRS_o_ai vec_mladd(vector short __a, vector short __b,
+                                           vector short __c) {
+  return __a * __b + __c;
+}
+
+static vector short __ATTRS_o_ai vec_mladd(vector short __a,
+                                           vector unsigned short __b,
+                                           vector unsigned short __c) {
+  return __a * (vector short)__b + (vector short)__c;
+}
+
+static vector short __ATTRS_o_ai vec_mladd(vector unsigned short __a,
+                                           vector short __b, vector short __c) {
+  return (vector short)__a * __b + __c;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_mladd(vector unsigned short __a,
+                                                    vector unsigned short __b,
+                                                    vector unsigned short __c) {
+  return __a * __b + __c;
+}
+
+/* vec_vmladduhm */
+
+static vector short __ATTRS_o_ai vec_vmladduhm(vector short __a,
+                                               vector short __b,
+                                               vector short __c) {
+  return __a * __b + __c;
+}
+
+static vector short __ATTRS_o_ai vec_vmladduhm(vector short __a,
+                                               vector unsigned short __b,
+                                               vector unsigned short __c) {
+  return __a * (vector short)__b + (vector short)__c;
+}
+
+static vector short __ATTRS_o_ai vec_vmladduhm(vector unsigned short __a,
+                                               vector short __b,
+                                               vector short __c) {
+  return (vector short)__a * __b + __c;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vmladduhm(vector unsigned short __a, vector unsigned short __b,
+              vector unsigned short __c) {
+  return __a * __b + __c;
+}
+
+/* vec_mradds */
+
+static vector short __attribute__((__always_inline__))
+vec_mradds(vector short __a, vector short __b, vector short __c) {
+  return __builtin_altivec_vmhraddshs(__a, __b, __c);
+}
+
+/* vec_vmhraddshs */
+
+static vector short __attribute__((__always_inline__))
+vec_vmhraddshs(vector short __a, vector short __b, vector short __c) {
+  return __builtin_altivec_vmhraddshs(__a, __b, __c);
+}
+
+/* vec_msum */
+
+static vector int __ATTRS_o_ai vec_msum(vector signed char __a,
+                                        vector unsigned char __b,
+                                        vector int __c) {
+  return __builtin_altivec_vmsummbm(__a, __b, __c);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_msum(vector unsigned char __a,
+                                                 vector unsigned char __b,
+                                                 vector unsigned int __c) {
+  return __builtin_altivec_vmsumubm(__a, __b, __c);
+}
+
+static vector int __ATTRS_o_ai vec_msum(vector short __a, vector short __b,
+                                        vector int __c) {
+  return __builtin_altivec_vmsumshm(__a, __b, __c);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_msum(vector unsigned short __a,
+                                                 vector unsigned short __b,
+                                                 vector unsigned int __c) {
+  return __builtin_altivec_vmsumuhm(__a, __b, __c);
+}
+
+/* vec_vmsummbm */
+
+static vector int __attribute__((__always_inline__))
+vec_vmsummbm(vector signed char __a, vector unsigned char __b, vector int __c) {
+  return __builtin_altivec_vmsummbm(__a, __b, __c);
+}
+
+/* vec_vmsumubm */
+
+static vector unsigned int __attribute__((__always_inline__))
+vec_vmsumubm(vector unsigned char __a, vector unsigned char __b,
+             vector unsigned int __c) {
+  return __builtin_altivec_vmsumubm(__a, __b, __c);
+}
+
+/* vec_vmsumshm */
+
+static vector int __attribute__((__always_inline__))
+vec_vmsumshm(vector short __a, vector short __b, vector int __c) {
+  return __builtin_altivec_vmsumshm(__a, __b, __c);
+}
+
+/* vec_vmsumuhm */
+
+static vector unsigned int __attribute__((__always_inline__))
+vec_vmsumuhm(vector unsigned short __a, vector unsigned short __b,
+             vector unsigned int __c) {
+  return __builtin_altivec_vmsumuhm(__a, __b, __c);
+}
+
+/* vec_msums */
+
+static vector int __ATTRS_o_ai vec_msums(vector short __a, vector short __b,
+                                         vector int __c) {
+  return __builtin_altivec_vmsumshs(__a, __b, __c);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_msums(vector unsigned short __a,
+                                                  vector unsigned short __b,
+                                                  vector unsigned int __c) {
+  return __builtin_altivec_vmsumuhs(__a, __b, __c);
+}
+
+/* vec_vmsumshs */
+
+static vector int __attribute__((__always_inline__))
+vec_vmsumshs(vector short __a, vector short __b, vector int __c) {
+  return __builtin_altivec_vmsumshs(__a, __b, __c);
+}
+
+/* vec_vmsumuhs */
+
+static vector unsigned int __attribute__((__always_inline__))
+vec_vmsumuhs(vector unsigned short __a, vector unsigned short __b,
+             vector unsigned int __c) {
+  return __builtin_altivec_vmsumuhs(__a, __b, __c);
+}
+
+/* vec_mtvscr */
+
+static void __ATTRS_o_ai vec_mtvscr(vector signed char __a) {
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static void __ATTRS_o_ai vec_mtvscr(vector unsigned char __a) {
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static void __ATTRS_o_ai vec_mtvscr(vector bool char __a) {
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static void __ATTRS_o_ai vec_mtvscr(vector short __a) {
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static void __ATTRS_o_ai vec_mtvscr(vector unsigned short __a) {
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static void __ATTRS_o_ai vec_mtvscr(vector bool short __a) {
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static void __ATTRS_o_ai vec_mtvscr(vector pixel __a) {
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static void __ATTRS_o_ai vec_mtvscr(vector int __a) {
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static void __ATTRS_o_ai vec_mtvscr(vector unsigned int __a) {
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static void __ATTRS_o_ai vec_mtvscr(vector bool int __a) {
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static void __ATTRS_o_ai vec_mtvscr(vector float __a) {
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+/* vec_mul */
+
+/* Integer vector multiplication is implemented by multiplying the odd and
+   even elements separately, truncating the results, and merging them into
+   the result vector.
+*/
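+/* A minimal usage sketch (illustrative only, assuming the parenthesized
+   AltiVec literal syntax used elsewhere in this header): because the
+   multiplication is element-wise and modular, products wider than the
+   element type are truncated.
+
+     vector unsigned short a = (vector unsigned short)(256);  // splat 256
+     vector unsigned short c = vec_mul(a, a);
+     // every element of c is 0: 256 * 256 == 65536, which truncates to 0
+     // when stored back into a 16-bit element
+*/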
+static vector signed char __ATTRS_o_ai vec_mul(vector signed char __a,
+                                               vector signed char __b) {
+  return __a * __b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_mul(vector unsigned char __a,
+                                                 vector unsigned char __b) {
+  return __a * __b;
+}
+
+static vector signed short __ATTRS_o_ai vec_mul(vector signed short __a,
+                                                vector signed short __b) {
+  return __a * __b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_mul(vector unsigned short __a,
+                                                  vector unsigned short __b) {
+  return __a * __b;
+}
+
+static vector signed int __ATTRS_o_ai vec_mul(vector signed int __a,
+                                              vector signed int __b) {
+  return __a * __b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_mul(vector unsigned int __a,
+                                                vector unsigned int __b) {
+  return __a * __b;
+}
+
+#ifdef __VSX__
+static vector signed long long __ATTRS_o_ai
+vec_mul(vector signed long long __a, vector signed long long __b) {
+  return __a * __b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_mul(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a * __b;
+}
+#endif
+
+static vector float __ATTRS_o_ai vec_mul(vector float __a, vector float __b) {
+  return __a * __b;
+}
+
+#ifdef __VSX__
+static vector double __ATTRS_o_ai
+vec_mul(vector double __a, vector double __b) {
+  return __a * __b;
+}
+#endif
+
+/* The vmulos* and vmules* instructions have a big-endian bias, so
+   we must reverse the meaning of "even" and "odd" for little endian.  */
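+/* A minimal sketch of the intended semantics (illustrative only): after the
+   swaps below, "even" and "odd" refer to element positions as seen in C
+   source order on either endianness.
+
+     vector short a = (vector short)(1, 2, 3, 4, 5, 6, 7, 8);
+     vector short b = (vector short)(10, 20, 30, 40, 50, 60, 70, 80);
+     vector int   e = vec_mule(a, b);  // (1*10, 3*30, 5*50, 7*70)
+     vector int   o = vec_mulo(a, b);  // (2*20, 4*40, 6*60, 8*80)
+*/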
+
+/* vec_mule */
+
+static vector short __ATTRS_o_ai vec_mule(vector signed char __a,
+                                          vector signed char __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulosb(__a, __b);
+#else
+  return __builtin_altivec_vmulesb(__a, __b);
+#endif
+}
+
+static vector unsigned short __ATTRS_o_ai vec_mule(vector unsigned char __a,
+                                                   vector unsigned char __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmuloub(__a, __b);
+#else
+  return __builtin_altivec_vmuleub(__a, __b);
+#endif
+}
+
+static vector int __ATTRS_o_ai vec_mule(vector short __a, vector short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulosh(__a, __b);
+#else
+  return __builtin_altivec_vmulesh(__a, __b);
+#endif
+}
+
+static vector unsigned int __ATTRS_o_ai vec_mule(vector unsigned short __a,
+                                                 vector unsigned short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulouh(__a, __b);
+#else
+  return __builtin_altivec_vmuleuh(__a, __b);
+#endif
+}
+
+#ifdef __POWER8_VECTOR__
+static vector signed long long __ATTRS_o_ai vec_mule(vector signed int __a,
+                                                     vector signed int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulosw(__a, __b);
+#else
+  return __builtin_altivec_vmulesw(__a, __b);
+#endif
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_mule(vector unsigned int __a, vector unsigned int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulouw(__a, __b);
+#else
+  return __builtin_altivec_vmuleuw(__a, __b);
+#endif
+}
+#endif
+
+/* vec_vmulesb */
+
+static vector short __attribute__((__always_inline__))
+vec_vmulesb(vector signed char __a, vector signed char __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulosb(__a, __b);
+#else
+  return __builtin_altivec_vmulesb(__a, __b);
+#endif
+}
+
+/* vec_vmuleub */
+
+static vector unsigned short __attribute__((__always_inline__))
+vec_vmuleub(vector unsigned char __a, vector unsigned char __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmuloub(__a, __b);
+#else
+  return __builtin_altivec_vmuleub(__a, __b);
+#endif
+}
+
+/* vec_vmulesh */
+
+static vector int __attribute__((__always_inline__))
+vec_vmulesh(vector short __a, vector short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulosh(__a, __b);
+#else
+  return __builtin_altivec_vmulesh(__a, __b);
+#endif
+}
+
+/* vec_vmuleuh */
+
+static vector unsigned int __attribute__((__always_inline__))
+vec_vmuleuh(vector unsigned short __a, vector unsigned short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulouh(__a, __b);
+#else
+  return __builtin_altivec_vmuleuh(__a, __b);
+#endif
+}
+
+/* vec_mulo */
+
+static vector short __ATTRS_o_ai vec_mulo(vector signed char __a,
+                                          vector signed char __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulesb(__a, __b);
+#else
+  return __builtin_altivec_vmulosb(__a, __b);
+#endif
+}
+
+static vector unsigned short __ATTRS_o_ai vec_mulo(vector unsigned char __a,
+                                                   vector unsigned char __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmuleub(__a, __b);
+#else
+  return __builtin_altivec_vmuloub(__a, __b);
+#endif
+}
+
+static vector int __ATTRS_o_ai vec_mulo(vector short __a, vector short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulesh(__a, __b);
+#else
+  return __builtin_altivec_vmulosh(__a, __b);
+#endif
+}
+
+static vector unsigned int __ATTRS_o_ai vec_mulo(vector unsigned short __a,
+                                                 vector unsigned short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmuleuh(__a, __b);
+#else
+  return __builtin_altivec_vmulouh(__a, __b);
+#endif
+}
+
+#ifdef __POWER8_VECTOR__
+static vector signed long long __ATTRS_o_ai vec_mulo(vector signed int __a,
+                                                     vector signed int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulesw(__a, __b);
+#else
+  return __builtin_altivec_vmulosw(__a, __b);
+#endif
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_mulo(vector unsigned int __a, vector unsigned int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmuleuw(__a, __b);
+#else
+  return __builtin_altivec_vmulouw(__a, __b);
+#endif
+}
+#endif
+
+/* vec_vmulosb */
+
+static vector short __attribute__((__always_inline__))
+vec_vmulosb(vector signed char __a, vector signed char __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulesb(__a, __b);
+#else
+  return __builtin_altivec_vmulosb(__a, __b);
+#endif
+}
+
+/* vec_vmuloub */
+
+static vector unsigned short __attribute__((__always_inline__))
+vec_vmuloub(vector unsigned char __a, vector unsigned char __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmuleub(__a, __b);
+#else
+  return __builtin_altivec_vmuloub(__a, __b);
+#endif
+}
+
+/* vec_vmulosh */
+
+static vector int __attribute__((__always_inline__))
+vec_vmulosh(vector short __a, vector short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulesh(__a, __b);
+#else
+  return __builtin_altivec_vmulosh(__a, __b);
+#endif
+}
+
+/* vec_vmulouh */
+
+static vector unsigned int __attribute__((__always_inline__))
+vec_vmulouh(vector unsigned short __a, vector unsigned short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmuleuh(__a, __b);
+#else
+  return __builtin_altivec_vmulouh(__a, __b);
+#endif
+}
+
+/*  vec_nand */
+
+#ifdef __POWER8_VECTOR__
+static vector signed char __ATTRS_o_ai vec_nand(vector signed char __a,
+                                                vector signed char __b) {
+  return ~(__a & __b);
+}
+
+static vector signed char __ATTRS_o_ai vec_nand(vector signed char __a,
+                                                vector bool char __b) {
+  return ~(__a & __b);
+}
+
+static vector signed char __ATTRS_o_ai vec_nand(vector bool char __a,
+                                                vector signed char __b) {
+  return ~(__a & __b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_nand(vector unsigned char __a,
+                                                  vector unsigned char __b) {
+  return ~(__a & __b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_nand(vector unsigned char __a,
+                                                  vector bool char __b) {
+  return ~(__a & __b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_nand(vector bool char __a,
+                                                  vector unsigned char __b) {
+  return ~(__a & __b);
+}
+
+static vector bool char __ATTRS_o_ai vec_nand(vector bool char __a,
+                                              vector bool char __b) {
+  return ~(__a & __b);
+}
+
+static vector signed short __ATTRS_o_ai vec_nand(vector signed short __a,
+                                                 vector signed short __b) {
+  return ~(__a & __b);
+}
+
+static vector signed short __ATTRS_o_ai vec_nand(vector signed short __a,
+                                                 vector bool short __b) {
+  return ~(__a & __b);
+}
+
+static vector signed short __ATTRS_o_ai vec_nand(vector bool short __a,
+                                                 vector signed short __b) {
+  return ~(__a & __b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_nand(vector unsigned short __a,
+                                                   vector unsigned short __b) {
+  return ~(__a & __b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_nand(vector unsigned short __a,
+                                                   vector bool short __b) {
+  return ~(__a & __b);
+}
+
+static vector bool short __ATTRS_o_ai vec_nand(vector bool short __a,
+                                               vector bool short __b) {
+  return ~(__a & __b);
+}
+
+static vector signed int __ATTRS_o_ai vec_nand(vector signed int __a,
+                                               vector signed int __b) {
+  return ~(__a & __b);
+}
+
+static vector signed int __ATTRS_o_ai vec_nand(vector signed int __a,
+                                               vector bool int __b) {
+  return ~(__a & __b);
+}
+
+static vector signed int __ATTRS_o_ai vec_nand(vector bool int __a,
+                                               vector signed int __b) {
+  return ~(__a & __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_nand(vector unsigned int __a,
+                                                 vector unsigned int __b) {
+  return ~(__a & __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_nand(vector unsigned int __a,
+                                                 vector bool int __b) {
+  return ~(__a & __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_nand(vector bool int __a,
+                                                 vector unsigned int __b) {
+  return ~(__a & __b);
+}
+
+static vector bool int __ATTRS_o_ai vec_nand(vector bool int __a,
+                                             vector bool int __b) {
+  return ~(__a & __b);
+}
+
+static vector signed long long __ATTRS_o_ai
+vec_nand(vector signed long long __a, vector signed long long __b) {
+  return ~(__a & __b);
+}
+
+static vector signed long long __ATTRS_o_ai
+vec_nand(vector signed long long __a, vector bool long long __b) {
+  return ~(__a & __b);
+}
+
+static vector signed long long __ATTRS_o_ai
+vec_nand(vector bool long long __a, vector signed long long __b) {
+  return ~(__a & __b);
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_nand(vector unsigned long long __a, vector unsigned long long __b) {
+  return ~(__a & __b);
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_nand(vector unsigned long long __a, vector bool long long __b) {
+  return ~(__a & __b);
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_nand(vector bool long long __a, vector unsigned long long __b) {
+  return ~(__a & __b);
+}
+
+static vector bool long long __ATTRS_o_ai
+vec_nand(vector bool long long __a, vector bool long long __b) {
+  return ~(__a & __b);
+}
+
+#endif
+
+/* vec_nmadd */
+
+#ifdef __VSX__
+static vector float __ATTRS_o_ai
+vec_nmadd(vector float __a, vector float __b, vector float __c) {
+  return __builtin_vsx_xvnmaddasp(__a, __b, __c);
+}
+
+static vector double __ATTRS_o_ai
+vec_nmadd(vector double __a, vector double __b, vector double __c) {
+  return __builtin_vsx_xvnmaddadp(__a, __b, __c);
+}
+#endif
+
+/* vec_nmsub */
+
+static vector float __ATTRS_o_ai
+vec_nmsub(vector float __a, vector float __b, vector float __c) {
+#ifdef __VSX__
+  return __builtin_vsx_xvnmsubasp(__a, __b, __c);
+#else
+  return __builtin_altivec_vnmsubfp(__a, __b, __c);
+#endif
+}
+
+#ifdef __VSX__
+static vector double __ATTRS_o_ai
+vec_nmsub(vector double __a, vector double __b, vector double __c) {
+  return __builtin_vsx_xvnmsubadp(__a, __b, __c);
+}
+#endif
+
+/* vec_vnmsubfp */
+
+static vector float __attribute__((__always_inline__))
+vec_vnmsubfp(vector float __a, vector float __b, vector float __c) {
+  return __builtin_altivec_vnmsubfp(__a, __b, __c);
+}
+
+/* vec_nor */
+
+#define __builtin_altivec_vnor vec_nor
+
+static vector signed char __ATTRS_o_ai vec_nor(vector signed char __a,
+                                               vector signed char __b) {
+  return ~(__a | __b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_nor(vector unsigned char __a,
+                                                 vector unsigned char __b) {
+  return ~(__a | __b);
+}
+
+static vector bool char __ATTRS_o_ai vec_nor(vector bool char __a,
+                                             vector bool char __b) {
+  return ~(__a | __b);
+}
+
+static vector short __ATTRS_o_ai vec_nor(vector short __a, vector short __b) {
+  return ~(__a | __b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_nor(vector unsigned short __a,
+                                                  vector unsigned short __b) {
+  return ~(__a | __b);
+}
+
+static vector bool short __ATTRS_o_ai vec_nor(vector bool short __a,
+                                              vector bool short __b) {
+  return ~(__a | __b);
+}
+
+static vector int __ATTRS_o_ai vec_nor(vector int __a, vector int __b) {
+  return ~(__a | __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_nor(vector unsigned int __a,
+                                                vector unsigned int __b) {
+  return ~(__a | __b);
+}
+
+static vector bool int __ATTRS_o_ai vec_nor(vector bool int __a,
+                                            vector bool int __b) {
+  return ~(__a | __b);
+}
+
+static vector float __ATTRS_o_ai vec_nor(vector float __a, vector float __b) {
+  vector unsigned int __res =
+      ~((vector unsigned int)__a | (vector unsigned int)__b);
+  return (vector float)__res;
+}
+
+#ifdef __VSX__
+static vector double __ATTRS_o_ai
+vec_nor(vector double __a, vector double __b) {
+  vector unsigned long long __res =
+      ~((vector unsigned long long)__a | (vector unsigned long long)__b);
+  return (vector double)__res;
+}
+#endif
+
+/* vec_vnor */
+
+static vector signed char __ATTRS_o_ai vec_vnor(vector signed char __a,
+                                                vector signed char __b) {
+  return ~(__a | __b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vnor(vector unsigned char __a,
+                                                  vector unsigned char __b) {
+  return ~(__a | __b);
+}
+
+static vector bool char __ATTRS_o_ai vec_vnor(vector bool char __a,
+                                              vector bool char __b) {
+  return ~(__a | __b);
+}
+
+static vector short __ATTRS_o_ai vec_vnor(vector short __a, vector short __b) {
+  return ~(__a | __b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vnor(vector unsigned short __a,
+                                                   vector unsigned short __b) {
+  return ~(__a | __b);
+}
+
+static vector bool short __ATTRS_o_ai vec_vnor(vector bool short __a,
+                                               vector bool short __b) {
+  return ~(__a | __b);
+}
+
+static vector int __ATTRS_o_ai vec_vnor(vector int __a, vector int __b) {
+  return ~(__a | __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vnor(vector unsigned int __a,
+                                                 vector unsigned int __b) {
+  return ~(__a | __b);
+}
+
+static vector bool int __ATTRS_o_ai vec_vnor(vector bool int __a,
+                                             vector bool int __b) {
+  return ~(__a | __b);
+}
+
+static vector float __ATTRS_o_ai vec_vnor(vector float __a, vector float __b) {
+  vector unsigned int __res =
+      ~((vector unsigned int)__a | (vector unsigned int)__b);
+  return (vector float)__res;
+}
+
+#ifdef __VSX__
+static vector signed long long __ATTRS_o_ai
+vec_nor(vector signed long long __a, vector signed long long __b) {
+  return ~(__a | __b);
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_nor(vector unsigned long long __a, vector unsigned long long __b) {
+  return ~(__a | __b);
+}
+
+static vector bool long long __ATTRS_o_ai vec_nor(vector bool long long __a,
+                                                  vector bool long long __b) {
+  return ~(__a | __b);
+}
+#endif
+
+/* vec_or */
+
+#define __builtin_altivec_vor vec_or
+
+static vector signed char __ATTRS_o_ai vec_or(vector signed char __a,
+                                              vector signed char __b) {
+  return __a | __b;
+}
+
+static vector signed char __ATTRS_o_ai vec_or(vector bool char __a,
+                                              vector signed char __b) {
+  return (vector signed char)__a | __b;
+}
+
+static vector signed char __ATTRS_o_ai vec_or(vector signed char __a,
+                                              vector bool char __b) {
+  return __a | (vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_or(vector unsigned char __a,
+                                                vector unsigned char __b) {
+  return __a | __b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_or(vector bool char __a,
+                                                vector unsigned char __b) {
+  return (vector unsigned char)__a | __b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_or(vector unsigned char __a,
+                                                vector bool char __b) {
+  return __a | (vector unsigned char)__b;
+}
+
+static vector bool char __ATTRS_o_ai vec_or(vector bool char __a,
+                                            vector bool char __b) {
+  return __a | __b;
+}
+
+static vector short __ATTRS_o_ai vec_or(vector short __a, vector short __b) {
+  return __a | __b;
+}
+
+static vector short __ATTRS_o_ai vec_or(vector bool short __a,
+                                        vector short __b) {
+  return (vector short)__a | __b;
+}
+
+static vector short __ATTRS_o_ai vec_or(vector short __a,
+                                        vector bool short __b) {
+  return __a | (vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_or(vector unsigned short __a,
+                                                 vector unsigned short __b) {
+  return __a | __b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_or(vector bool short __a,
+                                                 vector unsigned short __b) {
+  return (vector unsigned short)__a | __b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_or(vector unsigned short __a,
+                                                 vector bool short __b) {
+  return __a | (vector unsigned short)__b;
+}
+
+static vector bool short __ATTRS_o_ai vec_or(vector bool short __a,
+                                             vector bool short __b) {
+  return __a | __b;
+}
+
+static vector int __ATTRS_o_ai vec_or(vector int __a, vector int __b) {
+  return __a | __b;
+}
+
+static vector int __ATTRS_o_ai vec_or(vector bool int __a, vector int __b) {
+  return (vector int)__a | __b;
+}
+
+static vector int __ATTRS_o_ai vec_or(vector int __a, vector bool int __b) {
+  return __a | (vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_or(vector unsigned int __a,
+                                               vector unsigned int __b) {
+  return __a | __b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_or(vector bool int __a,
+                                               vector unsigned int __b) {
+  return (vector unsigned int)__a | __b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_or(vector unsigned int __a,
+                                               vector bool int __b) {
+  return __a | (vector unsigned int)__b;
+}
+
+static vector bool int __ATTRS_o_ai vec_or(vector bool int __a,
+                                           vector bool int __b) {
+  return __a | __b;
+}
+
+static vector float __ATTRS_o_ai vec_or(vector float __a, vector float __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a | (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai vec_or(vector bool int __a, vector float __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a | (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai vec_or(vector float __a, vector bool int __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a | (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+#ifdef __VSX__
+static vector double __ATTRS_o_ai vec_or(vector bool long long __a,
+                                         vector double __b) {
+  return (vector unsigned long long)__a | (vector unsigned long long)__b;
+}
+
+static vector double __ATTRS_o_ai vec_or(vector double __a,
+                                         vector bool long long __b) {
+  return (vector unsigned long long)__a | (vector unsigned long long)__b;
+}
+
+static vector double __ATTRS_o_ai vec_or(vector double __a, vector double __b) {
+  vector unsigned long long __res =
+      (vector unsigned long long)__a | (vector unsigned long long)__b;
+  return (vector double)__res;
+}
+
+static vector signed long long __ATTRS_o_ai
+vec_or(vector signed long long __a, vector signed long long __b) {
+  return __a | __b;
+}
+
+static vector signed long long __ATTRS_o_ai
+vec_or(vector bool long long __a, vector signed long long __b) {
+  return (vector signed long long)__a | __b;
+}
+
+static vector signed long long __ATTRS_o_ai vec_or(vector signed long long __a,
+                                                   vector bool long long __b) {
+  return __a | (vector signed long long)__b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_or(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a | __b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_or(vector bool long long __a, vector unsigned long long __b) {
+  return (vector unsigned long long)__a | __b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_or(vector unsigned long long __a, vector bool long long __b) {
+  return __a | (vector unsigned long long)__b;
+}
+
+static vector bool long long __ATTRS_o_ai vec_or(vector bool long long __a,
+                                                 vector bool long long __b) {
+  return __a | __b;
+}
+#endif
+
+#ifdef __POWER8_VECTOR__
+static vector signed char __ATTRS_o_ai vec_orc(vector signed char __a,
+                                               vector signed char __b) {
+  return __a | ~__b;
+}
+
+static vector signed char __ATTRS_o_ai vec_orc(vector signed char __a,
+                                               vector bool char __b) {
+  return __a | ~__b;
+}
+
+static vector signed char __ATTRS_o_ai vec_orc(vector bool char __a,
+                                               vector signed char __b) {
+  return __a | ~__b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_orc(vector unsigned char __a,
+                                                 vector unsigned char __b) {
+  return __a | ~__b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_orc(vector unsigned char __a,
+                                                 vector bool char __b) {
+  return __a | ~__b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_orc(vector bool char __a,
+                                                 vector unsigned char __b) {
+  return __a | ~__b;
+}
+
+static vector bool char __ATTRS_o_ai vec_orc(vector bool char __a,
+                                             vector bool char __b) {
+  return __a | ~__b;
+}
+
+static vector signed short __ATTRS_o_ai vec_orc(vector signed short __a,
+                                                vector signed short __b) {
+  return __a | ~__b;
+}
+
+static vector signed short __ATTRS_o_ai vec_orc(vector signed short __a,
+                                                vector bool short __b) {
+  return __a | ~__b;
+}
+
+static vector signed short __ATTRS_o_ai vec_orc(vector bool short __a,
+                                                vector signed short __b) {
+  return __a | ~__b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_orc(vector unsigned short __a,
+                                                  vector unsigned short __b) {
+  return __a | ~__b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_orc(vector unsigned short __a,
+                                                  vector bool short __b) {
+  return __a | ~__b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_orc(vector bool short __a, vector unsigned short __b) {
+  return __a | ~__b;
+}
+
+static vector bool short __ATTRS_o_ai vec_orc(vector bool short __a,
+                                              vector bool short __b) {
+  return __a | ~__b;
+}
+
+static vector signed int __ATTRS_o_ai vec_orc(vector signed int __a,
+                                              vector signed int __b) {
+  return __a | ~__b;
+}
+
+static vector signed int __ATTRS_o_ai vec_orc(vector signed int __a,
+                                              vector bool int __b) {
+  return __a | ~__b;
+}
+
+static vector signed int __ATTRS_o_ai vec_orc(vector bool int __a,
+                                              vector signed int __b) {
+  return __a | ~__b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_orc(vector unsigned int __a,
+                                                vector unsigned int __b) {
+  return __a | ~__b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_orc(vector unsigned int __a,
+                                                vector bool int __b) {
+  return __a | ~__b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_orc(vector bool int __a,
+                                                vector unsigned int __b) {
+  return __a | ~__b;
+}
+
+static vector bool int __ATTRS_o_ai vec_orc(vector bool int __a,
+                                            vector bool int __b) {
+  return __a | ~__b;
+}
+
+static vector signed long long __ATTRS_o_ai
+vec_orc(vector signed long long __a, vector signed long long __b) {
+  return __a | ~__b;
+}
+
+static vector signed long long __ATTRS_o_ai vec_orc(vector signed long long __a,
+                                                    vector bool long long __b) {
+  return __a | ~__b;
+}
+
+static vector signed long long __ATTRS_o_ai
+vec_orc(vector bool long long __a, vector signed long long __b) {
+  return __a | ~__b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_orc(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a | ~__b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_orc(vector unsigned long long __a, vector bool long long __b) {
+  return __a | ~__b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_orc(vector bool long long __a, vector unsigned long long __b) {
+  return __a | ~__b;
+}
+
+static vector bool long long __ATTRS_o_ai
+vec_orc(vector bool long long __a, vector bool long long __b) {
+  return __a | ~__b;
+}
+#endif
+
+/* vec_vor */
+
+static vector signed char __ATTRS_o_ai vec_vor(vector signed char __a,
+                                               vector signed char __b) {
+  return __a | __b;
+}
+
+static vector signed char __ATTRS_o_ai vec_vor(vector bool char __a,
+                                               vector signed char __b) {
+  return (vector signed char)__a | __b;
+}
+
+static vector signed char __ATTRS_o_ai vec_vor(vector signed char __a,
+                                               vector bool char __b) {
+  return __a | (vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vor(vector unsigned char __a,
+                                                 vector unsigned char __b) {
+  return __a | __b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vor(vector bool char __a,
+                                                 vector unsigned char __b) {
+  return (vector unsigned char)__a | __b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vor(vector unsigned char __a,
+                                                 vector bool char __b) {
+  return __a | (vector unsigned char)__b;
+}
+
+static vector bool char __ATTRS_o_ai vec_vor(vector bool char __a,
+                                             vector bool char __b) {
+  return __a | __b;
+}
+
+static vector short __ATTRS_o_ai vec_vor(vector short __a, vector short __b) {
+  return __a | __b;
+}
+
+static vector short __ATTRS_o_ai vec_vor(vector bool short __a,
+                                         vector short __b) {
+  return (vector short)__a | __b;
+}
+
+static vector short __ATTRS_o_ai vec_vor(vector short __a,
+                                         vector bool short __b) {
+  return __a | (vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vor(vector unsigned short __a,
+                                                  vector unsigned short __b) {
+  return __a | __b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vor(vector bool short __a,
+                                                  vector unsigned short __b) {
+  return (vector unsigned short)__a | __b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vor(vector unsigned short __a,
+                                                  vector bool short __b) {
+  return __a | (vector unsigned short)__b;
+}
+
+static vector bool short __ATTRS_o_ai vec_vor(vector bool short __a,
+                                              vector bool short __b) {
+  return __a | __b;
+}
+
+static vector int __ATTRS_o_ai vec_vor(vector int __a, vector int __b) {
+  return __a | __b;
+}
+
+static vector int __ATTRS_o_ai vec_vor(vector bool int __a, vector int __b) {
+  return (vector int)__a | __b;
+}
+
+static vector int __ATTRS_o_ai vec_vor(vector int __a, vector bool int __b) {
+  return __a | (vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vor(vector unsigned int __a,
+                                                vector unsigned int __b) {
+  return __a | __b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vor(vector bool int __a,
+                                                vector unsigned int __b) {
+  return (vector unsigned int)__a | __b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vor(vector unsigned int __a,
+                                                vector bool int __b) {
+  return __a | (vector unsigned int)__b;
+}
+
+static vector bool int __ATTRS_o_ai vec_vor(vector bool int __a,
+                                            vector bool int __b) {
+  return __a | __b;
+}
+
+static vector float __ATTRS_o_ai vec_vor(vector float __a, vector float __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a | (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai vec_vor(vector bool int __a,
+                                         vector float __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a | (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai vec_vor(vector float __a,
+                                         vector bool int __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a | (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+#ifdef __VSX__
+static vector signed long long __ATTRS_o_ai
+vec_vor(vector signed long long __a, vector signed long long __b) {
+  return __a | __b;
+}
+
+static vector signed long long __ATTRS_o_ai
+vec_vor(vector bool long long __a, vector signed long long __b) {
+  return (vector signed long long)__a | __b;
+}
+
+static vector signed long long __ATTRS_o_ai vec_vor(vector signed long long __a,
+                                                    vector bool long long __b) {
+  return __a | (vector signed long long)__b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_vor(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a | __b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_vor(vector bool long long __a, vector unsigned long long __b) {
+  return (vector unsigned long long)__a | __b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_vor(vector unsigned long long __a, vector bool long long __b) {
+  return __a | (vector unsigned long long)__b;
+}
+
+static vector bool long long __ATTRS_o_ai vec_vor(vector bool long long __a,
+                                                  vector bool long long __b) {
+  return __a | __b;
+}
+#endif
+
+/* vec_pack */
+
+/* The various vector pack instructions have a big-endian bias, so for
+   little endian we must handle reversed element numbering.  */
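+/* A minimal sketch of the intended semantics (illustrative only): vec_pack
+   narrows each element to half its width, keeping the low-order bits, with
+   __a supplying the first half of the result and __b the second half.
+
+     vector int   a = (vector int)(0x11111111, 0x22222222, 0x33333333, 0x44444444);
+     vector int   b = (vector int)(0x55555555, 0x66666666, 0x77777777, 0x88888888);
+     vector short p = vec_pack(a, b);
+     // p == (0x1111, 0x2222, 0x3333, 0x4444, 0x5555, 0x6666, 0x7777, 0x8888)
+*/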
+
+static vector signed char __ATTRS_o_ai vec_pack(vector signed short __a,
+                                                vector signed short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector signed char)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E,
+                             0x10, 0x12, 0x14, 0x16, 0x18, 0x1A, 0x1C, 0x1E));
+#else
+  return (vector signed char)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F,
+                             0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F));
+#endif
+}
+
+static vector unsigned char __ATTRS_o_ai vec_pack(vector unsigned short __a,
+                                                  vector unsigned short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector unsigned char)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E,
+                             0x10, 0x12, 0x14, 0x16, 0x18, 0x1A, 0x1C, 0x1E));
+#else
+  return (vector unsigned char)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F,
+                             0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F));
+#endif
+}
+
+static vector bool char __ATTRS_o_ai vec_pack(vector bool short __a,
+                                              vector bool short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool char)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E,
+                             0x10, 0x12, 0x14, 0x16, 0x18, 0x1A, 0x1C, 0x1E));
+#else
+  return (vector bool char)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F,
+                             0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F));
+#endif
+}
+
+static vector short __ATTRS_o_ai vec_pack(vector int __a, vector int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector short)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0C, 0x0D,
+                             0x10, 0x11, 0x14, 0x15, 0x18, 0x19, 0x1C, 0x1D));
+#else
+  return (vector short)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x02, 0x03, 0x06, 0x07, 0x0A, 0x0B, 0x0E, 0x0F,
+                             0x12, 0x13, 0x16, 0x17, 0x1A, 0x1B, 0x1E, 0x1F));
+#endif
+}
+
+static vector unsigned short __ATTRS_o_ai vec_pack(vector unsigned int __a,
+                                                   vector unsigned int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector unsigned short)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0C, 0x0D,
+                             0x10, 0x11, 0x14, 0x15, 0x18, 0x19, 0x1C, 0x1D));
+#else
+  return (vector unsigned short)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x02, 0x03, 0x06, 0x07, 0x0A, 0x0B, 0x0E, 0x0F,
+                             0x12, 0x13, 0x16, 0x17, 0x1A, 0x1B, 0x1E, 0x1F));
+#endif
+}
+
+static vector bool short __ATTRS_o_ai vec_pack(vector bool int __a,
+                                               vector bool int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool short)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0C, 0x0D,
+                             0x10, 0x11, 0x14, 0x15, 0x18, 0x19, 0x1C, 0x1D));
+#else
+  return (vector bool short)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x02, 0x03, 0x06, 0x07, 0x0A, 0x0B, 0x0E, 0x0F,
+                             0x12, 0x13, 0x16, 0x17, 0x1A, 0x1B, 0x1E, 0x1F));
+#endif
+}
+
+#ifdef __VSX__
+static vector signed int __ATTRS_o_ai vec_pack(vector signed long long __a,
+                                               vector signed long long __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector signed int)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B,
+                             0x10, 0x11, 0x12, 0x13, 0x18, 0x19, 0x1A, 0x1B));
+#else
+  return (vector signed int)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F,
+                             0x14, 0x15, 0x16, 0x17, 0x1C, 0x1D, 0x1E, 0x1F));
+#endif
+}
+static vector unsigned int __ATTRS_o_ai
+vec_pack(vector unsigned long long __a, vector unsigned long long __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector unsigned int)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B,
+                             0x10, 0x11, 0x12, 0x13, 0x18, 0x19, 0x1A, 0x1B));
+#else
+  return (vector unsigned int)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F,
+                             0x14, 0x15, 0x16, 0x17, 0x1C, 0x1D, 0x1E, 0x1F));
+#endif
+}
+
+static vector bool int __ATTRS_o_ai vec_pack(vector bool long long __a,
+                                             vector bool long long __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool int)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B,
+                             0x10, 0x11, 0x12, 0x13, 0x18, 0x19, 0x1A, 0x1B));
+#else
+  return (vector bool int)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F,
+                             0x14, 0x15, 0x16, 0x17, 0x1C, 0x1D, 0x1E, 0x1F));
+#endif
+}
+
+#endif
+
+/* vec_vpkuhum */
+
+#define __builtin_altivec_vpkuhum vec_vpkuhum
+
+static vector signed char __ATTRS_o_ai vec_vpkuhum(vector signed short __a,
+                                                   vector signed short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector signed char)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E,
+                             0x10, 0x12, 0x14, 0x16, 0x18, 0x1A, 0x1C, 0x1E));
+#else
+  return (vector signed char)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F,
+                             0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F));
+#endif
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vpkuhum(vector unsigned short __a, vector unsigned short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector unsigned char)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E,
+                             0x10, 0x12, 0x14, 0x16, 0x18, 0x1A, 0x1C, 0x1E));
+#else
+  return (vector unsigned char)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F,
+                             0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F));
+#endif
+}
+
+static vector bool char __ATTRS_o_ai vec_vpkuhum(vector bool short __a,
+                                                 vector bool short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool char)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E,
+                             0x10, 0x12, 0x14, 0x16, 0x18, 0x1A, 0x1C, 0x1E));
+#else
+  return (vector bool char)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F,
+                             0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F));
+#endif
+}
+
+/* vec_vpkuwum */
+
+#define __builtin_altivec_vpkuwum vec_vpkuwum
+
+static vector short __ATTRS_o_ai vec_vpkuwum(vector int __a, vector int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector short)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0C, 0x0D,
+                             0x10, 0x11, 0x14, 0x15, 0x18, 0x19, 0x1C, 0x1D));
+#else
+  return (vector short)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x02, 0x03, 0x06, 0x07, 0x0A, 0x0B, 0x0E, 0x0F,
+                             0x12, 0x13, 0x16, 0x17, 0x1A, 0x1B, 0x1E, 0x1F));
+#endif
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vpkuwum(vector unsigned int __a,
+                                                      vector unsigned int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector unsigned short)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0C, 0x0D,
+                             0x10, 0x11, 0x14, 0x15, 0x18, 0x19, 0x1C, 0x1D));
+#else
+  return (vector unsigned short)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x02, 0x03, 0x06, 0x07, 0x0A, 0x0B, 0x0E, 0x0F,
+                             0x12, 0x13, 0x16, 0x17, 0x1A, 0x1B, 0x1E, 0x1F));
+#endif
+}
+
+static vector bool short __ATTRS_o_ai vec_vpkuwum(vector bool int __a,
+                                                  vector bool int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool short)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0C, 0x0D,
+                             0x10, 0x11, 0x14, 0x15, 0x18, 0x19, 0x1C, 0x1D));
+#else
+  return (vector bool short)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x02, 0x03, 0x06, 0x07, 0x0A, 0x0B, 0x0E, 0x0F,
+                             0x12, 0x13, 0x16, 0x17, 0x1A, 0x1B, 0x1E, 0x1F));
+#endif
+}
+
+/* vec_vpkudum */
+
+#ifdef __POWER8_VECTOR__
+#define __builtin_altivec_vpkudum vec_vpkudum
+
+static vector int __ATTRS_o_ai vec_vpkudum(vector long long __a,
+                                           vector long long __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector int)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B,
+                             0x10, 0x11, 0x12, 0x13, 0x18, 0x19, 0x1A, 0x1B));
+#else
+  return (vector int)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F,
+                             0x14, 0x15, 0x16, 0x17, 0x1C, 0x1D, 0x1E, 0x1F));
+#endif
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vpkudum(vector unsigned long long __a, vector unsigned long long __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector unsigned int)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B,
+                             0x10, 0x11, 0x12, 0x13, 0x18, 0x19, 0x1A, 0x1B));
+#else
+  return (vector unsigned int)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F,
+                             0x14, 0x15, 0x16, 0x17, 0x1C, 0x1D, 0x1E, 0x1F));
+#endif
+}
+
+static vector bool int __ATTRS_o_ai vec_vpkudum(vector bool long long __a,
+                                                vector bool long long __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool int)vec_perm(
+      (vector long long)__a, (vector long long)__b,
+      (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B,
+                             0x10, 0x11, 0x12, 0x13, 0x18, 0x19, 0x1A, 0x1B));
+#else
+  return (vector bool int)vec_perm(
+      (vector long long)__a, (vector long long)__b,
+      (vector unsigned char)(0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F,
+                             0x14, 0x15, 0x16, 0x17, 0x1C, 0x1D, 0x1E, 0x1F));
+#endif
+}
+#endif
+
+/* vec_packpx */
+
+static vector pixel __attribute__((__always_inline__))
+vec_packpx(vector unsigned int __a, vector unsigned int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector pixel)__builtin_altivec_vpkpx(__b, __a);
+#else
+  return (vector pixel)__builtin_altivec_vpkpx(__a, __b);
+#endif
+}
+
+/* vec_vpkpx */
+
+static vector pixel __attribute__((__always_inline__))
+vec_vpkpx(vector unsigned int __a, vector unsigned int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector pixel)__builtin_altivec_vpkpx(__b, __a);
+#else
+  return (vector pixel)__builtin_altivec_vpkpx(__a, __b);
+#endif
+}
+
+/* vec_packs */
+
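+// The hardware pack instructions place their first operand in the
+// most-significant half of the result, so the operands are swapped for
+// little endian to keep the packed elements of __a first in element order.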
+static vector signed char __ATTRS_o_ai vec_packs(vector short __a,
+                                                 vector short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkshss(__b, __a);
+#else
+  return __builtin_altivec_vpkshss(__a, __b);
+#endif
+}
+
+static vector unsigned char __ATTRS_o_ai vec_packs(vector unsigned short __a,
+                                                   vector unsigned short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkuhus(__b, __a);
+#else
+  return __builtin_altivec_vpkuhus(__a, __b);
+#endif
+}
+
+static vector signed short __ATTRS_o_ai vec_packs(vector int __a,
+                                                  vector int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkswss(__b, __a);
+#else
+  return __builtin_altivec_vpkswss(__a, __b);
+#endif
+}
+
+static vector unsigned short __ATTRS_o_ai vec_packs(vector unsigned int __a,
+                                                    vector unsigned int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkuwus(__b, __a);
+#else
+  return __builtin_altivec_vpkuwus(__a, __b);
+#endif
+}
+
+#ifdef __POWER8_VECTOR__
+static vector int __ATTRS_o_ai vec_packs(vector long long __a,
+                                         vector long long __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpksdss(__b, __a);
+#else
+  return __builtin_altivec_vpksdss(__a, __b);
+#endif
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_packs(vector unsigned long long __a, vector unsigned long long __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkudus(__b, __a);
+#else
+  return __builtin_altivec_vpkudus(__a, __b);
+#endif
+}
+#endif
+
+/* vec_vpkshss */
+
+static vector signed char __attribute__((__always_inline__))
+vec_vpkshss(vector short __a, vector short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkshss(__b, __a);
+#else
+  return __builtin_altivec_vpkshss(__a, __b);
+#endif
+}
+
+/* vec_vpksdss */
+
+#ifdef __POWER8_VECTOR__
+static vector int __ATTRS_o_ai vec_vpksdss(vector long long __a,
+                                           vector long long __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpksdss(__b, __a);
+#else
+  return __builtin_altivec_vpksdss(__a, __b);
+#endif
+}
+#endif
+
+/* vec_vpkuhus */
+
+static vector unsigned char __attribute__((__always_inline__))
+vec_vpkuhus(vector unsigned short __a, vector unsigned short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkuhus(__b, __a);
+#else
+  return __builtin_altivec_vpkuhus(__a, __b);
+#endif
+}
+
+/* vec_vpkudus */
+
+#ifdef __POWER8_VECTOR__
+static vector unsigned int __attribute__((__always_inline__))
+vec_vpkudus(vector unsigned long long __a, vector unsigned long long __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkudus(__b, __a);
+#else
+  return __builtin_altivec_vpkudus(__a, __b);
+#endif
+}
+#endif
+
+/* vec_vpkswss */
+
+static vector signed short __attribute__((__always_inline__))
+vec_vpkswss(vector int __a, vector int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkswss(__b, __a);
+#else
+  return __builtin_altivec_vpkswss(__a, __b);
+#endif
+}
+
+/* vec_vpkuwus */
+
+static vector unsigned short __attribute__((__always_inline__))
+vec_vpkuwus(vector unsigned int __a, vector unsigned int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkuwus(__b, __a);
+#else
+  return __builtin_altivec_vpkuwus(__a, __b);
+#endif
+}
+
+/* vec_packsu */
+
+static vector unsigned char __ATTRS_o_ai vec_packsu(vector short __a,
+                                                    vector short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkshus(__b, __a);
+#else
+  return __builtin_altivec_vpkshus(__a, __b);
+#endif
+}
+
+static vector unsigned char __ATTRS_o_ai vec_packsu(vector unsigned short __a,
+                                                    vector unsigned short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkuhus(__b, __a);
+#else
+  return __builtin_altivec_vpkuhus(__a, __b);
+#endif
+}
+
+static vector unsigned short __ATTRS_o_ai vec_packsu(vector int __a,
+                                                     vector int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkswus(__b, __a);
+#else
+  return __builtin_altivec_vpkswus(__a, __b);
+#endif
+}
+
+static vector unsigned short __ATTRS_o_ai vec_packsu(vector unsigned int __a,
+                                                     vector unsigned int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkuwus(__b, __a);
+#else
+  return __builtin_altivec_vpkuwus(__a, __b);
+#endif
+}
+
+#ifdef __POWER8_VECTOR__
+static vector unsigned int __ATTRS_o_ai vec_packsu(vector long long __a,
+                                                   vector long long __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpksdus(__b, __a);
+#else
+  return __builtin_altivec_vpksdus(__a, __b);
+#endif
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_packsu(vector unsigned long long __a, vector unsigned long long __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkudus(__b, __a);
+#else
+  return __builtin_altivec_vpkudus(__a, __b);
+#endif
+}
+#endif
+
+/* vec_vpkshus */
+
+static vector unsigned char __ATTRS_o_ai vec_vpkshus(vector short __a,
+                                                     vector short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkshus(__b, __a);
+#else
+  return __builtin_altivec_vpkshus(__a, __b);
+#endif
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_vpkshus(vector unsigned short __a, vector unsigned short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkuhus(__b, __a);
+#else
+  return __builtin_altivec_vpkuhus(__a, __b);
+#endif
+}
+
+/* vec_vpkswus */
+
+static vector unsigned short __ATTRS_o_ai vec_vpkswus(vector int __a,
+                                                      vector int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkswus(__b, __a);
+#else
+  return __builtin_altivec_vpkswus(__a, __b);
+#endif
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vpkswus(vector unsigned int __a,
+                                                      vector unsigned int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkuwus(__b, __a);
+#else
+  return __builtin_altivec_vpkuwus(__a, __b);
+#endif
+}
+
+/* vec_vpksdus */
+
+#ifdef __POWER8_VECTOR__
+static vector unsigned int __ATTRS_o_ai vec_vpksdus(vector long long __a,
+                                                    vector long long __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpksdus(__b, __a);
+#else
+  return __builtin_altivec_vpksdus(__a, __b);
+#endif
+}
+#endif
+
+/* vec_perm */
+
+// The vperm instruction is defined architecturally with a big-endian bias.
+// For little endian, we swap the input operands and invert the permute
+// control vector.  Only the rightmost 5 bits matter, so we could use
+// a vector of all 31s instead of all 255s to perform the inversion.
+// However, when the PCV is not a constant, using 255 has an advantage
+// in that the vec_xor can be recognized as a vec_nor (and for P8 and
+// later, possibly a vec_nand).
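+// For example, a control byte of 0x03 selects byte 3 of the big-endian
+// concatenation of __a and __b; after the xor it becomes 0xFC, whose low
+// five bits are 28 == 31 - 3, which selects the same data byte from the
+// swapped concatenation of __b and __a.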
+
+static vector signed char __ATTRS_o_ai vec_perm(vector signed char __a,
+                                                vector signed char __b,
+                                                vector unsigned char __c) {
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
+                              255, 255, 255, 255, 255, 255, 255, 255};
+  __d = vec_xor(__c, __d);
+  return (vector signed char)__builtin_altivec_vperm_4si((vector int)__b,
+                                                         (vector int)__a, __d);
+#else
+  return (vector signed char)__builtin_altivec_vperm_4si((vector int)__a,
+                                                         (vector int)__b, __c);
+#endif
+}
+
+static vector unsigned char __ATTRS_o_ai vec_perm(vector unsigned char __a,
+                                                  vector unsigned char __b,
+                                                  vector unsigned char __c) {
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
+                              255, 255, 255, 255, 255, 255, 255, 255};
+  __d = vec_xor(__c, __d);
+  return (vector unsigned char)__builtin_altivec_vperm_4si(
+      (vector int)__b, (vector int)__a, __d);
+#else
+  return (vector unsigned char)__builtin_altivec_vperm_4si(
+      (vector int)__a, (vector int)__b, __c);
+#endif
+}
+
+static vector bool char __ATTRS_o_ai vec_perm(vector bool char __a,
+                                              vector bool char __b,
+                                              vector unsigned char __c) {
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
+                              255, 255, 255, 255, 255, 255, 255, 255};
+  __d = vec_xor(__c, __d);
+  return (vector bool char)__builtin_altivec_vperm_4si((vector int)__b,
+                                                       (vector int)__a, __d);
+#else
+  return (vector bool char)__builtin_altivec_vperm_4si((vector int)__a,
+                                                       (vector int)__b, __c);
+#endif
+}
+
+static vector short __ATTRS_o_ai vec_perm(vector signed short __a,
+                                          vector signed short __b,
+                                          vector unsigned char __c) {
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
+                              255, 255, 255, 255, 255, 255, 255, 255};
+  __d = vec_xor(__c, __d);
+  return (vector signed short)__builtin_altivec_vperm_4si((vector int)__b,
+                                                          (vector int)__a, __d);
+#else
+  return (vector signed short)__builtin_altivec_vperm_4si((vector int)__a,
+                                                          (vector int)__b, __c);
+#endif
+}
+
+static vector unsigned short __ATTRS_o_ai vec_perm(vector unsigned short __a,
+                                                   vector unsigned short __b,
+                                                   vector unsigned char __c) {
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
+                              255, 255, 255, 255, 255, 255, 255, 255};
+  __d = vec_xor(__c, __d);
+  return (vector unsigned short)__builtin_altivec_vperm_4si(
+      (vector int)__b, (vector int)__a, __d);
+#else
+  return (vector unsigned short)__builtin_altivec_vperm_4si(
+      (vector int)__a, (vector int)__b, __c);
+#endif
+}
+
+static vector bool short __ATTRS_o_ai vec_perm(vector bool short __a,
+                                               vector bool short __b,
+                                               vector unsigned char __c) {
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
+                              255, 255, 255, 255, 255, 255, 255, 255};
+  __d = vec_xor(__c, __d);
+  return (vector bool short)__builtin_altivec_vperm_4si((vector int)__b,
+                                                        (vector int)__a, __d);
+#else
+  return (vector bool short)__builtin_altivec_vperm_4si((vector int)__a,
+                                                        (vector int)__b, __c);
+#endif
+}
+
+static vector pixel __ATTRS_o_ai vec_perm(vector pixel __a, vector pixel __b,
+                                          vector unsigned char __c) {
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
+                              255, 255, 255, 255, 255, 255, 255, 255};
+  __d = vec_xor(__c, __d);
+  return (vector pixel)__builtin_altivec_vperm_4si((vector int)__b,
+                                                   (vector int)__a, __d);
+#else
+  return (vector pixel)__builtin_altivec_vperm_4si((vector int)__a,
+                                                   (vector int)__b, __c);
+#endif
+}
+
+static vector int __ATTRS_o_ai vec_perm(vector signed int __a,
+                                        vector signed int __b,
+                                        vector unsigned char __c) {
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
+                              255, 255, 255, 255, 255, 255, 255, 255};
+  __d = vec_xor(__c, __d);
+  return (vector signed int)__builtin_altivec_vperm_4si(__b, __a, __d);
+#else
+  return (vector signed int)__builtin_altivec_vperm_4si(__a, __b, __c);
+#endif
+}
+
+static vector unsigned int __ATTRS_o_ai vec_perm(vector unsigned int __a,
+                                                 vector unsigned int __b,
+                                                 vector unsigned char __c) {
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
+                              255, 255, 255, 255, 255, 255, 255, 255};
+  __d = vec_xor(__c, __d);
+  return (vector unsigned int)__builtin_altivec_vperm_4si((vector int)__b,
+                                                          (vector int)__a, __d);
+#else
+  return (vector unsigned int)__builtin_altivec_vperm_4si((vector int)__a,
+                                                          (vector int)__b, __c);
+#endif
+}
+
+static vector bool int __ATTRS_o_ai vec_perm(vector bool int __a,
+                                             vector bool int __b,
+                                             vector unsigned char __c) {
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
+                              255, 255, 255, 255, 255, 255, 255, 255};
+  __d = vec_xor(__c, __d);
+  return (vector bool int)__builtin_altivec_vperm_4si((vector int)__b,
+                                                      (vector int)__a, __d);
+#else
+  return (vector bool int)__builtin_altivec_vperm_4si((vector int)__a,
+                                                      (vector int)__b, __c);
+#endif
+}
+
+static vector float __ATTRS_o_ai vec_perm(vector float __a, vector float __b,
+                                          vector unsigned char __c) {
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
+                              255, 255, 255, 255, 255, 255, 255, 255};
+  __d = vec_xor(__c, __d);
+  return (vector float)__builtin_altivec_vperm_4si((vector int)__b,
+                                                   (vector int)__a, __d);
+#else
+  return (vector float)__builtin_altivec_vperm_4si((vector int)__a,
+                                                   (vector int)__b, __c);
+#endif
+}
+
+#ifdef __VSX__
+static vector long long __ATTRS_o_ai vec_perm(vector signed long long __a,
+                                              vector signed long long __b,
+                                              vector unsigned char __c) {
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
+                              255, 255, 255, 255, 255, 255, 255, 255};
+  __d = vec_xor(__c, __d);
+  return (vector signed long long)__builtin_altivec_vperm_4si(
+      (vector int)__b, (vector int)__a, __d);
+#else
+  return (vector signed long long)__builtin_altivec_vperm_4si(
+      (vector int)__a, (vector int)__b, __c);
+#endif
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_perm(vector unsigned long long __a, vector unsigned long long __b,
+         vector unsigned char __c) {
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
+                              255, 255, 255, 255, 255, 255, 255, 255};
+  __d = vec_xor(__c, __d);
+  return (vector unsigned long long)__builtin_altivec_vperm_4si(
+      (vector int)__b, (vector int)__a, __d);
+#else
+  return (vector unsigned long long)__builtin_altivec_vperm_4si(
+      (vector int)__a, (vector int)__b, __c);
+#endif
+}
+
+static vector bool long long __ATTRS_o_ai
+vec_perm(vector bool long long __a, vector bool long long __b,
+         vector unsigned char __c) {
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
+                              255, 255, 255, 255, 255, 255, 255, 255};
+  __d = vec_xor(__c, __d);
+  return (vector bool long long)__builtin_altivec_vperm_4si(
+      (vector int)__b, (vector int)__a, __d);
+#else
+  return (vector bool long long)__builtin_altivec_vperm_4si(
+      (vector int)__a, (vector int)__b, __c);
+#endif
+}
+
+static vector double __ATTRS_o_ai vec_perm(vector double __a, vector double __b,
+                                           vector unsigned char __c) {
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
+                              255, 255, 255, 255, 255, 255, 255, 255};
+  __d = vec_xor(__c, __d);
+  return (vector double)__builtin_altivec_vperm_4si((vector int)__b,
+                                                    (vector int)__a, __d);
+#else
+  return (vector double)__builtin_altivec_vperm_4si((vector int)__a,
+                                                    (vector int)__b, __c);
+#endif
+}
+#endif
+
+/* vec_vperm */
+
+static vector signed char __ATTRS_o_ai vec_vperm(vector signed char __a,
+                                                 vector signed char __b,
+                                                 vector unsigned char __c) {
+  return vec_perm(__a, __b, __c);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vperm(vector unsigned char __a,
+                                                   vector unsigned char __b,
+                                                   vector unsigned char __c) {
+  return vec_perm(__a, __b, __c);
+}
+
+static vector bool char __ATTRS_o_ai vec_vperm(vector bool char __a,
+                                               vector bool char __b,
+                                               vector unsigned char __c) {
+  return vec_perm(__a, __b, __c);
+}
+
+static vector short __ATTRS_o_ai vec_vperm(vector short __a, vector short __b,
+                                           vector unsigned char __c) {
+  return vec_perm(__a, __b, __c);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vperm(vector unsigned short __a,
+                                                    vector unsigned short __b,
+                                                    vector unsigned char __c) {
+  return vec_perm(__a, __b, __c);
+}
+
+static vector bool short __ATTRS_o_ai vec_vperm(vector bool short __a,
+                                                vector bool short __b,
+                                                vector unsigned char __c) {
+  return vec_perm(__a, __b, __c);
+}
+
+static vector pixel __ATTRS_o_ai vec_vperm(vector pixel __a, vector pixel __b,
+                                           vector unsigned char __c) {
+  return vec_perm(__a, __b, __c);
+}
+
+static vector int __ATTRS_o_ai vec_vperm(vector int __a, vector int __b,
+                                         vector unsigned char __c) {
+  return vec_perm(__a, __b, __c);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vperm(vector unsigned int __a,
+                                                  vector unsigned int __b,
+                                                  vector unsigned char __c) {
+  return vec_perm(__a, __b, __c);
+}
+
+static vector bool int __ATTRS_o_ai vec_vperm(vector bool int __a,
+                                              vector bool int __b,
+                                              vector unsigned char __c) {
+  return vec_perm(__a, __b, __c);
+}
+
+static vector float __ATTRS_o_ai vec_vperm(vector float __a, vector float __b,
+                                           vector unsigned char __c) {
+  return vec_perm(__a, __b, __c);
+}
+
+#ifdef __VSX__
+static vector long long __ATTRS_o_ai vec_vperm(vector long long __a,
+                                               vector long long __b,
+                                               vector unsigned char __c) {
+  return vec_perm(__a, __b, __c);
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_vperm(vector unsigned long long __a, vector unsigned long long __b,
+          vector unsigned char __c) {
+  return vec_perm(__a, __b, __c);
+}
+
+static vector double __ATTRS_o_ai vec_vperm(vector double __a,
+                                            vector double __b,
+                                            vector unsigned char __c) {
+  return vec_perm(__a, __b, __c);
+}
+#endif
+
+/* vec_re */
+
+static vector float __ATTRS_o_ai
+vec_re(vector float __a) {
+#ifdef __VSX__
+  return __builtin_vsx_xvresp(__a);
+#else
+  return __builtin_altivec_vrefp(__a);
+#endif
+}
+
+#ifdef __VSX__
+static vector double __ATTRS_o_ai vec_re(vector double __a) {
+  return __builtin_vsx_xvredp(__a);
+}
+#endif
+
+/* vec_vrefp */
+
+static vector float __attribute__((__always_inline__))
+vec_vrefp(vector float __a) {
+  return __builtin_altivec_vrefp(__a);
+}
+
+/* vec_rl */
+
+static vector signed char __ATTRS_o_ai vec_rl(vector signed char __a,
+                                              vector unsigned char __b) {
+  return (vector signed char)__builtin_altivec_vrlb((vector char)__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_rl(vector unsigned char __a,
+                                                vector unsigned char __b) {
+  return (vector unsigned char)__builtin_altivec_vrlb((vector char)__a, __b);
+}
+
+static vector short __ATTRS_o_ai vec_rl(vector short __a,
+                                        vector unsigned short __b) {
+  return __builtin_altivec_vrlh(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_rl(vector unsigned short __a,
+                                                 vector unsigned short __b) {
+  return (vector unsigned short)__builtin_altivec_vrlh((vector short)__a, __b);
+}
+
+static vector int __ATTRS_o_ai vec_rl(vector int __a, vector unsigned int __b) {
+  return __builtin_altivec_vrlw(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_rl(vector unsigned int __a,
+                                               vector unsigned int __b) {
+  return (vector unsigned int)__builtin_altivec_vrlw((vector int)__a, __b);
+}
+
+#ifdef __POWER8_VECTOR__
+static vector signed long long __ATTRS_o_ai
+vec_rl(vector signed long long __a, vector unsigned long long __b) {
+  return __builtin_altivec_vrld(__a, __b);
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_rl(vector unsigned long long __a, vector unsigned long long __b) {
+  return __builtin_altivec_vrld(__a, __b);
+}
+#endif
+
+/* vec_vrlb */
+
+static vector signed char __ATTRS_o_ai vec_vrlb(vector signed char __a,
+                                                vector unsigned char __b) {
+  return (vector signed char)__builtin_altivec_vrlb((vector char)__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vrlb(vector unsigned char __a,
+                                                  vector unsigned char __b) {
+  return (vector unsigned char)__builtin_altivec_vrlb((vector char)__a, __b);
+}
+
+/* vec_vrlh */
+
+static vector short __ATTRS_o_ai vec_vrlh(vector short __a,
+                                          vector unsigned short __b) {
+  return __builtin_altivec_vrlh(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vrlh(vector unsigned short __a,
+                                                   vector unsigned short __b) {
+  return (vector unsigned short)__builtin_altivec_vrlh((vector short)__a, __b);
+}
+
+/* vec_vrlw */
+
+static vector int __ATTRS_o_ai vec_vrlw(vector int __a,
+                                        vector unsigned int __b) {
+  return __builtin_altivec_vrlw(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vrlw(vector unsigned int __a,
+                                                 vector unsigned int __b) {
+  return (vector unsigned int)__builtin_altivec_vrlw((vector int)__a, __b);
+}
+
+/* vec_round */
+
+static vector float __ATTRS_o_ai vec_round(vector float __a) {
+#ifdef __VSX__
+  return __builtin_vsx_xvrspi(__a);
+#else
+  return __builtin_altivec_vrfin(__a);
+#endif
+}
+
+#ifdef __VSX__
+static vector double __ATTRS_o_ai vec_round(vector double __a) {
+  return __builtin_vsx_xvrdpi(__a);
+}
+
+/* vec_rint */
+
+static vector float __ATTRS_o_ai
+vec_rint(vector float __a) {
+  return __builtin_vsx_xvrspic(__a);
+}
+
+static vector double __ATTRS_o_ai
+vec_rint(vector double __a) {
+  return __builtin_vsx_xvrdpic(__a);
+}
+
+/* vec_nearbyint */
+
+static vector float __ATTRS_o_ai vec_nearbyint(vector float __a) {
+  return __builtin_vsx_xvrspi(__a);
+}
+
+static vector double __ATTRS_o_ai vec_nearbyint(vector double __a) {
+  return __builtin_vsx_xvrdpi(__a);
+}
+#endif
+
+/* vec_vrfin */
+
+static vector float __attribute__((__always_inline__))
+vec_vrfin(vector float __a) {
+  return __builtin_altivec_vrfin(__a);
+}
+
+/* vec_sqrt */
+
+#ifdef __VSX__
+static vector float __ATTRS_o_ai vec_sqrt(vector float __a) {
+  return __builtin_vsx_xvsqrtsp(__a);
+}
+
+static vector double __ATTRS_o_ai vec_sqrt(vector double __a) {
+  return __builtin_vsx_xvsqrtdp(__a);
+}
+#endif
+
+/* vec_rsqrte */
+
+static vector float __ATTRS_o_ai
+vec_rsqrte(vector float __a) {
+#ifdef __VSX__
+  return __builtin_vsx_xvrsqrtesp(__a);
+#else
+  return __builtin_altivec_vrsqrtefp(__a);
+#endif
+}
+
+#ifdef __VSX__
+static vector double __ATTRS_o_ai vec_rsqrte(vector double __a) {
+  return __builtin_vsx_xvrsqrtedp(__a);
+}
+#endif
+
+/* vec_vrsqrtefp */
+
+static __vector float __attribute__((__always_inline__))
+vec_vrsqrtefp(vector float __a) {
+  return __builtin_altivec_vrsqrtefp(__a);
+}
+
+/* vec_sel */
+
+#define __builtin_altivec_vsel_4si vec_sel
+
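+// vec_sel selects each result bit from __b where the corresponding bit of
+// __c is set and from __a where it is clear: (__a & ~__c) | (__b & __c).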
+static vector signed char __ATTRS_o_ai vec_sel(vector signed char __a,
+                                               vector signed char __b,
+                                               vector unsigned char __c) {
+  return (__a & ~(vector signed char)__c) | (__b & (vector signed char)__c);
+}
+
+static vector signed char __ATTRS_o_ai vec_sel(vector signed char __a,
+                                               vector signed char __b,
+                                               vector bool char __c) {
+  return (__a & ~(vector signed char)__c) | (__b & (vector signed char)__c);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_sel(vector unsigned char __a,
+                                                 vector unsigned char __b,
+                                                 vector unsigned char __c) {
+  return (__a & ~__c) | (__b & __c);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_sel(vector unsigned char __a,
+                                                 vector unsigned char __b,
+                                                 vector bool char __c) {
+  return (__a & ~(vector unsigned char)__c) | (__b & (vector unsigned char)__c);
+}
+
+static vector bool char __ATTRS_o_ai vec_sel(vector bool char __a,
+                                             vector bool char __b,
+                                             vector unsigned char __c) {
+  return (__a & ~(vector bool char)__c) | (__b & (vector bool char)__c);
+}
+
+static vector bool char __ATTRS_o_ai vec_sel(vector bool char __a,
+                                             vector bool char __b,
+                                             vector bool char __c) {
+  return (__a & ~__c) | (__b & __c);
+}
+
+static vector short __ATTRS_o_ai vec_sel(vector short __a, vector short __b,
+                                         vector unsigned short __c) {
+  return (__a & ~(vector short)__c) | (__b & (vector short)__c);
+}
+
+static vector short __ATTRS_o_ai vec_sel(vector short __a, vector short __b,
+                                         vector bool short __c) {
+  return (__a & ~(vector short)__c) | (__b & (vector short)__c);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_sel(vector unsigned short __a,
+                                                  vector unsigned short __b,
+                                                  vector unsigned short __c) {
+  return (__a & ~__c) | (__b & __c);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_sel(vector unsigned short __a,
+                                                  vector unsigned short __b,
+                                                  vector bool short __c) {
+  return (__a & ~(vector unsigned short)__c) |
+         (__b & (vector unsigned short)__c);
+}
+
+static vector bool short __ATTRS_o_ai vec_sel(vector bool short __a,
+                                              vector bool short __b,
+                                              vector unsigned short __c) {
+  return (__a & ~(vector bool short)__c) | (__b & (vector bool short)__c);
+}
+
+static vector bool short __ATTRS_o_ai vec_sel(vector bool short __a,
+                                              vector bool short __b,
+                                              vector bool short __c) {
+  return (__a & ~__c) | (__b & __c);
+}
+
+static vector int __ATTRS_o_ai vec_sel(vector int __a, vector int __b,
+                                       vector unsigned int __c) {
+  return (__a & ~(vector int)__c) | (__b & (vector int)__c);
+}
+
+static vector int __ATTRS_o_ai vec_sel(vector int __a, vector int __b,
+                                       vector bool int __c) {
+  return (__a & ~(vector int)__c) | (__b & (vector int)__c);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_sel(vector unsigned int __a,
+                                                vector unsigned int __b,
+                                                vector unsigned int __c) {
+  return (__a & ~__c) | (__b & __c);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_sel(vector unsigned int __a,
+                                                vector unsigned int __b,
+                                                vector bool int __c) {
+  return (__a & ~(vector unsigned int)__c) | (__b & (vector unsigned int)__c);
+}
+
+static vector bool int __ATTRS_o_ai vec_sel(vector bool int __a,
+                                            vector bool int __b,
+                                            vector unsigned int __c) {
+  return (__a & ~(vector bool int)__c) | (__b & (vector bool int)__c);
+}
+
+static vector bool int __ATTRS_o_ai vec_sel(vector bool int __a,
+                                            vector bool int __b,
+                                            vector bool int __c) {
+  return (__a & ~__c) | (__b & __c);
+}
+
+static vector float __ATTRS_o_ai vec_sel(vector float __a, vector float __b,
+                                         vector unsigned int __c) {
+  vector int __res = ((vector int)__a & ~(vector int)__c) |
+                     ((vector int)__b & (vector int)__c);
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai vec_sel(vector float __a, vector float __b,
+                                         vector bool int __c) {
+  vector int __res = ((vector int)__a & ~(vector int)__c) |
+                     ((vector int)__b & (vector int)__c);
+  return (vector float)__res;
+}
+
+#ifdef __VSX__
+static vector double __ATTRS_o_ai vec_sel(vector double __a, vector double __b,
+                                          vector bool long long __c) {
+  vector long long __res = ((vector long long)__a & ~(vector long long)__c) |
+                           ((vector long long)__b & (vector long long)__c);
+  return (vector double)__res;
+}
+
+static vector double __ATTRS_o_ai vec_sel(vector double __a, vector double __b,
+                                          vector unsigned long long __c) {
+  vector long long __res = ((vector long long)__a & ~(vector long long)__c) |
+                           ((vector long long)__b & (vector long long)__c);
+  return (vector double)__res;
+}
+#endif
+
+/* vec_vsel */
+
+static vector signed char __ATTRS_o_ai vec_vsel(vector signed char __a,
+                                                vector signed char __b,
+                                                vector unsigned char __c) {
+  return (__a & ~(vector signed char)__c) | (__b & (vector signed char)__c);
+}
+
+static vector signed char __ATTRS_o_ai vec_vsel(vector signed char __a,
+                                                vector signed char __b,
+                                                vector bool char __c) {
+  return (__a & ~(vector signed char)__c) | (__b & (vector signed char)__c);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vsel(vector unsigned char __a,
+                                                  vector unsigned char __b,
+                                                  vector unsigned char __c) {
+  return (__a & ~__c) | (__b & __c);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vsel(vector unsigned char __a,
+                                                  vector unsigned char __b,
+                                                  vector bool char __c) {
+  return (__a & ~(vector unsigned char)__c) | (__b & (vector unsigned char)__c);
+}
+
+static vector bool char __ATTRS_o_ai vec_vsel(vector bool char __a,
+                                              vector bool char __b,
+                                              vector unsigned char __c) {
+  return (__a & ~(vector bool char)__c) | (__b & (vector bool char)__c);
+}
+
+static vector bool char __ATTRS_o_ai vec_vsel(vector bool char __a,
+                                              vector bool char __b,
+                                              vector bool char __c) {
+  return (__a & ~__c) | (__b & __c);
+}
+
+static vector short __ATTRS_o_ai vec_vsel(vector short __a, vector short __b,
+                                          vector unsigned short __c) {
+  return (__a & ~(vector short)__c) | (__b & (vector short)__c);
+}
+
+static vector short __ATTRS_o_ai vec_vsel(vector short __a, vector short __b,
+                                          vector bool short __c) {
+  return (__a & ~(vector short)__c) | (__b & (vector short)__c);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vsel(vector unsigned short __a,
+                                                   vector unsigned short __b,
+                                                   vector unsigned short __c) {
+  return (__a & ~__c) | (__b & __c);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vsel(vector unsigned short __a,
+                                                   vector unsigned short __b,
+                                                   vector bool short __c) {
+  return (__a & ~(vector unsigned short)__c) |
+         (__b & (vector unsigned short)__c);
+}
+
+static vector bool short __ATTRS_o_ai vec_vsel(vector bool short __a,
+                                               vector bool short __b,
+                                               vector unsigned short __c) {
+  return (__a & ~(vector bool short)__c) | (__b & (vector bool short)__c);
+}
+
+static vector bool short __ATTRS_o_ai vec_vsel(vector bool short __a,
+                                               vector bool short __b,
+                                               vector bool short __c) {
+  return (__a & ~__c) | (__b & __c);
+}
+
+static vector int __ATTRS_o_ai vec_vsel(vector int __a, vector int __b,
+                                        vector unsigned int __c) {
+  return (__a & ~(vector int)__c) | (__b & (vector int)__c);
+}
+
+static vector int __ATTRS_o_ai vec_vsel(vector int __a, vector int __b,
+                                        vector bool int __c) {
+  return (__a & ~(vector int)__c) | (__b & (vector int)__c);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vsel(vector unsigned int __a,
+                                                 vector unsigned int __b,
+                                                 vector unsigned int __c) {
+  return (__a & ~__c) | (__b & __c);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vsel(vector unsigned int __a,
+                                                 vector unsigned int __b,
+                                                 vector bool int __c) {
+  return (__a & ~(vector unsigned int)__c) | (__b & (vector unsigned int)__c);
+}
+
+static vector bool int __ATTRS_o_ai vec_vsel(vector bool int __a,
+                                             vector bool int __b,
+                                             vector unsigned int __c) {
+  return (__a & ~(vector bool int)__c) | (__b & (vector bool int)__c);
+}
+
+static vector bool int __ATTRS_o_ai vec_vsel(vector bool int __a,
+                                             vector bool int __b,
+                                             vector bool int __c) {
+  return (__a & ~__c) | (__b & __c);
+}
+
+static vector float __ATTRS_o_ai vec_vsel(vector float __a, vector float __b,
+                                          vector unsigned int __c) {
+  vector int __res = ((vector int)__a & ~(vector int)__c) |
+                     ((vector int)__b & (vector int)__c);
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai vec_vsel(vector float __a, vector float __b,
+                                          vector bool int __c) {
+  vector int __res = ((vector int)__a & ~(vector int)__c) |
+                     ((vector int)__b & (vector int)__c);
+  return (vector float)__res;
+}
+
+/* vec_sl */
+
+static vector signed char __ATTRS_o_ai vec_sl(vector signed char __a,
+                                              vector unsigned char __b) {
+  return __a << (vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_sl(vector unsigned char __a,
+                                                vector unsigned char __b) {
+  return __a << __b;
+}
+
+static vector short __ATTRS_o_ai vec_sl(vector short __a,
+                                        vector unsigned short __b) {
+  return __a << (vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_sl(vector unsigned short __a,
+                                                 vector unsigned short __b) {
+  return __a << __b;
+}
+
+static vector int __ATTRS_o_ai vec_sl(vector int __a, vector unsigned int __b) {
+  return __a << (vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_sl(vector unsigned int __a,
+                                               vector unsigned int __b) {
+  return __a << __b;
+}
+
+#ifdef __POWER8_VECTOR__
+static vector signed long long __ATTRS_o_ai
+vec_sl(vector signed long long __a, vector unsigned long long __b) {
+  return __a << (vector long long)__b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_sl(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a << __b;
+}
+#endif
+
+/* vec_vslb */
+
+#define __builtin_altivec_vslb vec_vslb
+
+static vector signed char __ATTRS_o_ai vec_vslb(vector signed char __a,
+                                                vector unsigned char __b) {
+  return vec_sl(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vslb(vector unsigned char __a,
+                                                  vector unsigned char __b) {
+  return vec_sl(__a, __b);
+}
+
+/* vec_vslh */
+
+#define __builtin_altivec_vslh vec_vslh
+
+static vector short __ATTRS_o_ai vec_vslh(vector short __a,
+                                          vector unsigned short __b) {
+  return vec_sl(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vslh(vector unsigned short __a,
+                                                   vector unsigned short __b) {
+  return vec_sl(__a, __b);
+}
+
+/* vec_vslw */
+
+#define __builtin_altivec_vslw vec_vslw
+
+static vector int __ATTRS_o_ai vec_vslw(vector int __a,
+                                        vector unsigned int __b) {
+  return vec_sl(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vslw(vector unsigned int __a,
+                                                 vector unsigned int __b) {
+  return vec_sl(__a, __b);
+}
+
+/* vec_sld */
+
+#define __builtin_altivec_vsldoi_4si vec_sld
+
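+// vec_sld returns 16 consecutive bytes of the concatenation of __a and __b
+// starting at byte offset __c (masked to the range 0-15).  The big-endian
+// permute control enumerates bytes __c..__c+15 of {__a, __b}; for little
+// endian the operands are swapped and the indices adjusted to address the
+// same data.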
+static vector signed char __ATTRS_o_ai vec_sld(vector signed char __a,
+                                               vector signed char __b,
+                                               unsigned const int __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a,
+      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
+                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
+                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
+                             31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static vector unsigned char __ATTRS_o_ai vec_sld(vector unsigned char __a,
+                                                 vector unsigned char __b,
+                                                 unsigned const int __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a,
+      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
+                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
+                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
+                             31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static vector bool char __ATTRS_o_ai vec_sld(vector bool char __a,
+                                             vector bool char __b,
+                                             unsigned const int __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a,
+      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
+                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
+                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
+                             31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static vector signed short __ATTRS_o_ai vec_sld(vector signed short __a,
+                                                vector signed short __b,
+                                                unsigned const int __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a,
+      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
+                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
+                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
+                             31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static vector unsigned short __ATTRS_o_ai vec_sld(vector unsigned short __a,
+                                                  vector unsigned short __b,
+                                                  unsigned const int __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a,
+      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
+                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
+                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
+                             31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static vector bool short __ATTRS_o_ai vec_sld(vector bool short __a,
+                                              vector bool short __b,
+                                              unsigned const int __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a,
+      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
+                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
+                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
+                             31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static vector pixel __ATTRS_o_ai vec_sld(vector pixel __a, vector pixel __b,
+                                         unsigned const int __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a,
+      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
+                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
+                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
+                             31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static vector signed int __ATTRS_o_ai vec_sld(vector signed int __a,
+                                              vector signed int __b,
+                                              unsigned const int __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a,
+      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
+                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
+                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
+                             31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static vector unsigned int __ATTRS_o_ai vec_sld(vector unsigned int __a,
+                                                vector unsigned int __b,
+                                                unsigned const int __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a,
+      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
+                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
+                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
+                             31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static vector bool int __ATTRS_o_ai vec_sld(vector bool int __a,
+                                            vector bool int __b,
+                                            unsigned const int __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a,
+      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
+                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
+                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
+                             31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static vector float __ATTRS_o_ai vec_sld(vector float __a, vector float __b,
+                                         unsigned const int __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a,
+      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
+                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
+                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
+                             31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+/* vec_vsldoi */
+
+static vector signed char __ATTRS_o_ai vec_vsldoi(vector signed char __a,
+                                                  vector signed char __b,
+                                                  unsigned char __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a,
+      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
+                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
+                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
+                             31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vsldoi(vector unsigned char __a,
+                                                    vector unsigned char __b,
+                                                    unsigned char __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a,
+      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
+                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
+                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
+                             31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static vector short __ATTRS_o_ai vec_vsldoi(vector short __a, vector short __b,
+                                            unsigned char __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a,
+      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
+                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
+                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
+                             31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vsldoi(vector unsigned short __a,
+                                                     vector unsigned short __b,
+                                                     unsigned char __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a,
+      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
+                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
+                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
+                             31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static vector pixel __ATTRS_o_ai vec_vsldoi(vector pixel __a, vector pixel __b,
+                                            unsigned char __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a,
+      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
+                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
+                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
+                             31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static vector int __ATTRS_o_ai vec_vsldoi(vector int __a, vector int __b,
+                                          unsigned char __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a,
+      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
+                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
+                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
+                             31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vsldoi(vector unsigned int __a,
+                                                   vector unsigned int __b,
+                                                   unsigned char __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a,
+      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
+                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
+                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
+                             31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static vector float __ATTRS_o_ai vec_vsldoi(vector float __a, vector float __b,
+                                            unsigned char __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a,
+      (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, 20 - __d,
+                             21 - __d, 22 - __d, 23 - __d, 24 - __d, 25 - __d,
+                             26 - __d, 27 - __d, 28 - __d, 29 - __d, 30 - __d,
+                             31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+/* vec_sll */
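+/* Shift the entire 128-bit contents of __a left by 0-7 bits (vsl); the shift
+   count is taken from the low three bits of __b, which must be the same in
+   every byte of __b for the result to be defined. */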
+
+static vector signed char __ATTRS_o_ai vec_sll(vector signed char __a,
+                                               vector unsigned char __b) {
+  return (vector signed char)__builtin_altivec_vsl((vector int)__a,
+                                                   (vector int)__b);
+}
+
+static vector signed char __ATTRS_o_ai vec_sll(vector signed char __a,
+                                               vector unsigned short __b) {
+  return (vector signed char)__builtin_altivec_vsl((vector int)__a,
+                                                   (vector int)__b);
+}
+
+static vector signed char __ATTRS_o_ai vec_sll(vector signed char __a,
+                                               vector unsigned int __b) {
+  return (vector signed char)__builtin_altivec_vsl((vector int)__a,
+                                                   (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_sll(vector unsigned char __a,
+                                                 vector unsigned char __b) {
+  return (vector unsigned char)__builtin_altivec_vsl((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_sll(vector unsigned char __a,
+                                                 vector unsigned short __b) {
+  return (vector unsigned char)__builtin_altivec_vsl((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_sll(vector unsigned char __a,
+                                                 vector unsigned int __b) {
+  return (vector unsigned char)__builtin_altivec_vsl((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static vector bool char __ATTRS_o_ai vec_sll(vector bool char __a,
+                                             vector unsigned char __b) {
+  return (vector bool char)__builtin_altivec_vsl((vector int)__a,
+                                                 (vector int)__b);
+}
+
+static vector bool char __ATTRS_o_ai vec_sll(vector bool char __a,
+                                             vector unsigned short __b) {
+  return (vector bool char)__builtin_altivec_vsl((vector int)__a,
+                                                 (vector int)__b);
+}
+
+static vector bool char __ATTRS_o_ai vec_sll(vector bool char __a,
+                                             vector unsigned int __b) {
+  return (vector bool char)__builtin_altivec_vsl((vector int)__a,
+                                                 (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai vec_sll(vector short __a,
+                                         vector unsigned char __b) {
+  return (vector short)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai vec_sll(vector short __a,
+                                         vector unsigned short __b) {
+  return (vector short)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai vec_sll(vector short __a,
+                                         vector unsigned int __b) {
+  return (vector short)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_sll(vector unsigned short __a,
+                                                  vector unsigned char __b) {
+  return (vector unsigned short)__builtin_altivec_vsl((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_sll(vector unsigned short __a,
+                                                  vector unsigned short __b) {
+  return (vector unsigned short)__builtin_altivec_vsl((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_sll(vector unsigned short __a,
+                                                  vector unsigned int __b) {
+  return (vector unsigned short)__builtin_altivec_vsl((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static vector bool short __ATTRS_o_ai vec_sll(vector bool short __a,
+                                              vector unsigned char __b) {
+  return (vector bool short)__builtin_altivec_vsl((vector int)__a,
+                                                  (vector int)__b);
+}
+
+static vector bool short __ATTRS_o_ai vec_sll(vector bool short __a,
+                                              vector unsigned short __b) {
+  return (vector bool short)__builtin_altivec_vsl((vector int)__a,
+                                                  (vector int)__b);
+}
+
+static vector bool short __ATTRS_o_ai vec_sll(vector bool short __a,
+                                              vector unsigned int __b) {
+  return (vector bool short)__builtin_altivec_vsl((vector int)__a,
+                                                  (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai vec_sll(vector pixel __a,
+                                         vector unsigned char __b) {
+  return (vector pixel)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai vec_sll(vector pixel __a,
+                                         vector unsigned short __b) {
+  return (vector pixel)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai vec_sll(vector pixel __a,
+                                         vector unsigned int __b) {
+  return (vector pixel)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai vec_sll(vector int __a,
+                                       vector unsigned char __b) {
+  return (vector int)__builtin_altivec_vsl(__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai vec_sll(vector int __a,
+                                       vector unsigned short __b) {
+  return (vector int)__builtin_altivec_vsl(__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai vec_sll(vector int __a,
+                                       vector unsigned int __b) {
+  return (vector int)__builtin_altivec_vsl(__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_sll(vector unsigned int __a,
+                                                vector unsigned char __b) {
+  return (vector unsigned int)__builtin_altivec_vsl((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_sll(vector unsigned int __a,
+                                                vector unsigned short __b) {
+  return (vector unsigned int)__builtin_altivec_vsl((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_sll(vector unsigned int __a,
+                                                vector unsigned int __b) {
+  return (vector unsigned int)__builtin_altivec_vsl((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static vector bool int __ATTRS_o_ai vec_sll(vector bool int __a,
+                                            vector unsigned char __b) {
+  return (vector bool int)__builtin_altivec_vsl((vector int)__a,
+                                                (vector int)__b);
+}
+
+static vector bool int __ATTRS_o_ai vec_sll(vector bool int __a,
+                                            vector unsigned short __b) {
+  return (vector bool int)__builtin_altivec_vsl((vector int)__a,
+                                                (vector int)__b);
+}
+
+static vector bool int __ATTRS_o_ai vec_sll(vector bool int __a,
+                                            vector unsigned int __b) {
+  return (vector bool int)__builtin_altivec_vsl((vector int)__a,
+                                                (vector int)__b);
+}
+
+/* vec_vsl */
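+/* Instruction-named form of vec_sll; identical behavior. */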
+
+static vector signed char __ATTRS_o_ai vec_vsl(vector signed char __a,
+                                               vector unsigned char __b) {
+  return (vector signed char)__builtin_altivec_vsl((vector int)__a,
+                                                   (vector int)__b);
+}
+
+static vector signed char __ATTRS_o_ai vec_vsl(vector signed char __a,
+                                               vector unsigned short __b) {
+  return (vector signed char)__builtin_altivec_vsl((vector int)__a,
+                                                   (vector int)__b);
+}
+
+static vector signed char __ATTRS_o_ai vec_vsl(vector signed char __a,
+                                               vector unsigned int __b) {
+  return (vector signed char)__builtin_altivec_vsl((vector int)__a,
+                                                   (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vsl(vector unsigned char __a,
+                                                 vector unsigned char __b) {
+  return (vector unsigned char)__builtin_altivec_vsl((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vsl(vector unsigned char __a,
+                                                 vector unsigned short __b) {
+  return (vector unsigned char)__builtin_altivec_vsl((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vsl(vector unsigned char __a,
+                                                 vector unsigned int __b) {
+  return (vector unsigned char)__builtin_altivec_vsl((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static vector bool char __ATTRS_o_ai vec_vsl(vector bool char __a,
+                                             vector unsigned char __b) {
+  return (vector bool char)__builtin_altivec_vsl((vector int)__a,
+                                                 (vector int)__b);
+}
+
+static vector bool char __ATTRS_o_ai vec_vsl(vector bool char __a,
+                                             vector unsigned short __b) {
+  return (vector bool char)__builtin_altivec_vsl((vector int)__a,
+                                                 (vector int)__b);
+}
+
+static vector bool char __ATTRS_o_ai vec_vsl(vector bool char __a,
+                                             vector unsigned int __b) {
+  return (vector bool char)__builtin_altivec_vsl((vector int)__a,
+                                                 (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai vec_vsl(vector short __a,
+                                         vector unsigned char __b) {
+  return (vector short)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai vec_vsl(vector short __a,
+                                         vector unsigned short __b) {
+  return (vector short)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai vec_vsl(vector short __a,
+                                         vector unsigned int __b) {
+  return (vector short)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vsl(vector unsigned short __a,
+                                                  vector unsigned char __b) {
+  return (vector unsigned short)__builtin_altivec_vsl((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vsl(vector unsigned short __a,
+                                                  vector unsigned short __b) {
+  return (vector unsigned short)__builtin_altivec_vsl((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vsl(vector unsigned short __a,
+                                                  vector unsigned int __b) {
+  return (vector unsigned short)__builtin_altivec_vsl((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static vector bool short __ATTRS_o_ai vec_vsl(vector bool short __a,
+                                              vector unsigned char __b) {
+  return (vector bool short)__builtin_altivec_vsl((vector int)__a,
+                                                  (vector int)__b);
+}
+
+static vector bool short __ATTRS_o_ai vec_vsl(vector bool short __a,
+                                              vector unsigned short __b) {
+  return (vector bool short)__builtin_altivec_vsl((vector int)__a,
+                                                  (vector int)__b);
+}
+
+static vector bool short __ATTRS_o_ai vec_vsl(vector bool short __a,
+                                              vector unsigned int __b) {
+  return (vector bool short)__builtin_altivec_vsl((vector int)__a,
+                                                  (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai vec_vsl(vector pixel __a,
+                                         vector unsigned char __b) {
+  return (vector pixel)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai vec_vsl(vector pixel __a,
+                                         vector unsigned short __b) {
+  return (vector pixel)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai vec_vsl(vector pixel __a,
+                                         vector unsigned int __b) {
+  return (vector pixel)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai vec_vsl(vector int __a,
+                                       vector unsigned char __b) {
+  return (vector int)__builtin_altivec_vsl(__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai vec_vsl(vector int __a,
+                                       vector unsigned short __b) {
+  return (vector int)__builtin_altivec_vsl(__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai vec_vsl(vector int __a,
+                                       vector unsigned int __b) {
+  return (vector int)__builtin_altivec_vsl(__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vsl(vector unsigned int __a,
+                                                vector unsigned char __b) {
+  return (vector unsigned int)__builtin_altivec_vsl((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vsl(vector unsigned int __a,
+                                                vector unsigned short __b) {
+  return (vector unsigned int)__builtin_altivec_vsl((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vsl(vector unsigned int __a,
+                                                vector unsigned int __b) {
+  return (vector unsigned int)__builtin_altivec_vsl((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static vector bool int __ATTRS_o_ai vec_vsl(vector bool int __a,
+                                            vector unsigned char __b) {
+  return (vector bool int)__builtin_altivec_vsl((vector int)__a,
+                                                (vector int)__b);
+}
+
+static vector bool int __ATTRS_o_ai vec_vsl(vector bool int __a,
+                                            vector unsigned short __b) {
+  return (vector bool int)__builtin_altivec_vsl((vector int)__a,
+                                                (vector int)__b);
+}
+
+static vector bool int __ATTRS_o_ai vec_vsl(vector bool int __a,
+                                            vector unsigned int __b) {
+  return (vector bool int)__builtin_altivec_vsl((vector int)__a,
+                                                (vector int)__b);
+}
+
+/* vec_slo */
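+/* Shift the entire 128-bit contents of __a left by whole bytes (vslo); the
+   byte count (0-15) comes from __b. */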
+
+static vector signed char __ATTRS_o_ai vec_slo(vector signed char __a,
+                                               vector signed char __b) {
+  return (vector signed char)__builtin_altivec_vslo((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static vector signed char __ATTRS_o_ai vec_slo(vector signed char __a,
+                                               vector unsigned char __b) {
+  return (vector signed char)__builtin_altivec_vslo((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_slo(vector unsigned char __a,
+                                                 vector signed char __b) {
+  return (vector unsigned char)__builtin_altivec_vslo((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_slo(vector unsigned char __a,
+                                                 vector unsigned char __b) {
+  return (vector unsigned char)__builtin_altivec_vslo((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai vec_slo(vector short __a,
+                                         vector signed char __b) {
+  return (vector short)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai vec_slo(vector short __a,
+                                         vector unsigned char __b) {
+  return (vector short)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_slo(vector unsigned short __a,
+                                                  vector signed char __b) {
+  return (vector unsigned short)__builtin_altivec_vslo((vector int)__a,
+                                                       (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_slo(vector unsigned short __a,
+                                                  vector unsigned char __b) {
+  return (vector unsigned short)__builtin_altivec_vslo((vector int)__a,
+                                                       (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai vec_slo(vector pixel __a,
+                                         vector signed char __b) {
+  return (vector pixel)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai vec_slo(vector pixel __a,
+                                         vector unsigned char __b) {
+  return (vector pixel)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai vec_slo(vector int __a, vector signed char __b) {
+  return (vector int)__builtin_altivec_vslo(__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai vec_slo(vector int __a,
+                                       vector unsigned char __b) {
+  return (vector int)__builtin_altivec_vslo(__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_slo(vector unsigned int __a,
+                                                vector signed char __b) {
+  return (vector unsigned int)__builtin_altivec_vslo((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_slo(vector unsigned int __a,
+                                                vector unsigned char __b) {
+  return (vector unsigned int)__builtin_altivec_vslo((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static vector float __ATTRS_o_ai vec_slo(vector float __a,
+                                         vector signed char __b) {
+  return (vector float)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector float __ATTRS_o_ai vec_slo(vector float __a,
+                                         vector unsigned char __b) {
+  return (vector float)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+/* vec_vslo */
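+/* Instruction-named form of vec_slo; identical behavior. */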
+
+static vector signed char __ATTRS_o_ai vec_vslo(vector signed char __a,
+                                                vector signed char __b) {
+  return (vector signed char)__builtin_altivec_vslo((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static vector signed char __ATTRS_o_ai vec_vslo(vector signed char __a,
+                                                vector unsigned char __b) {
+  return (vector signed char)__builtin_altivec_vslo((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vslo(vector unsigned char __a,
+                                                  vector signed char __b) {
+  return (vector unsigned char)__builtin_altivec_vslo((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vslo(vector unsigned char __a,
+                                                  vector unsigned char __b) {
+  return (vector unsigned char)__builtin_altivec_vslo((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai vec_vslo(vector short __a,
+                                          vector signed char __b) {
+  return (vector short)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai vec_vslo(vector short __a,
+                                          vector unsigned char __b) {
+  return (vector short)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vslo(vector unsigned short __a,
+                                                   vector signed char __b) {
+  return (vector unsigned short)__builtin_altivec_vslo((vector int)__a,
+                                                       (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vslo(vector unsigned short __a,
+                                                   vector unsigned char __b) {
+  return (vector unsigned short)__builtin_altivec_vslo((vector int)__a,
+                                                       (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai vec_vslo(vector pixel __a,
+                                          vector signed char __b) {
+  return (vector pixel)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai vec_vslo(vector pixel __a,
+                                          vector unsigned char __b) {
+  return (vector pixel)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai vec_vslo(vector int __a,
+                                        vector signed char __b) {
+  return (vector int)__builtin_altivec_vslo(__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai vec_vslo(vector int __a,
+                                        vector unsigned char __b) {
+  return (vector int)__builtin_altivec_vslo(__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vslo(vector unsigned int __a,
+                                                 vector signed char __b) {
+  return (vector unsigned int)__builtin_altivec_vslo((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vslo(vector unsigned int __a,
+                                                 vector unsigned char __b) {
+  return (vector unsigned int)__builtin_altivec_vslo((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static vector float __ATTRS_o_ai vec_vslo(vector float __a,
+                                          vector signed char __b) {
+  return (vector float)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static vector float __ATTRS_o_ai vec_vslo(vector float __a,
+                                          vector unsigned char __b) {
+  return (vector float)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+/* vec_splat */
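+/* Replicate element __b of __a into every element of the result; __b is
+   masked to the number of elements in the vector. */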
+
+static vector signed char __ATTRS_o_ai vec_splat(vector signed char __a,
+                                                 unsigned const int __b) {
+  return vec_perm(__a, __a, (vector unsigned char)(__b & 0x0F));
+}
+
+static vector unsigned char __ATTRS_o_ai vec_splat(vector unsigned char __a,
+                                                   unsigned const int __b) {
+  return vec_perm(__a, __a, (vector unsigned char)(__b & 0x0F));
+}
+
+static vector bool char __ATTRS_o_ai vec_splat(vector bool char __a,
+                                               unsigned const int __b) {
+  return vec_perm(__a, __a, (vector unsigned char)(__b & 0x0F));
+}
+
+static vector signed short __ATTRS_o_ai vec_splat(vector signed short __a,
+                                                  unsigned const int __b) {
+  unsigned char b0 = (__b & 0x07) * 2;
+  unsigned char b1 = b0 + 1;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(b0, b1, b0, b1, b0, b1, b0, b1,
+                                         b0, b1, b0, b1, b0, b1, b0, b1));
+}
+
+static vector unsigned short __ATTRS_o_ai vec_splat(vector unsigned short __a,
+                                                    unsigned const int __b) {
+  unsigned char b0 = (__b & 0x07) * 2;
+  unsigned char b1 = b0 + 1;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(b0, b1, b0, b1, b0, b1, b0, b1,
+                                         b0, b1, b0, b1, b0, b1, b0, b1));
+}
+
+static vector bool short __ATTRS_o_ai vec_splat(vector bool short __a,
+                                                unsigned const int __b) {
+  unsigned char b0 = (__b & 0x07) * 2;
+  unsigned char b1 = b0 + 1;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(b0, b1, b0, b1, b0, b1, b0, b1,
+                                         b0, b1, b0, b1, b0, b1, b0, b1));
+}
+
+static vector pixel __ATTRS_o_ai vec_splat(vector pixel __a,
+                                           unsigned const int __b) {
+  unsigned char b0 = (__b & 0x07) * 2;
+  unsigned char b1 = b0 + 1;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(b0, b1, b0, b1, b0, b1, b0, b1,
+                                         b0, b1, b0, b1, b0, b1, b0, b1));
+}
+
+static vector signed int __ATTRS_o_ai vec_splat(vector signed int __a,
+                                                unsigned const int __b) {
+  unsigned char b0 = (__b & 0x03) * 4;
+  unsigned char b1 = b0 + 1, b2 = b0 + 2, b3 = b0 + 3;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(b0, b1, b2, b3, b0, b1, b2, b3, b0,
+                                         b1, b2, b3, b0, b1, b2, b3));
+}
+
+static vector unsigned int __ATTRS_o_ai vec_splat(vector unsigned int __a,
+                                                  unsigned const int __b) {
+  unsigned char b0 = (__b & 0x03) * 4;
+  unsigned char b1 = b0 + 1, b2 = b0 + 2, b3 = b0 + 3;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(b0, b1, b2, b3, b0, b1, b2, b3, b0,
+                                         b1, b2, b3, b0, b1, b2, b3));
+}
+
+static vector bool int __ATTRS_o_ai vec_splat(vector bool int __a,
+                                              unsigned const int __b) {
+  unsigned char b0 = (__b & 0x03) * 4;
+  unsigned char b1 = b0 + 1, b2 = b0 + 2, b3 = b0 + 3;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(b0, b1, b2, b3, b0, b1, b2, b3, b0,
+                                         b1, b2, b3, b0, b1, b2, b3));
+}
+
+static vector float __ATTRS_o_ai vec_splat(vector float __a,
+                                           unsigned const int __b) {
+  unsigned char b0 = (__b & 0x03) * 4;
+  unsigned char b1 = b0 + 1, b2 = b0 + 2, b3 = b0 + 3;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(b0, b1, b2, b3, b0, b1, b2, b3, b0,
+                                         b1, b2, b3, b0, b1, b2, b3));
+}
+
+#ifdef __VSX__
+static vector double __ATTRS_o_ai vec_splat(vector double __a,
+                                            unsigned const int __b) {
+  unsigned char b0 = (__b & 0x01) * 8;
+  unsigned char b1 = b0 + 1, b2 = b0 + 2, b3 = b0 + 3, b4 = b0 + 4,
+                b5 = b0 + 5, b6 = b0 + 6, b7 = b0 + 7;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(b0, b1, b2, b3, b4, b5, b6, b7,
+                                         b0, b1, b2, b3, b4, b5, b6, b7));
+}
+static vector bool long long __ATTRS_o_ai vec_splat(vector bool long long __a,
+                                                    unsigned const int __b) {
+  unsigned char b0 = (__b & 0x01) * 8;
+  unsigned char b1 = b0 + 1, b2 = b0 + 2, b3 = b0 + 3, b4 = b0 + 4,
+                b5 = b0 + 5, b6 = b0 + 6, b7 = b0 + 7;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(b0, b1, b2, b3, b4, b5, b6, b7,
+                                         b0, b1, b2, b3, b4, b5, b6, b7));
+}
+static vector signed long long __ATTRS_o_ai
+vec_splat(vector signed long long __a, unsigned const int __b) {
+  unsigned char b0 = (__b & 0x01) * 8;
+  unsigned char b1 = b0 + 1, b2 = b0 + 2, b3 = b0 + 3, b4 = b0 + 4,
+                b5 = b0 + 5, b6 = b0 + 6, b7 = b0 + 7;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(b0, b1, b2, b3, b4, b5, b6, b7,
+                                         b0, b1, b2, b3, b4, b5, b6, b7));
+}
+static vector unsigned long long __ATTRS_o_ai
+vec_splat(vector unsigned long long __a, unsigned const int __b) {
+  unsigned char b0 = (__b & 0x01) * 8;
+  unsigned char b1 = b0 + 1, b2 = b0 + 2, b3 = b0 + 3, b4 = b0 + 4,
+                b5 = b0 + 5, b6 = b0 + 6, b7 = b0 + 7;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(b0, b1, b2, b3, b4, b5, b6, b7,
+                                         b0, b1, b2, b3, b4, b5, b6, b7));
+}
+#endif
+
+/* vec_vspltb */
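+/* vec_vspltb, vec_vsplth and vec_vspltw below are the element-width-specific
+   splat forms corresponding to vec_splat for byte, halfword and word vectors. */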
+
+#define __builtin_altivec_vspltb vec_vspltb
+
+static vector signed char __ATTRS_o_ai vec_vspltb(vector signed char __a,
+                                                  unsigned char __b) {
+  return vec_perm(__a, __a, (vector unsigned char)(__b));
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vspltb(vector unsigned char __a,
+                                                    unsigned char __b) {
+  return vec_perm(__a, __a, (vector unsigned char)(__b));
+}
+
+static vector bool char __ATTRS_o_ai vec_vspltb(vector bool char __a,
+                                                unsigned char __b) {
+  return vec_perm(__a, __a, (vector unsigned char)(__b));
+}
+
+/* vec_vsplth */
+
+#define __builtin_altivec_vsplth vec_vsplth
+
+static vector short __ATTRS_o_ai vec_vsplth(vector short __a,
+                                            unsigned char __b) {
+  __b *= 2;
+  unsigned char b1 = __b + 1;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(__b, b1, __b, b1, __b, b1, __b, b1,
+                                         __b, b1, __b, b1, __b, b1, __b, b1));
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vsplth(vector unsigned short __a,
+                                                     unsigned char __b) {
+  __b *= 2;
+  unsigned char b1 = __b + 1;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(__b, b1, __b, b1, __b, b1, __b, b1,
+                                         __b, b1, __b, b1, __b, b1, __b, b1));
+}
+
+static vector bool short __ATTRS_o_ai vec_vsplth(vector bool short __a,
+                                                 unsigned char __b) {
+  __b *= 2;
+  unsigned char b1 = __b + 1;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(__b, b1, __b, b1, __b, b1, __b, b1,
+                                         __b, b1, __b, b1, __b, b1, __b, b1));
+}
+
+static vector pixel __ATTRS_o_ai vec_vsplth(vector pixel __a,
+                                            unsigned char __b) {
+  __b *= 2;
+  unsigned char b1 = __b + 1;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(__b, b1, __b, b1, __b, b1, __b, b1,
+                                         __b, b1, __b, b1, __b, b1, __b, b1));
+}
+
+/* vec_vspltw */
+
+#define __builtin_altivec_vspltw vec_vspltw
+
+static vector int __ATTRS_o_ai vec_vspltw(vector int __a, unsigned char __b) {
+  __b *= 4;
+  unsigned char b1 = __b + 1, b2 = __b + 2, b3 = __b + 3;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(__b, b1, b2, b3, __b, b1, b2, b3, __b,
+                                         b1, b2, b3, __b, b1, b2, b3));
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vspltw(vector unsigned int __a,
+                                                   unsigned char __b) {
+  __b *= 4;
+  unsigned char b1 = __b + 1, b2 = __b + 2, b3 = __b + 3;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(__b, b1, b2, b3, __b, b1, b2, b3, __b,
+                                         b1, b2, b3, __b, b1, b2, b3));
+}
+
+static vector bool int __ATTRS_o_ai vec_vspltw(vector bool int __a,
+                                               unsigned char __b) {
+  __b *= 4;
+  unsigned char b1 = __b + 1, b2 = __b + 2, b3 = __b + 3;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(__b, b1, b2, b3, __b, b1, b2, b3, __b,
+                                         b1, b2, b3, __b, b1, b2, b3));
+}
+
+static vector float __ATTRS_o_ai vec_vspltw(vector float __a,
+                                            unsigned char __b) {
+  __b *= 4;
+  unsigned char b1 = __b + 1, b2 = __b + 2, b3 = __b + 3;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(__b, b1, b2, b3, __b, b1, b2, b3, __b,
+                                         b1, b2, b3, __b, b1, b2, b3));
+}
+
+/* vec_splat_s8 */
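+/* vec_splat_s8 and the related vec_splat_s16/s32, vec_splat_u8/u16/u32 and
+   vec_vspltis* functions below build a vector with every element set to the
+   given immediate; as the FIXMEs note, the nominal 5-bit signed literal
+   restriction on the argument is not enforced here. */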
+
+#define __builtin_altivec_vspltisb vec_splat_s8
+
+// FIXME: parameter should be treated as 5-bit signed literal
+static vector signed char __ATTRS_o_ai vec_splat_s8(signed char __a) {
+  return (vector signed char)(__a);
+}
+
+/* vec_vspltisb */
+
+// FIXME: parameter should be treated as 5-bit signed literal
+static vector signed char __ATTRS_o_ai vec_vspltisb(signed char __a) {
+  return (vector signed char)(__a);
+}
+
+/* vec_splat_s16 */
+
+#define __builtin_altivec_vspltish vec_splat_s16
+
+// FIXME: parameter should be treated as 5-bit signed literal
+static vector short __ATTRS_o_ai vec_splat_s16(signed char __a) {
+  return (vector short)(__a);
+}
+
+/* vec_vspltish */
+
+// FIXME: parameter should be treated as 5-bit signed literal
+static vector short __ATTRS_o_ai vec_vspltish(signed char __a) {
+  return (vector short)(__a);
+}
+
+/* vec_splat_s32 */
+
+#define __builtin_altivec_vspltisw vec_splat_s32
+
+// FIXME: parameter should be treated as 5-bit signed literal
+static vector int __ATTRS_o_ai vec_splat_s32(signed char __a) {
+  return (vector int)(__a);
+}
+
+/* vec_vspltisw */
+
+// FIXME: parameter should be treated as 5-bit signed literal
+static vector int __ATTRS_o_ai vec_vspltisw(signed char __a) {
+  return (vector int)(__a);
+}
+
+/* vec_splat_u8 */
+
+// FIXME: parameter should be treated as 5-bit signed literal
+static vector unsigned char __ATTRS_o_ai vec_splat_u8(unsigned char __a) {
+  return (vector unsigned char)(__a);
+}
+
+/* vec_splat_u16 */
+
+// FIXME: parameter should be treated as 5-bit signed literal
+static vector unsigned short __ATTRS_o_ai vec_splat_u16(signed char __a) {
+  return (vector unsigned short)(__a);
+}
+
+/* vec_splat_u32 */
+
+// FIXME: parameter should be treated as 5-bit signed literal
+static vector unsigned int __ATTRS_o_ai vec_splat_u32(signed char __a) {
+  return (vector unsigned int)(__a);
+}
+
+/* vec_sr */
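+/* Element-wise logical shift right: each element of __a is shifted right by
+   the corresponding element of __b, filling vacated bits with zeros. */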
+
+static vector signed char __ATTRS_o_ai vec_sr(vector signed char __a,
+                                              vector unsigned char __b) {
+  vector unsigned char __res = (vector unsigned char)__a >> __b;
+  return (vector signed char)__res;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_sr(vector unsigned char __a,
+                                                vector unsigned char __b) {
+  return __a >> __b;
+}
+
+static vector signed short __ATTRS_o_ai vec_sr(vector signed short __a,
+                                               vector unsigned short __b) {
+  vector unsigned short __res = (vector unsigned short)__a >> __b;
+  return (vector signed short)__res;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_sr(vector unsigned short __a,
+                                                 vector unsigned short __b) {
+  return __a >> __b;
+}
+
+static vector signed int __ATTRS_o_ai vec_sr(vector signed int __a,
+                                             vector unsigned int __b) {
+  vector unsigned int __res = (vector unsigned int)__a >> __b;
+  return (vector signed int)__res;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_sr(vector unsigned int __a,
+                                               vector unsigned int __b) {
+  return __a >> __b;
+}
+
+#ifdef __POWER8_VECTOR__
+static vector signed long long __ATTRS_o_ai
+vec_sr(vector signed long long __a, vector unsigned long long __b) {
+  vector unsigned long long __res = (vector unsigned long long)__a >> __b;
+  return (vector signed long long)__res;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_sr(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a >> __b;
+}
+#endif
+
+/* vec_vsrb */
+
+#define __builtin_altivec_vsrb vec_vsrb
+
+static vector signed char __ATTRS_o_ai vec_vsrb(vector signed char __a,
+                                                vector unsigned char __b) {
+  return __a >> (vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vsrb(vector unsigned char __a,
+                                                  vector unsigned char __b) {
+  return __a >> __b;
+}
+
+/* vec_vsrh */
+
+#define __builtin_altivec_vsrh vec_vsrh
+
+static vector short __ATTRS_o_ai vec_vsrh(vector short __a,
+                                          vector unsigned short __b) {
+  return __a >> (vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vsrh(vector unsigned short __a,
+                                                   vector unsigned short __b) {
+  return __a >> __b;
+}
+
+/* vec_vsrw */
+
+#define __builtin_altivec_vsrw vec_vsrw
+
+static vector int __ATTRS_o_ai vec_vsrw(vector int __a,
+                                        vector unsigned int __b) {
+  return __a >> (vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vsrw(vector unsigned int __a,
+                                                 vector unsigned int __b) {
+  return __a >> __b;
+}
+
+/* vec_sra */
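+/* Element-wise arithmetic shift right: each element of __a is shifted right
+   by the corresponding element of __b, replicating the sign bit. */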
+
+static vector signed char __ATTRS_o_ai vec_sra(vector signed char __a,
+                                               vector unsigned char __b) {
+  return (vector signed char)__builtin_altivec_vsrab((vector char)__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_sra(vector unsigned char __a,
+                                                 vector unsigned char __b) {
+  return (vector unsigned char)__builtin_altivec_vsrab((vector char)__a, __b);
+}
+
+static vector short __ATTRS_o_ai vec_sra(vector short __a,
+                                         vector unsigned short __b) {
+  return __builtin_altivec_vsrah(__a, (vector unsigned short)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_sra(vector unsigned short __a,
+                                                  vector unsigned short __b) {
+  return (vector unsigned short)__builtin_altivec_vsrah((vector short)__a, __b);
+}
+
+static vector int __ATTRS_o_ai vec_sra(vector int __a,
+                                       vector unsigned int __b) {
+  return __builtin_altivec_vsraw(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_sra(vector unsigned int __a,
+                                                vector unsigned int __b) {
+  return (vector unsigned int)__builtin_altivec_vsraw((vector int)__a, __b);
+}
+
+#ifdef __POWER8_VECTOR__
+static vector signed long long __ATTRS_o_ai
+vec_sra(vector signed long long __a, vector unsigned long long __b) {
+  return __a >> __b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_sra(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector unsigned long long)((vector signed long long)__a >> __b);
+}
+#endif
+
+/* vec_vsrab */
+
+static vector signed char __ATTRS_o_ai vec_vsrab(vector signed char __a,
+                                                 vector unsigned char __b) {
+  return (vector signed char)__builtin_altivec_vsrab((vector char)__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vsrab(vector unsigned char __a,
+                                                   vector unsigned char __b) {
+  return (vector unsigned char)__builtin_altivec_vsrab((vector char)__a, __b);
+}
+
+/* vec_vsrah */
+
+static vector short __ATTRS_o_ai vec_vsrah(vector short __a,
+                                           vector unsigned short __b) {
+  return __builtin_altivec_vsrah(__a, (vector unsigned short)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vsrah(vector unsigned short __a,
+                                                    vector unsigned short __b) {
+  return (vector unsigned short)__builtin_altivec_vsrah((vector short)__a, __b);
+}
+
+/* vec_vsraw */
+
+static vector int __ATTRS_o_ai vec_vsraw(vector int __a,
+                                         vector unsigned int __b) {
+  return __builtin_altivec_vsraw(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vsraw(vector unsigned int __a,
+                                                  vector unsigned int __b) {
+  return (vector unsigned int)__builtin_altivec_vsraw((vector int)__a, __b);
+}
+
+/* vec_srl */
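+/* Shift the entire 128-bit contents of __a right by 0-7 bits (vsr); the shift
+   count is taken from the low three bits of __b, which must be the same in
+   every byte of __b for the result to be defined. */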
+
+static vector signed char __ATTRS_o_ai vec_srl(vector signed char __a,
+                                               vector unsigned char __b) {
+  return (vector signed char)__builtin_altivec_vsr((vector int)__a,
+                                                   (vector int)__b);
+}
+
+static vector signed char __ATTRS_o_ai vec_srl(vector signed char __a,
+                                               vector unsigned short __b) {
+  return (vector signed char)__builtin_altivec_vsr((vector int)__a,
+                                                   (vector int)__b);
+}
+
+static vector signed char __ATTRS_o_ai vec_srl(vector signed char __a,
+                                               vector unsigned int __b) {
+  return (vector signed char)__builtin_altivec_vsr((vector int)__a,
+                                                   (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_srl(vector unsigned char __a,
+                                                 vector unsigned char __b) {
+  return (vector unsigned char)__builtin_altivec_vsr((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_srl(vector unsigned char __a,
+                                                 vector unsigned short __b) {
+  return (vector unsigned char)__builtin_altivec_vsr((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_srl(vector unsigned char __a,
+                                                 vector unsigned int __b) {
+  return (vector unsigned char)__builtin_altivec_vsr((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static vector bool char __ATTRS_o_ai vec_srl(vector bool char __a,
+                                             vector unsigned char __b) {
+  return (vector bool char)__builtin_altivec_vsr((vector int)__a,
+                                                 (vector int)__b);
+}
+
+static vector bool char __ATTRS_o_ai vec_srl(vector bool char __a,
+                                             vector unsigned short __b) {
+  return (vector bool char)__builtin_altivec_vsr((vector int)__a,
+                                                 (vector int)__b);
+}
+
+static vector bool char __ATTRS_o_ai vec_srl(vector bool char __a,
+                                             vector unsigned int __b) {
+  return (vector bool char)__builtin_altivec_vsr((vector int)__a,
+                                                 (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai vec_srl(vector short __a,
+                                         vector unsigned char __b) {
+  return (vector short)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai vec_srl(vector short __a,
+                                         vector unsigned short __b) {
+  return (vector short)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai vec_srl(vector short __a,
+                                         vector unsigned int __b) {
+  return (vector short)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_srl(vector unsigned short __a,
+                                                  vector unsigned char __b) {
+  return (vector unsigned short)__builtin_altivec_vsr((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_srl(vector unsigned short __a,
+                                                  vector unsigned short __b) {
+  return (vector unsigned short)__builtin_altivec_vsr((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_srl(vector unsigned short __a,
+                                                  vector unsigned int __b) {
+  return (vector unsigned short)__builtin_altivec_vsr((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static vector bool short __ATTRS_o_ai vec_srl(vector bool short __a,
+                                              vector unsigned char __b) {
+  return (vector bool short)__builtin_altivec_vsr((vector int)__a,
+                                                  (vector int)__b);
+}
+
+static vector bool short __ATTRS_o_ai vec_srl(vector bool short __a,
+                                              vector unsigned short __b) {
+  return (vector bool short)__builtin_altivec_vsr((vector int)__a,
+                                                  (vector int)__b);
+}
+
+static vector bool short __ATTRS_o_ai vec_srl(vector bool short __a,
+                                              vector unsigned int __b) {
+  return (vector bool short)__builtin_altivec_vsr((vector int)__a,
+                                                  (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai vec_srl(vector pixel __a,
+                                         vector unsigned char __b) {
+  return (vector pixel)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai vec_srl(vector pixel __a,
+                                         vector unsigned short __b) {
+  return (vector pixel)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai vec_srl(vector pixel __a,
+                                         vector unsigned int __b) {
+  return (vector pixel)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai vec_srl(vector int __a,
+                                       vector unsigned char __b) {
+  return (vector int)__builtin_altivec_vsr(__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai vec_srl(vector int __a,
+                                       vector unsigned short __b) {
+  return (vector int)__builtin_altivec_vsr(__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai vec_srl(vector int __a,
+                                       vector unsigned int __b) {
+  return (vector int)__builtin_altivec_vsr(__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_srl(vector unsigned int __a,
+                                                vector unsigned char __b) {
+  return (vector unsigned int)__builtin_altivec_vsr((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_srl(vector unsigned int __a,
+                                                vector unsigned short __b) {
+  return (vector unsigned int)__builtin_altivec_vsr((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_srl(vector unsigned int __a,
+                                                vector unsigned int __b) {
+  return (vector unsigned int)__builtin_altivec_vsr((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static vector bool int __ATTRS_o_ai vec_srl(vector bool int __a,
+                                            vector unsigned char __b) {
+  return (vector bool int)__builtin_altivec_vsr((vector int)__a,
+                                                (vector int)__b);
+}
+
+static vector bool int __ATTRS_o_ai vec_srl(vector bool int __a,
+                                            vector unsigned short __b) {
+  return (vector bool int)__builtin_altivec_vsr((vector int)__a,
+                                                (vector int)__b);
+}
+
+static vector bool int __ATTRS_o_ai vec_srl(vector bool int __a,
+                                            vector unsigned int __b) {
+  return (vector bool int)__builtin_altivec_vsr((vector int)__a,
+                                                (vector int)__b);
+}
+
+/* vec_vsr */
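+/* Instruction-named form of vec_srl; identical behavior. */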
+
+static vector signed char __ATTRS_o_ai vec_vsr(vector signed char __a,
+                                               vector unsigned char __b) {
+  return (vector signed char)__builtin_altivec_vsr((vector int)__a,
+                                                   (vector int)__b);
+}
+
+static vector signed char __ATTRS_o_ai vec_vsr(vector signed char __a,
+                                               vector unsigned short __b) {
+  return (vector signed char)__builtin_altivec_vsr((vector int)__a,
+                                                   (vector int)__b);
+}
+
+static vector signed char __ATTRS_o_ai vec_vsr(vector signed char __a,
+                                               vector unsigned int __b) {
+  return (vector signed char)__builtin_altivec_vsr((vector int)__a,
+                                                   (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vsr(vector unsigned char __a,
+                                                 vector unsigned char __b) {
+  return (vector unsigned char)__builtin_altivec_vsr((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vsr(vector unsigned char __a,
+                                                 vector unsigned short __b) {
+  return (vector unsigned char)__builtin_altivec_vsr((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vsr(vector unsigned char __a,
+                                                 vector unsigned int __b) {
+  return (vector unsigned char)__builtin_altivec_vsr((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static vector bool char __ATTRS_o_ai vec_vsr(vector bool char __a,
+                                             vector unsigned char __b) {
+  return (vector bool char)__builtin_altivec_vsr((vector int)__a,
+                                                 (vector int)__b);
+}
+
+static vector bool char __ATTRS_o_ai vec_vsr(vector bool char __a,
+                                             vector unsigned short __b) {
+  return (vector bool char)__builtin_altivec_vsr((vector int)__a,
+                                                 (vector int)__b);
+}
+
+static vector bool char __ATTRS_o_ai vec_vsr(vector bool char __a,
+                                             vector unsigned int __b) {
+  return (vector bool char)__builtin_altivec_vsr((vector int)__a,
+                                                 (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai vec_vsr(vector short __a,
+                                         vector unsigned char __b) {
+  return (vector short)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai vec_vsr(vector short __a,
+                                         vector unsigned short __b) {
+  return (vector short)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai vec_vsr(vector short __a,
+                                         vector unsigned int __b) {
+  return (vector short)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vsr(vector unsigned short __a,
+                                                  vector unsigned char __b) {
+  return (vector unsigned short)__builtin_altivec_vsr((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vsr(vector unsigned short __a,
+                                                  vector unsigned short __b) {
+  return (vector unsigned short)__builtin_altivec_vsr((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vsr(vector unsigned short __a,
+                                                  vector unsigned int __b) {
+  return (vector unsigned short)__builtin_altivec_vsr((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static vector bool short __ATTRS_o_ai vec_vsr(vector bool short __a,
+                                              vector unsigned char __b) {
+  return (vector bool short)__builtin_altivec_vsr((vector int)__a,
+                                                  (vector int)__b);
+}
+
+static vector bool short __ATTRS_o_ai vec_vsr(vector bool short __a,
+                                              vector unsigned short __b) {
+  return (vector bool short)__builtin_altivec_vsr((vector int)__a,
+                                                  (vector int)__b);
+}
+
+static vector bool short __ATTRS_o_ai vec_vsr(vector bool short __a,
+                                              vector unsigned int __b) {
+  return (vector bool short)__builtin_altivec_vsr((vector int)__a,
+                                                  (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai vec_vsr(vector pixel __a,
+                                         vector unsigned char __b) {
+  return (vector pixel)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai vec_vsr(vector pixel __a,
+                                         vector unsigned short __b) {
+  return (vector pixel)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai vec_vsr(vector pixel __a,
+                                         vector unsigned int __b) {
+  return (vector pixel)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai vec_vsr(vector int __a,
+                                       vector unsigned char __b) {
+  return (vector int)__builtin_altivec_vsr(__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai vec_vsr(vector int __a,
+                                       vector unsigned short __b) {
+  return (vector int)__builtin_altivec_vsr(__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai vec_vsr(vector int __a,
+                                       vector unsigned int __b) {
+  return (vector int)__builtin_altivec_vsr(__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vsr(vector unsigned int __a,
+                                                vector unsigned char __b) {
+  return (vector unsigned int)__builtin_altivec_vsr((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vsr(vector unsigned int __a,
+                                                vector unsigned short __b) {
+  return (vector unsigned int)__builtin_altivec_vsr((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vsr(vector unsigned int __a,
+                                                vector unsigned int __b) {
+  return (vector unsigned int)__builtin_altivec_vsr((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static vector bool int __ATTRS_o_ai vec_vsr(vector bool int __a,
+                                            vector unsigned char __b) {
+  return (vector bool int)__builtin_altivec_vsr((vector int)__a,
+                                                (vector int)__b);
+}
+
+static vector bool int __ATTRS_o_ai vec_vsr(vector bool int __a,
+                                            vector unsigned short __b) {
+  return (vector bool int)__builtin_altivec_vsr((vector int)__a,
+                                                (vector int)__b);
+}
+
+static vector bool int __ATTRS_o_ai vec_vsr(vector bool int __a,
+                                            vector unsigned int __b) {
+  return (vector bool int)__builtin_altivec_vsr((vector int)__a,
+                                                (vector int)__b);
+}
+
+/* vec_sro */
+
+static vector signed char __ATTRS_o_ai vec_sro(vector signed char __a,
+                                               vector signed char __b) {
+  return (vector signed char)__builtin_altivec_vsro((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static vector signed char __ATTRS_o_ai vec_sro(vector signed char __a,
+                                               vector unsigned char __b) {
+  return (vector signed char)__builtin_altivec_vsro((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_sro(vector unsigned char __a,
+                                                 vector signed char __b) {
+  return (vector unsigned char)__builtin_altivec_vsro((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_sro(vector unsigned char __a,
+                                                 vector unsigned char __b) {
+  return (vector unsigned char)__builtin_altivec_vsro((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai vec_sro(vector short __a,
+                                         vector signed char __b) {
+  return (vector short)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai vec_sro(vector short __a,
+                                         vector unsigned char __b) {
+  return (vector short)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_sro(vector unsigned short __a,
+                                                  vector signed char __b) {
+  return (vector unsigned short)__builtin_altivec_vsro((vector int)__a,
+                                                       (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_sro(vector unsigned short __a,
+                                                  vector unsigned char __b) {
+  return (vector unsigned short)__builtin_altivec_vsro((vector int)__a,
+                                                       (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai vec_sro(vector pixel __a,
+                                         vector signed char __b) {
+  return (vector pixel)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai vec_sro(vector pixel __a,
+                                         vector unsigned char __b) {
+  return (vector pixel)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai vec_sro(vector int __a, vector signed char __b) {
+  return (vector int)__builtin_altivec_vsro(__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai vec_sro(vector int __a,
+                                       vector unsigned char __b) {
+  return (vector int)__builtin_altivec_vsro(__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_sro(vector unsigned int __a,
+                                                vector signed char __b) {
+  return (vector unsigned int)__builtin_altivec_vsro((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_sro(vector unsigned int __a,
+                                                vector unsigned char __b) {
+  return (vector unsigned int)__builtin_altivec_vsro((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static vector float __ATTRS_o_ai vec_sro(vector float __a,
+                                         vector signed char __b) {
+  return (vector float)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector float __ATTRS_o_ai vec_sro(vector float __a,
+                                         vector unsigned char __b) {
+  return (vector float)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+/* vec_vsro */
+
+static vector signed char __ATTRS_o_ai vec_vsro(vector signed char __a,
+                                                vector signed char __b) {
+  return (vector signed char)__builtin_altivec_vsro((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static vector signed char __ATTRS_o_ai vec_vsro(vector signed char __a,
+                                                vector unsigned char __b) {
+  return (vector signed char)__builtin_altivec_vsro((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vsro(vector unsigned char __a,
+                                                  vector signed char __b) {
+  return (vector unsigned char)__builtin_altivec_vsro((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vsro(vector unsigned char __a,
+                                                  vector unsigned char __b) {
+  return (vector unsigned char)__builtin_altivec_vsro((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai vec_vsro(vector short __a,
+                                          vector signed char __b) {
+  return (vector short)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector short __ATTRS_o_ai vec_vsro(vector short __a,
+                                          vector unsigned char __b) {
+  return (vector short)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vsro(vector unsigned short __a,
+                                                   vector signed char __b) {
+  return (vector unsigned short)__builtin_altivec_vsro((vector int)__a,
+                                                       (vector int)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vsro(vector unsigned short __a,
+                                                   vector unsigned char __b) {
+  return (vector unsigned short)__builtin_altivec_vsro((vector int)__a,
+                                                       (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai vec_vsro(vector pixel __a,
+                                          vector signed char __b) {
+  return (vector pixel)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector pixel __ATTRS_o_ai vec_vsro(vector pixel __a,
+                                          vector unsigned char __b) {
+  return (vector pixel)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai vec_vsro(vector int __a,
+                                        vector signed char __b) {
+  return (vector int)__builtin_altivec_vsro(__a, (vector int)__b);
+}
+
+static vector int __ATTRS_o_ai vec_vsro(vector int __a,
+                                        vector unsigned char __b) {
+  return (vector int)__builtin_altivec_vsro(__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vsro(vector unsigned int __a,
+                                                 vector signed char __b) {
+  return (vector unsigned int)__builtin_altivec_vsro((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vsro(vector unsigned int __a,
+                                                 vector unsigned char __b) {
+  return (vector unsigned int)__builtin_altivec_vsro((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static vector float __ATTRS_o_ai vec_vsro(vector float __a,
+                                          vector signed char __b) {
+  return (vector float)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static vector float __ATTRS_o_ai vec_vsro(vector float __a,
+                                          vector unsigned char __b) {
+  return (vector float)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+/* vec_st */
+
+static void __ATTRS_o_ai vec_st(vector signed char __a, int __b,
+                                vector signed char *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_st(vector signed char __a, int __b,
+                                signed char *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_st(vector unsigned char __a, int __b,
+                                vector unsigned char *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_st(vector unsigned char __a, int __b,
+                                unsigned char *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_st(vector bool char __a, int __b,
+                                signed char *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_st(vector bool char __a, int __b,
+                                unsigned char *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_st(vector bool char __a, int __b,
+                                vector bool char *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_st(vector short __a, int __b, vector short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_st(vector short __a, int __b, short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_st(vector unsigned short __a, int __b,
+                                vector unsigned short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_st(vector unsigned short __a, int __b,
+                                unsigned short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_st(vector bool short __a, int __b, short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_st(vector bool short __a, int __b,
+                                unsigned short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_st(vector bool short __a, int __b,
+                                vector bool short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_st(vector pixel __a, int __b, short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_st(vector pixel __a, int __b,
+                                unsigned short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_st(vector pixel __a, int __b, vector pixel *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_st(vector int __a, int __b, vector int *__c) {
+  __builtin_altivec_stvx(__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_st(vector int __a, int __b, int *__c) {
+  __builtin_altivec_stvx(__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_st(vector unsigned int __a, int __b,
+                                vector unsigned int *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_st(vector unsigned int __a, int __b,
+                                unsigned int *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_st(vector bool int __a, int __b, int *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_st(vector bool int __a, int __b,
+                                unsigned int *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_st(vector bool int __a, int __b,
+                                vector bool int *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_st(vector float __a, int __b, vector float *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_st(vector float __a, int __b, float *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+/* vec_stvx */
+
+static void __ATTRS_o_ai vec_stvx(vector signed char __a, int __b,
+                                  vector signed char *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvx(vector signed char __a, int __b,
+                                  signed char *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvx(vector unsigned char __a, int __b,
+                                  vector unsigned char *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvx(vector unsigned char __a, int __b,
+                                  unsigned char *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvx(vector bool char __a, int __b,
+                                  signed char *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvx(vector bool char __a, int __b,
+                                  unsigned char *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvx(vector bool char __a, int __b,
+                                  vector bool char *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvx(vector short __a, int __b,
+                                  vector short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvx(vector short __a, int __b, short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvx(vector unsigned short __a, int __b,
+                                  vector unsigned short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvx(vector unsigned short __a, int __b,
+                                  unsigned short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvx(vector bool short __a, int __b, short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvx(vector bool short __a, int __b,
+                                  unsigned short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvx(vector bool short __a, int __b,
+                                  vector bool short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvx(vector pixel __a, int __b, short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvx(vector pixel __a, int __b,
+                                  unsigned short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvx(vector pixel __a, int __b,
+                                  vector pixel *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvx(vector int __a, int __b, vector int *__c) {
+  __builtin_altivec_stvx(__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvx(vector int __a, int __b, int *__c) {
+  __builtin_altivec_stvx(__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvx(vector unsigned int __a, int __b,
+                                  vector unsigned int *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvx(vector unsigned int __a, int __b,
+                                  unsigned int *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvx(vector bool int __a, int __b, int *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvx(vector bool int __a, int __b,
+                                  unsigned int *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvx(vector bool int __a, int __b,
+                                  vector bool int *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvx(vector float __a, int __b,
+                                  vector float *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvx(vector float __a, int __b, float *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+/* vec_ste */
+
+static void __ATTRS_o_ai vec_ste(vector signed char __a, int __b,
+                                 signed char *__c) {
+  __builtin_altivec_stvebx((vector char)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_ste(vector unsigned char __a, int __b,
+                                 unsigned char *__c) {
+  __builtin_altivec_stvebx((vector char)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_ste(vector bool char __a, int __b,
+                                 signed char *__c) {
+  __builtin_altivec_stvebx((vector char)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_ste(vector bool char __a, int __b,
+                                 unsigned char *__c) {
+  __builtin_altivec_stvebx((vector char)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_ste(vector short __a, int __b, short *__c) {
+  __builtin_altivec_stvehx(__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_ste(vector unsigned short __a, int __b,
+                                 unsigned short *__c) {
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_ste(vector bool short __a, int __b, short *__c) {
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_ste(vector bool short __a, int __b,
+                                 unsigned short *__c) {
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_ste(vector pixel __a, int __b, short *__c) {
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_ste(vector pixel __a, int __b,
+                                 unsigned short *__c) {
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_ste(vector int __a, int __b, int *__c) {
+  __builtin_altivec_stvewx(__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_ste(vector unsigned int __a, int __b,
+                                 unsigned int *__c) {
+  __builtin_altivec_stvewx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_ste(vector bool int __a, int __b, int *__c) {
+  __builtin_altivec_stvewx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_ste(vector bool int __a, int __b,
+                                 unsigned int *__c) {
+  __builtin_altivec_stvewx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_ste(vector float __a, int __b, float *__c) {
+  __builtin_altivec_stvewx((vector int)__a, __b, __c);
+}
+
+/* vec_stvebx */
+
+static void __ATTRS_o_ai vec_stvebx(vector signed char __a, int __b,
+                                    signed char *__c) {
+  __builtin_altivec_stvebx((vector char)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvebx(vector unsigned char __a, int __b,
+                                    unsigned char *__c) {
+  __builtin_altivec_stvebx((vector char)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvebx(vector bool char __a, int __b,
+                                    signed char *__c) {
+  __builtin_altivec_stvebx((vector char)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvebx(vector bool char __a, int __b,
+                                    unsigned char *__c) {
+  __builtin_altivec_stvebx((vector char)__a, __b, __c);
+}
+
+/* vec_stvehx */
+
+static void __ATTRS_o_ai vec_stvehx(vector short __a, int __b, short *__c) {
+  __builtin_altivec_stvehx(__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvehx(vector unsigned short __a, int __b,
+                                    unsigned short *__c) {
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvehx(vector bool short __a, int __b,
+                                    short *__c) {
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvehx(vector bool short __a, int __b,
+                                    unsigned short *__c) {
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvehx(vector pixel __a, int __b, short *__c) {
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvehx(vector pixel __a, int __b,
+                                    unsigned short *__c) {
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+/* vec_stvewx */
+
+static void __ATTRS_o_ai vec_stvewx(vector int __a, int __b, int *__c) {
+  __builtin_altivec_stvewx(__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvewx(vector unsigned int __a, int __b,
+                                    unsigned int *__c) {
+  __builtin_altivec_stvewx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvewx(vector bool int __a, int __b, int *__c) {
+  __builtin_altivec_stvewx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvewx(vector bool int __a, int __b,
+                                    unsigned int *__c) {
+  __builtin_altivec_stvewx((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvewx(vector float __a, int __b, float *__c) {
+  __builtin_altivec_stvewx((vector int)__a, __b, __c);
+}
+
+/* vec_stl */
+
+static void __ATTRS_o_ai vec_stl(vector signed char __a, int __b,
+                                 vector signed char *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stl(vector signed char __a, int __b,
+                                 signed char *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stl(vector unsigned char __a, int __b,
+                                 vector unsigned char *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stl(vector unsigned char __a, int __b,
+                                 unsigned char *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stl(vector bool char __a, int __b,
+                                 signed char *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stl(vector bool char __a, int __b,
+                                 unsigned char *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stl(vector bool char __a, int __b,
+                                 vector bool char *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stl(vector short __a, int __b, vector short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stl(vector short __a, int __b, short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stl(vector unsigned short __a, int __b,
+                                 vector unsigned short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stl(vector unsigned short __a, int __b,
+                                 unsigned short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stl(vector bool short __a, int __b, short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stl(vector bool short __a, int __b,
+                                 unsigned short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stl(vector bool short __a, int __b,
+                                 vector bool short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stl(vector pixel __a, int __b, short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stl(vector pixel __a, int __b,
+                                 unsigned short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stl(vector pixel __a, int __b, vector pixel *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stl(vector int __a, int __b, vector int *__c) {
+  __builtin_altivec_stvxl(__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stl(vector int __a, int __b, int *__c) {
+  __builtin_altivec_stvxl(__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stl(vector unsigned int __a, int __b,
+                                 vector unsigned int *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stl(vector unsigned int __a, int __b,
+                                 unsigned int *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stl(vector bool int __a, int __b, int *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stl(vector bool int __a, int __b,
+                                 unsigned int *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stl(vector bool int __a, int __b,
+                                 vector bool int *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stl(vector float __a, int __b, vector float *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stl(vector float __a, int __b, float *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+/* vec_stvxl */
+
+static void __ATTRS_o_ai vec_stvxl(vector signed char __a, int __b,
+                                   vector signed char *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvxl(vector signed char __a, int __b,
+                                   signed char *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvxl(vector unsigned char __a, int __b,
+                                   vector unsigned char *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvxl(vector unsigned char __a, int __b,
+                                   unsigned char *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvxl(vector bool char __a, int __b,
+                                   signed char *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvxl(vector bool char __a, int __b,
+                                   unsigned char *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvxl(vector bool char __a, int __b,
+                                   vector bool char *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvxl(vector short __a, int __b,
+                                   vector short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvxl(vector short __a, int __b, short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvxl(vector unsigned short __a, int __b,
+                                   vector unsigned short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvxl(vector unsigned short __a, int __b,
+                                   unsigned short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvxl(vector bool short __a, int __b, short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvxl(vector bool short __a, int __b,
+                                   unsigned short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvxl(vector bool short __a, int __b,
+                                   vector bool short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvxl(vector pixel __a, int __b, short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvxl(vector pixel __a, int __b,
+                                   unsigned short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvxl(vector pixel __a, int __b,
+                                   vector pixel *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvxl(vector int __a, int __b, vector int *__c) {
+  __builtin_altivec_stvxl(__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvxl(vector int __a, int __b, int *__c) {
+  __builtin_altivec_stvxl(__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvxl(vector unsigned int __a, int __b,
+                                   vector unsigned int *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvxl(vector unsigned int __a, int __b,
+                                   unsigned int *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvxl(vector bool int __a, int __b, int *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvxl(vector bool int __a, int __b,
+                                   unsigned int *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvxl(vector bool int __a, int __b,
+                                   vector bool int *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvxl(vector float __a, int __b,
+                                   vector float *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvxl(vector float __a, int __b, float *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+/* vec_sub */
+
+static vector signed char __ATTRS_o_ai vec_sub(vector signed char __a,
+                                               vector signed char __b) {
+  return __a - __b;
+}
+
+static vector signed char __ATTRS_o_ai vec_sub(vector bool char __a,
+                                               vector signed char __b) {
+  return (vector signed char)__a - __b;
+}
+
+static vector signed char __ATTRS_o_ai vec_sub(vector signed char __a,
+                                               vector bool char __b) {
+  return __a - (vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_sub(vector unsigned char __a,
+                                                 vector unsigned char __b) {
+  return __a - __b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_sub(vector bool char __a,
+                                                 vector unsigned char __b) {
+  return (vector unsigned char)__a - __b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_sub(vector unsigned char __a,
+                                                 vector bool char __b) {
+  return __a - (vector unsigned char)__b;
+}
+
+static vector short __ATTRS_o_ai vec_sub(vector short __a, vector short __b) {
+  return __a - __b;
+}
+
+static vector short __ATTRS_o_ai vec_sub(vector bool short __a,
+                                         vector short __b) {
+  return (vector short)__a - __b;
+}
+
+static vector short __ATTRS_o_ai vec_sub(vector short __a,
+                                         vector bool short __b) {
+  return __a - (vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_sub(vector unsigned short __a,
+                                                  vector unsigned short __b) {
+  return __a - __b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_sub(vector bool short __a,
+                                                  vector unsigned short __b) {
+  return (vector unsigned short)__a - __b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_sub(vector unsigned short __a,
+                                                  vector bool short __b) {
+  return __a - (vector unsigned short)__b;
+}
+
+static vector int __ATTRS_o_ai vec_sub(vector int __a, vector int __b) {
+  return __a - __b;
+}
+
+static vector int __ATTRS_o_ai vec_sub(vector bool int __a, vector int __b) {
+  return (vector int)__a - __b;
+}
+
+static vector int __ATTRS_o_ai vec_sub(vector int __a, vector bool int __b) {
+  return __a - (vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_sub(vector unsigned int __a,
+                                                vector unsigned int __b) {
+  return __a - __b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_sub(vector bool int __a,
+                                                vector unsigned int __b) {
+  return (vector unsigned int)__a - __b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_sub(vector unsigned int __a,
+                                                vector bool int __b) {
+  return __a - (vector unsigned int)__b;
+}
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static vector signed __int128 __ATTRS_o_ai vec_sub(vector signed __int128 __a,
+                                                   vector signed __int128 __b) {
+  return __a - __b;
+}
+
+static vector unsigned __int128 __ATTRS_o_ai
+vec_sub(vector unsigned __int128 __a, vector unsigned __int128 __b) {
+  return __a - __b;
+}
+#endif // defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+
+#ifdef __VSX__
+static vector signed long long __ATTRS_o_ai
+vec_sub(vector signed long long __a, vector signed long long __b) {
+  return __a - __b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_sub(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a - __b;
+}
+
+static vector double __ATTRS_o_ai
+vec_sub(vector double __a, vector double __b) {
+  return __a - __b;
+}
+#endif
+
+static vector float __ATTRS_o_ai vec_sub(vector float __a, vector float __b) {
+  return __a - __b;
+}
+
+/* vec_vsububm */
+
+#define __builtin_altivec_vsububm vec_vsububm
+
+static vector signed char __ATTRS_o_ai vec_vsububm(vector signed char __a,
+                                                   vector signed char __b) {
+  return __a - __b;
+}
+
+static vector signed char __ATTRS_o_ai vec_vsububm(vector bool char __a,
+                                                   vector signed char __b) {
+  return (vector signed char)__a - __b;
+}
+
+static vector signed char __ATTRS_o_ai vec_vsububm(vector signed char __a,
+                                                   vector bool char __b) {
+  return __a - (vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vsububm(vector unsigned char __a,
+                                                     vector unsigned char __b) {
+  return __a - __b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vsububm(vector bool char __a,
+                                                     vector unsigned char __b) {
+  return (vector unsigned char)__a - __b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vsububm(vector unsigned char __a,
+                                                     vector bool char __b) {
+  return __a - (vector unsigned char)__b;
+}
+
+/* vec_vsubuhm */
+
+#define __builtin_altivec_vsubuhm vec_vsubuhm
+
+static vector short __ATTRS_o_ai vec_vsubuhm(vector short __a,
+                                             vector short __b) {
+  return __a - __b;
+}
+
+static vector short __ATTRS_o_ai vec_vsubuhm(vector bool short __a,
+                                             vector short __b) {
+  return (vector short)__a - __b;
+}
+
+static vector short __ATTRS_o_ai vec_vsubuhm(vector short __a,
+                                             vector bool short __b) {
+  return __a - (vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vsubuhm(vector unsigned short __a, vector unsigned short __b) {
+  return __a - __b;
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vsubuhm(vector bool short __a, vector unsigned short __b) {
+  return (vector unsigned short)__a - __b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vsubuhm(vector unsigned short __a,
+                                                      vector bool short __b) {
+  return __a - (vector unsigned short)__b;
+}
+
+/* vec_vsubuwm */
+
+#define __builtin_altivec_vsubuwm vec_vsubuwm
+
+static vector int __ATTRS_o_ai vec_vsubuwm(vector int __a, vector int __b) {
+  return __a - __b;
+}
+
+static vector int __ATTRS_o_ai vec_vsubuwm(vector bool int __a,
+                                           vector int __b) {
+  return (vector int)__a - __b;
+}
+
+static vector int __ATTRS_o_ai vec_vsubuwm(vector int __a,
+                                           vector bool int __b) {
+  return __a - (vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vsubuwm(vector unsigned int __a,
+                                                    vector unsigned int __b) {
+  return __a - __b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vsubuwm(vector bool int __a,
+                                                    vector unsigned int __b) {
+  return (vector unsigned int)__a - __b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vsubuwm(vector unsigned int __a,
+                                                    vector bool int __b) {
+  return __a - (vector unsigned int)__b;
+}
+
+/* vec_vsubfp */
+
+#define __builtin_altivec_vsubfp vec_vsubfp
+
+static vector float __attribute__((__always_inline__))
+vec_vsubfp(vector float __a, vector float __b) {
+  return __a - __b;
+}
+
+/* vec_subc */
+
+static vector unsigned int __ATTRS_o_ai vec_subc(vector unsigned int __a,
+                                                 vector unsigned int __b) {
+  return __builtin_altivec_vsubcuw(__a, __b);
+}
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static vector unsigned __int128 __ATTRS_o_ai
+vec_subc(vector unsigned __int128 __a, vector unsigned __int128 __b) {
+  return __builtin_altivec_vsubcuq(__a, __b);
+}
+
+static vector signed __int128 __ATTRS_o_ai
+vec_subc(vector signed __int128 __a, vector signed __int128 __b) {
+  return __builtin_altivec_vsubcuq(__a, __b);
+}
+#endif // defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+
+/* vec_vsubcuw */
+
+static vector unsigned int __attribute__((__always_inline__))
+vec_vsubcuw(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_altivec_vsubcuw(__a, __b);
+}
+
+/* vec_subs */
+
+static vector signed char __ATTRS_o_ai vec_subs(vector signed char __a,
+                                                vector signed char __b) {
+  return __builtin_altivec_vsubsbs(__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai vec_subs(vector bool char __a,
+                                                vector signed char __b) {
+  return __builtin_altivec_vsubsbs((vector signed char)__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai vec_subs(vector signed char __a,
+                                                vector bool char __b) {
+  return __builtin_altivec_vsubsbs(__a, (vector signed char)__b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_subs(vector unsigned char __a,
+                                                  vector unsigned char __b) {
+  return __builtin_altivec_vsububs(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_subs(vector bool char __a,
+                                                  vector unsigned char __b) {
+  return __builtin_altivec_vsububs((vector unsigned char)__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_subs(vector unsigned char __a,
+                                                  vector bool char __b) {
+  return __builtin_altivec_vsububs(__a, (vector unsigned char)__b);
+}
+
+static vector short __ATTRS_o_ai vec_subs(vector short __a, vector short __b) {
+  return __builtin_altivec_vsubshs(__a, __b);
+}
+
+static vector short __ATTRS_o_ai vec_subs(vector bool short __a,
+                                          vector short __b) {
+  return __builtin_altivec_vsubshs((vector short)__a, __b);
+}
+
+static vector short __ATTRS_o_ai vec_subs(vector short __a,
+                                          vector bool short __b) {
+  return __builtin_altivec_vsubshs(__a, (vector short)__b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_subs(vector unsigned short __a,
+                                                   vector unsigned short __b) {
+  return __builtin_altivec_vsubuhs(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_subs(vector bool short __a,
+                                                   vector unsigned short __b) {
+  return __builtin_altivec_vsubuhs((vector unsigned short)__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_subs(vector unsigned short __a,
+                                                   vector bool short __b) {
+  return __builtin_altivec_vsubuhs(__a, (vector unsigned short)__b);
+}
+
+static vector int __ATTRS_o_ai vec_subs(vector int __a, vector int __b) {
+  return __builtin_altivec_vsubsws(__a, __b);
+}
+
+static vector int __ATTRS_o_ai vec_subs(vector bool int __a, vector int __b) {
+  return __builtin_altivec_vsubsws((vector int)__a, __b);
+}
+
+static vector int __ATTRS_o_ai vec_subs(vector int __a, vector bool int __b) {
+  return __builtin_altivec_vsubsws(__a, (vector int)__b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_subs(vector unsigned int __a,
+                                                 vector unsigned int __b) {
+  return __builtin_altivec_vsubuws(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_subs(vector bool int __a,
+                                                 vector unsigned int __b) {
+  return __builtin_altivec_vsubuws((vector unsigned int)__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_subs(vector unsigned int __a,
+                                                 vector bool int __b) {
+  return __builtin_altivec_vsubuws(__a, (vector unsigned int)__b);
+}
+
+/* vec_vsubsbs */
+
+static vector signed char __ATTRS_o_ai vec_vsubsbs(vector signed char __a,
+                                                   vector signed char __b) {
+  return __builtin_altivec_vsubsbs(__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai vec_vsubsbs(vector bool char __a,
+                                                   vector signed char __b) {
+  return __builtin_altivec_vsubsbs((vector signed char)__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai vec_vsubsbs(vector signed char __a,
+                                                   vector bool char __b) {
+  return __builtin_altivec_vsubsbs(__a, (vector signed char)__b);
+}
+
+/* vec_vsububs */
+
+static vector unsigned char __ATTRS_o_ai vec_vsububs(vector unsigned char __a,
+                                                     vector unsigned char __b) {
+  return __builtin_altivec_vsububs(__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vsububs(vector bool char __a,
+                                                     vector unsigned char __b) {
+  return __builtin_altivec_vsububs((vector unsigned char)__a, __b);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vsububs(vector unsigned char __a,
+                                                     vector bool char __b) {
+  return __builtin_altivec_vsububs(__a, (vector unsigned char)__b);
+}
+
+/* vec_vsubshs */
+
+static vector short __ATTRS_o_ai vec_vsubshs(vector short __a,
+                                             vector short __b) {
+  return __builtin_altivec_vsubshs(__a, __b);
+}
+
+static vector short __ATTRS_o_ai vec_vsubshs(vector bool short __a,
+                                             vector short __b) {
+  return __builtin_altivec_vsubshs((vector short)__a, __b);
+}
+
+static vector short __ATTRS_o_ai vec_vsubshs(vector short __a,
+                                             vector bool short __b) {
+  return __builtin_altivec_vsubshs(__a, (vector short)__b);
+}
+
+/* vec_vsubuhs */
+
+static vector unsigned short __ATTRS_o_ai
+vec_vsubuhs(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_altivec_vsubuhs(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_vsubuhs(vector bool short __a, vector unsigned short __b) {
+  return __builtin_altivec_vsubuhs((vector unsigned short)__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vsubuhs(vector unsigned short __a,
+                                                      vector bool short __b) {
+  return __builtin_altivec_vsubuhs(__a, (vector unsigned short)__b);
+}
+
+/* vec_vsubsws */
+
+static vector int __ATTRS_o_ai vec_vsubsws(vector int __a, vector int __b) {
+  return __builtin_altivec_vsubsws(__a, __b);
+}
+
+static vector int __ATTRS_o_ai vec_vsubsws(vector bool int __a,
+                                           vector int __b) {
+  return __builtin_altivec_vsubsws((vector int)__a, __b);
+}
+
+static vector int __ATTRS_o_ai vec_vsubsws(vector int __a,
+                                           vector bool int __b) {
+  return __builtin_altivec_vsubsws(__a, (vector int)__b);
+}
+
+/* vec_vsubuws */
+
+static vector unsigned int __ATTRS_o_ai vec_vsubuws(vector unsigned int __a,
+                                                    vector unsigned int __b) {
+  return __builtin_altivec_vsubuws(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vsubuws(vector bool int __a,
+                                                    vector unsigned int __b) {
+  return __builtin_altivec_vsubuws((vector unsigned int)__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vsubuws(vector unsigned int __a,
+                                                    vector bool int __b) {
+  return __builtin_altivec_vsubuws(__a, (vector unsigned int)__b);
+}
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+/* vec_vsubuqm */
+
+static vector signed __int128 __ATTRS_o_ai
+vec_vsubuqm(vector signed __int128 __a, vector signed __int128 __b) {
+  return __a - __b;
+}
+
+static vector unsigned __int128 __ATTRS_o_ai
+vec_vsubuqm(vector unsigned __int128 __a, vector unsigned __int128 __b) {
+  return __a - __b;
+}
+
+/* vec_vsubeuqm */
+
+static vector signed __int128 __ATTRS_o_ai
+vec_vsubeuqm(vector signed __int128 __a, vector signed __int128 __b,
+             vector signed __int128 __c) {
+  return __builtin_altivec_vsubeuqm(__a, __b, __c);
+}
+
+static vector unsigned __int128 __ATTRS_o_ai
+vec_vsubeuqm(vector unsigned __int128 __a, vector unsigned __int128 __b,
+             vector unsigned __int128 __c) {
+  return __builtin_altivec_vsubeuqm(__a, __b, __c);
+}
+
+/* vec_vsubcuq */
+
+static vector signed __int128 __ATTRS_o_ai
+vec_vsubcuq(vector signed __int128 __a, vector signed __int128 __b) {
+  return __builtin_altivec_vsubcuq(__a, __b);
+}
+
+static vector unsigned __int128 __ATTRS_o_ai
+vec_vsubcuq(vector unsigned __int128 __a, vector unsigned __int128 __b) {
+  return __builtin_altivec_vsubcuq(__a, __b);
+}
+
+/* vec_vsubecuq */
+
+static vector signed __int128 __ATTRS_o_ai
+vec_vsubecuq(vector signed __int128 __a, vector signed __int128 __b,
+             vector signed __int128 __c) {
+  return __builtin_altivec_vsubecuq(__a, __b, __c);
+}
+
+static vector unsigned __int128 __ATTRS_o_ai
+vec_vsubecuq(vector unsigned __int128 __a, vector unsigned __int128 __b,
+             vector unsigned __int128 __c) {
+  return __builtin_altivec_vsubecuq(__a, __b, __c);
+}
+#endif // defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+
+/* vec_sum4s */
+
+static vector int __ATTRS_o_ai vec_sum4s(vector signed char __a,
+                                         vector int __b) {
+  return __builtin_altivec_vsum4sbs(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai vec_sum4s(vector unsigned char __a,
+                                                  vector unsigned int __b) {
+  return __builtin_altivec_vsum4ubs(__a, __b);
+}
+
+static vector int __ATTRS_o_ai vec_sum4s(vector signed short __a,
+                                         vector int __b) {
+  return __builtin_altivec_vsum4shs(__a, __b);
+}
+
+/* vec_vsum4sbs */
+
+static vector int __attribute__((__always_inline__))
+vec_vsum4sbs(vector signed char __a, vector int __b) {
+  return __builtin_altivec_vsum4sbs(__a, __b);
+}
+
+/* vec_vsum4ubs */
+
+static vector unsigned int __attribute__((__always_inline__))
+vec_vsum4ubs(vector unsigned char __a, vector unsigned int __b) {
+  return __builtin_altivec_vsum4ubs(__a, __b);
+}
+
+/* vec_vsum4shs */
+
+static vector int __attribute__((__always_inline__))
+vec_vsum4shs(vector signed short __a, vector int __b) {
+  return __builtin_altivec_vsum4shs(__a, __b);
+}
+
+/* vec_sum2s */
+
+/* The vsum2sws instruction has a big-endian bias, so that the second
+   input vector and the result always reference big-endian elements
+   1 and 3 (little-endian elements 0 and 2).  For ease of porting, the
+   programmer wants elements 1 and 3 in both cases, so for little
+   endian we must perform some permutes.  */
+
+static vector signed int __attribute__((__always_inline__))
+vec_sum2s(vector int __a, vector int __b) {
+#ifdef __LITTLE_ENDIAN__
+  vector int __c = (vector signed int)vec_perm(
+      __b, __b, (vector unsigned char)(4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15,
+                                       8, 9, 10, 11));
+  __c = __builtin_altivec_vsum2sws(__a, __c);
+  return (vector signed int)vec_perm(
+      __c, __c, (vector unsigned char)(4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15,
+                                       8, 9, 10, 11));
+#else
+  return __builtin_altivec_vsum2sws(__a, __b);
+#endif
+}
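+
+/* Usage sketch (illustrative only, not part of the upstream header):
+   with a zero second operand, vec_sum2s leaves the pairwise sums of
+   the first operand in result elements 1 and 3 on either endianness,
+   e.g.
+     vector signed int v = (vector signed int)(1, 2, 3, 4);
+     vector signed int z = (vector signed int)(0, 0, 0, 0);
+     vector signed int s = vec_sum2s(v, z);  // s[1] == 3, s[3] == 7
+   while elements 0 and 2 of the result are zero.  */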
+
+/* vec_vsum2sws */
+
+static vector signed int __attribute__((__always_inline__))
+vec_vsum2sws(vector int __a, vector int __b) {
+#ifdef __LITTLE_ENDIAN__
+  vector int __c = (vector signed int)vec_perm(
+      __b, __b, (vector unsigned char)(4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15,
+                                       8, 9, 10, 11));
+  __c = __builtin_altivec_vsum2sws(__a, __c);
+  return (vector signed int)vec_perm(
+      __c, __c, (vector unsigned char)(4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15,
+                                       8, 9, 10, 11));
+#else
+  return __builtin_altivec_vsum2sws(__a, __b);
+#endif
+}
+
+/* vec_sums */
+
+/* The vsumsws instruction has a big-endian bias, so that the second
+   input vector and the result always reference big-endian element 3
+   (little-endian element 0).  For ease of porting, the programmer
+   wants element 3 in both cases, so for little endian we must perform
+   some permutes.  */
+
+static vector signed int __attribute__((__always_inline__))
+vec_sums(vector signed int __a, vector signed int __b) {
+#ifdef __LITTLE_ENDIAN__
+  __b = (vector signed int)vec_splat(__b, 3);
+  __b = __builtin_altivec_vsumsws(__a, __b);
+  return (vector signed int)(0, 0, 0, __b[0]);
+#else
+  return __builtin_altivec_vsumsws(__a, __b);
+#endif
+}
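+
+/* Usage sketch (illustrative only, not part of the upstream header):
+   a full horizontal add, with the total landing in element 3 on
+   either endianness:
+     vector signed int v = (vector signed int)(1, 2, 3, 4);
+     vector signed int z = (vector signed int)(0, 0, 0, 0);
+     vector signed int s = vec_sums(v, z);  // s[3] == 10
+   The remaining elements of the result are zero.  */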
+
+/* vec_vsumsws */
+
+static vector signed int __attribute__((__always_inline__))
+vec_vsumsws(vector signed int __a, vector signed int __b) {
+#ifdef __LITTLE_ENDIAN__
+  __b = (vector signed int)vec_splat(__b, 3);
+  __b = __builtin_altivec_vsumsws(__a, __b);
+  return (vector signed int)(0, 0, 0, __b[0]);
+#else
+  return __builtin_altivec_vsumsws(__a, __b);
+#endif
+}
+
+/* vec_trunc */
+
+static vector float __ATTRS_o_ai
+vec_trunc(vector float __a) {
+#ifdef __VSX__
+  return __builtin_vsx_xvrspiz(__a);
+#else
+  return __builtin_altivec_vrfiz(__a);
+#endif
+}
+
+#ifdef __VSX__
+static vector double __ATTRS_o_ai vec_trunc(vector double __a) {
+  return __builtin_vsx_xvrdpiz(__a);
+}
+#endif
+
+/* vec_vrfiz */
+
+static vector float __attribute__((__always_inline__))
+vec_vrfiz(vector float __a) {
+  return __builtin_altivec_vrfiz(__a);
+}
+
+/* vec_unpackh */
+
+/* The vector unpack instructions all have a big-endian bias, so for
+   little endian we must reverse the meanings of "high" and "low."  */
+
+static vector short __ATTRS_o_ai vec_unpackh(vector signed char __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupklsb((vector char)__a);
+#else
+  return __builtin_altivec_vupkhsb((vector char)__a);
+#endif
+}
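+
+/* Usage sketch (illustrative only, not part of the upstream header):
+   vec_unpackh sign-extends the first eight char elements (the "high"
+   half in the programmer's view) into shorts on either endianness:
+     vector signed char c =
+         (vector signed char)(-1, 2, -3, 4, 5, 6, 7, 8,
+                              9, 10, 11, 12, 13, 14, 15, 16);
+     vector short h = vec_unpackh(c);  // (-1, 2, -3, 4, 5, 6, 7, 8)
+*/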
+
+static vector bool short __ATTRS_o_ai vec_unpackh(vector bool char __a) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool short)__builtin_altivec_vupklsb((vector char)__a);
+#else
+  return (vector bool short)__builtin_altivec_vupkhsb((vector char)__a);
+#endif
+}
+
+static vector int __ATTRS_o_ai vec_unpackh(vector short __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupklsh(__a);
+#else
+  return __builtin_altivec_vupkhsh(__a);
+#endif
+}
+
+static vector bool int __ATTRS_o_ai vec_unpackh(vector bool short __a) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool int)__builtin_altivec_vupklsh((vector short)__a);
+#else
+  return (vector bool int)__builtin_altivec_vupkhsh((vector short)__a);
+#endif
+}
+
+static vector unsigned int __ATTRS_o_ai vec_unpackh(vector pixel __a) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector unsigned int)__builtin_altivec_vupklpx((vector short)__a);
+#else
+  return (vector unsigned int)__builtin_altivec_vupkhpx((vector short)__a);
+#endif
+}
+
+#ifdef __POWER8_VECTOR__
+static vector long long __ATTRS_o_ai vec_unpackh(vector int __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupklsw(__a);
+#else
+  return __builtin_altivec_vupkhsw(__a);
+#endif
+}
+
+static vector bool long long __ATTRS_o_ai vec_unpackh(vector bool int __a) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool long long)__builtin_altivec_vupklsw((vector int)__a);
+#else
+  return (vector bool long long)__builtin_altivec_vupkhsw((vector int)__a);
+#endif
+}
+#endif
+
+/* vec_vupkhsb */
+
+static vector short __ATTRS_o_ai vec_vupkhsb(vector signed char __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupklsb((vector char)__a);
+#else
+  return __builtin_altivec_vupkhsb((vector char)__a);
+#endif
+}
+
+static vector bool short __ATTRS_o_ai vec_vupkhsb(vector bool char __a) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool short)__builtin_altivec_vupklsb((vector char)__a);
+#else
+  return (vector bool short)__builtin_altivec_vupkhsb((vector char)__a);
+#endif
+}
+
+/* vec_vupkhsh */
+
+static vector int __ATTRS_o_ai vec_vupkhsh(vector short __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupklsh(__a);
+#else
+  return __builtin_altivec_vupkhsh(__a);
+#endif
+}
+
+static vector bool int __ATTRS_o_ai vec_vupkhsh(vector bool short __a) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool int)__builtin_altivec_vupklsh((vector short)__a);
+#else
+  return (vector bool int)__builtin_altivec_vupkhsh((vector short)__a);
+#endif
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vupkhsh(vector pixel __a) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector unsigned int)__builtin_altivec_vupklpx((vector short)__a);
+#else
+  return (vector unsigned int)__builtin_altivec_vupkhpx((vector short)__a);
+#endif
+}
+
+/* vec_vupkhsw */
+
+#ifdef __POWER8_VECTOR__
+static vector long long __ATTRS_o_ai vec_vupkhsw(vector int __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupklsw(__a);
+#else
+  return __builtin_altivec_vupkhsw(__a);
+#endif
+}
+
+static vector bool long long __ATTRS_o_ai vec_vupkhsw(vector bool int __a) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool long long)__builtin_altivec_vupklsw((vector int)__a);
+#else
+  return (vector bool long long)__builtin_altivec_vupkhsw((vector int)__a);
+#endif
+}
+#endif
+
+/* vec_unpackl */
+
+static vector short __ATTRS_o_ai vec_unpackl(vector signed char __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupkhsb((vector char)__a);
+#else
+  return __builtin_altivec_vupklsb((vector char)__a);
+#endif
+}
+
+static vector bool short __ATTRS_o_ai vec_unpackl(vector bool char __a) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool short)__builtin_altivec_vupkhsb((vector char)__a);
+#else
+  return (vector bool short)__builtin_altivec_vupklsb((vector char)__a);
+#endif
+}
+
+static vector int __ATTRS_o_ai vec_unpackl(vector short __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupkhsh(__a);
+#else
+  return __builtin_altivec_vupklsh(__a);
+#endif
+}
+
+static vector bool int __ATTRS_o_ai vec_unpackl(vector bool short __a) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool int)__builtin_altivec_vupkhsh((vector short)__a);
+#else
+  return (vector bool int)__builtin_altivec_vupklsh((vector short)__a);
+#endif
+}
+
+static vector unsigned int __ATTRS_o_ai vec_unpackl(vector pixel __a) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector unsigned int)__builtin_altivec_vupkhpx((vector short)__a);
+#else
+  return (vector unsigned int)__builtin_altivec_vupklpx((vector short)__a);
+#endif
+}
+
+#ifdef __POWER8_VECTOR__
+static vector long long __ATTRS_o_ai vec_unpackl(vector int __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupkhsw(__a);
+#else
+  return __builtin_altivec_vupklsw(__a);
+#endif
+}
+
+static vector bool long long __ATTRS_o_ai vec_unpackl(vector bool int __a) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool long long)__builtin_altivec_vupkhsw((vector int)__a);
+#else
+  return (vector bool long long)__builtin_altivec_vupklsw((vector int)__a);
+#endif
+}
+#endif
+
+/* vec_vupklsb */
+
+static vector short __ATTRS_o_ai vec_vupklsb(vector signed char __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupkhsb((vector char)__a);
+#else
+  return __builtin_altivec_vupklsb((vector char)__a);
+#endif
+}
+
+static vector bool short __ATTRS_o_ai vec_vupklsb(vector bool char __a) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool short)__builtin_altivec_vupkhsb((vector char)__a);
+#else
+  return (vector bool short)__builtin_altivec_vupklsb((vector char)__a);
+#endif
+}
+
+/* vec_vupklsh */
+
+static vector int __ATTRS_o_ai vec_vupklsh(vector short __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupkhsh(__a);
+#else
+  return __builtin_altivec_vupklsh(__a);
+#endif
+}
+
+static vector bool int __ATTRS_o_ai vec_vupklsh(vector bool short __a) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool int)__builtin_altivec_vupkhsh((vector short)__a);
+#else
+  return (vector bool int)__builtin_altivec_vupklsh((vector short)__a);
+#endif
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vupklsh(vector pixel __a) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector unsigned int)__builtin_altivec_vupkhpx((vector short)__a);
+#else
+  return (vector unsigned int)__builtin_altivec_vupklpx((vector short)__a);
+#endif
+}
+
+/* vec_vupklsw */
+
+#ifdef __POWER8_VECTOR__
+static vector long long __ATTRS_o_ai vec_vupklsw(vector int __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupkhsw(__a);
+#else
+  return __builtin_altivec_vupklsw(__a);
+#endif
+}
+
+static vector bool long long __ATTRS_o_ai vec_vupklsw(vector bool int __a) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool long long)__builtin_altivec_vupkhsw((vector int)__a);
+#else
+  return (vector bool long long)__builtin_altivec_vupklsw((vector int)__a);
+#endif
+}
+#endif
+
+/* vec_vsx_ld */
+
+#ifdef __VSX__
+
+static vector signed int __ATTRS_o_ai vec_vsx_ld(int __a,
+                                                 const vector signed int *__b) {
+  return (vector signed int)__builtin_vsx_lxvw4x(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_vsx_ld(int __a, const vector unsigned int *__b) {
+  return (vector unsigned int)__builtin_vsx_lxvw4x(__a, __b);
+}
+
+static vector float __ATTRS_o_ai vec_vsx_ld(int __a, const vector float *__b) {
+  return (vector float)__builtin_vsx_lxvw4x(__a, __b);
+}
+
+static vector signed long long __ATTRS_o_ai
+vec_vsx_ld(int __a, const vector signed long long *__b) {
+  return (vector signed long long)__builtin_vsx_lxvd2x(__a, __b);
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_vsx_ld(int __a, const vector unsigned long long *__b) {
+  return (vector unsigned long long)__builtin_vsx_lxvd2x(__a, __b);
+}
+
+static vector double __ATTRS_o_ai vec_vsx_ld(int __a,
+                                             const vector double *__b) {
+  return (vector double)__builtin_vsx_lxvd2x(__a, __b);
+}
+
+#endif
+
+/* vec_vsx_st */
+
+#ifdef __VSX__
+
+static void __ATTRS_o_ai vec_vsx_st(vector signed int __a, int __b,
+                                    vector signed int *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_vsx_st(vector unsigned int __a, int __b,
+                                    vector unsigned int *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_vsx_st(vector float __a, int __b,
+                                    vector float *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_vsx_st(vector signed long long __a, int __b,
+                                    vector signed long long *__c) {
+  __builtin_vsx_stxvd2x((vector double)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_vsx_st(vector unsigned long long __a, int __b,
+                                    vector unsigned long long *__c) {
+  __builtin_vsx_stxvd2x((vector double)__a, __b, __c);
+}
+
+static void __ATTRS_o_ai vec_vsx_st(vector double __a, int __b,
+                                    vector double *__c) {
+  __builtin_vsx_stxvd2x((vector double)__a, __b, __c);
+}
+
+#endif
+
+/* vec_xor */
+
+#define __builtin_altivec_vxor vec_xor
+
+static vector signed char __ATTRS_o_ai vec_xor(vector signed char __a,
+                                               vector signed char __b) {
+  return __a ^ __b;
+}
+
+static vector signed char __ATTRS_o_ai vec_xor(vector bool char __a,
+                                               vector signed char __b) {
+  return (vector signed char)__a ^ __b;
+}
+
+static vector signed char __ATTRS_o_ai vec_xor(vector signed char __a,
+                                               vector bool char __b) {
+  return __a ^ (vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_xor(vector unsigned char __a,
+                                                 vector unsigned char __b) {
+  return __a ^ __b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_xor(vector bool char __a,
+                                                 vector unsigned char __b) {
+  return (vector unsigned char)__a ^ __b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_xor(vector unsigned char __a,
+                                                 vector bool char __b) {
+  return __a ^ (vector unsigned char)__b;
+}
+
+static vector bool char __ATTRS_o_ai vec_xor(vector bool char __a,
+                                             vector bool char __b) {
+  return __a ^ __b;
+}
+
+static vector short __ATTRS_o_ai vec_xor(vector short __a, vector short __b) {
+  return __a ^ __b;
+}
+
+static vector short __ATTRS_o_ai vec_xor(vector bool short __a,
+                                         vector short __b) {
+  return (vector short)__a ^ __b;
+}
+
+static vector short __ATTRS_o_ai vec_xor(vector short __a,
+                                         vector bool short __b) {
+  return __a ^ (vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_xor(vector unsigned short __a,
+                                                  vector unsigned short __b) {
+  return __a ^ __b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_xor(vector bool short __a,
+                                                  vector unsigned short __b) {
+  return (vector unsigned short)__a ^ __b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_xor(vector unsigned short __a,
+                                                  vector bool short __b) {
+  return __a ^ (vector unsigned short)__b;
+}
+
+static vector bool short __ATTRS_o_ai vec_xor(vector bool short __a,
+                                              vector bool short __b) {
+  return __a ^ __b;
+}
+
+static vector int __ATTRS_o_ai vec_xor(vector int __a, vector int __b) {
+  return __a ^ __b;
+}
+
+static vector int __ATTRS_o_ai vec_xor(vector bool int __a, vector int __b) {
+  return (vector int)__a ^ __b;
+}
+
+static vector int __ATTRS_o_ai vec_xor(vector int __a, vector bool int __b) {
+  return __a ^ (vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_xor(vector unsigned int __a,
+                                                vector unsigned int __b) {
+  return __a ^ __b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_xor(vector bool int __a,
+                                                vector unsigned int __b) {
+  return (vector unsigned int)__a ^ __b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_xor(vector unsigned int __a,
+                                                vector bool int __b) {
+  return __a ^ (vector unsigned int)__b;
+}
+
+static vector bool int __ATTRS_o_ai vec_xor(vector bool int __a,
+                                            vector bool int __b) {
+  return __a ^ __b;
+}
+
+static vector float __ATTRS_o_ai vec_xor(vector float __a, vector float __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a ^ (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai vec_xor(vector bool int __a,
+                                         vector float __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a ^ (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai vec_xor(vector float __a,
+                                         vector bool int __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a ^ (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+#ifdef __VSX__
+static vector signed long long __ATTRS_o_ai
+vec_xor(vector signed long long __a, vector signed long long __b) {
+  return __a ^ __b;
+}
+
+static vector signed long long __ATTRS_o_ai
+vec_xor(vector bool long long __a, vector signed long long __b) {
+  return (vector signed long long)__a ^ __b;
+}
+
+static vector signed long long __ATTRS_o_ai vec_xor(vector signed long long __a,
+                                                    vector bool long long __b) {
+  return __a ^ (vector signed long long)__b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_xor(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a ^ __b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_xor(vector bool long long __a, vector unsigned long long __b) {
+  return (vector unsigned long long)__a ^ __b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_xor(vector unsigned long long __a, vector bool long long __b) {
+  return __a ^ (vector unsigned long long)__b;
+}
+
+static vector bool long long __ATTRS_o_ai vec_xor(vector bool long long __a,
+                                                  vector bool long long __b) {
+  return __a ^ __b;
+}
+
+static vector double __ATTRS_o_ai
+vec_xor(vector double __a, vector double __b) {
+  return (vector double)((vector unsigned long long)__a ^
+                         (vector unsigned long long)__b);
+}
+
+static vector double __ATTRS_o_ai
+vec_xor(vector double __a, vector bool long long __b) {
+  return (vector double)((vector unsigned long long)__a ^
+                         (vector unsigned long long)__b);
+}
+
+static vector double __ATTRS_o_ai
+vec_xor(vector bool long long __a, vector double __b) {
+  return (vector double)((vector unsigned long long)__a ^
+                         (vector unsigned long long)__b);
+}
+#endif
+
+/* vec_vxor */
+
+static vector signed char __ATTRS_o_ai vec_vxor(vector signed char __a,
+                                                vector signed char __b) {
+  return __a ^ __b;
+}
+
+static vector signed char __ATTRS_o_ai vec_vxor(vector bool char __a,
+                                                vector signed char __b) {
+  return (vector signed char)__a ^ __b;
+}
+
+static vector signed char __ATTRS_o_ai vec_vxor(vector signed char __a,
+                                                vector bool char __b) {
+  return __a ^ (vector signed char)__b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vxor(vector unsigned char __a,
+                                                  vector unsigned char __b) {
+  return __a ^ __b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vxor(vector bool char __a,
+                                                  vector unsigned char __b) {
+  return (vector unsigned char)__a ^ __b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_vxor(vector unsigned char __a,
+                                                  vector bool char __b) {
+  return __a ^ (vector unsigned char)__b;
+}
+
+static vector bool char __ATTRS_o_ai vec_vxor(vector bool char __a,
+                                              vector bool char __b) {
+  return __a ^ __b;
+}
+
+static vector short __ATTRS_o_ai vec_vxor(vector short __a, vector short __b) {
+  return __a ^ __b;
+}
+
+static vector short __ATTRS_o_ai vec_vxor(vector bool short __a,
+                                          vector short __b) {
+  return (vector short)__a ^ __b;
+}
+
+static vector short __ATTRS_o_ai vec_vxor(vector short __a,
+                                          vector bool short __b) {
+  return __a ^ (vector short)__b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vxor(vector unsigned short __a,
+                                                   vector unsigned short __b) {
+  return __a ^ __b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vxor(vector bool short __a,
+                                                   vector unsigned short __b) {
+  return (vector unsigned short)__a ^ __b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_vxor(vector unsigned short __a,
+                                                   vector bool short __b) {
+  return __a ^ (vector unsigned short)__b;
+}
+
+static vector bool short __ATTRS_o_ai vec_vxor(vector bool short __a,
+                                               vector bool short __b) {
+  return __a ^ __b;
+}
+
+static vector int __ATTRS_o_ai vec_vxor(vector int __a, vector int __b) {
+  return __a ^ __b;
+}
+
+static vector int __ATTRS_o_ai vec_vxor(vector bool int __a, vector int __b) {
+  return (vector int)__a ^ __b;
+}
+
+static vector int __ATTRS_o_ai vec_vxor(vector int __a, vector bool int __b) {
+  return __a ^ (vector int)__b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vxor(vector unsigned int __a,
+                                                 vector unsigned int __b) {
+  return __a ^ __b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vxor(vector bool int __a,
+                                                 vector unsigned int __b) {
+  return (vector unsigned int)__a ^ __b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_vxor(vector unsigned int __a,
+                                                 vector bool int __b) {
+  return __a ^ (vector unsigned int)__b;
+}
+
+static vector bool int __ATTRS_o_ai vec_vxor(vector bool int __a,
+                                             vector bool int __b) {
+  return __a ^ __b;
+}
+
+static vector float __ATTRS_o_ai vec_vxor(vector float __a, vector float __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a ^ (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai vec_vxor(vector bool int __a,
+                                          vector float __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a ^ (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static vector float __ATTRS_o_ai vec_vxor(vector float __a,
+                                          vector bool int __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a ^ (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+#ifdef __VSX__
+static vector signed long long __ATTRS_o_ai
+vec_vxor(vector signed long long __a, vector signed long long __b) {
+  return __a ^ __b;
+}
+
+static vector signed long long __ATTRS_o_ai
+vec_vxor(vector bool long long __a, vector signed long long __b) {
+  return (vector signed long long)__a ^ __b;
+}
+
+static vector signed long long __ATTRS_o_ai
+vec_vxor(vector signed long long __a, vector bool long long __b) {
+  return __a ^ (vector signed long long)__b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_vxor(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a ^ __b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_vxor(vector bool long long __a, vector unsigned long long __b) {
+  return (vector unsigned long long)__a ^ __b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_vxor(vector unsigned long long __a, vector bool long long __b) {
+  return __a ^ (vector unsigned long long)__b;
+}
+
+static vector bool long long __ATTRS_o_ai vec_vxor(vector bool long long __a,
+                                                   vector bool long long __b) {
+  return __a ^ __b;
+}
+#endif
+
+/* ------------------------ extensions for CBEA ----------------------------- */
+
+/* vec_extract */
+
+static signed char __ATTRS_o_ai vec_extract(vector signed char __a, int __b) {
+  return __a[__b];
+}
+
+static unsigned char __ATTRS_o_ai vec_extract(vector unsigned char __a,
+                                              int __b) {
+  return __a[__b];
+}
+
+static unsigned char __ATTRS_o_ai vec_extract(vector bool char __a,
+                                              int __b) {
+  return __a[__b];
+}
+
+static signed short __ATTRS_o_ai vec_extract(vector signed short __a, int __b) {
+  return __a[__b];
+}
+
+static unsigned short __ATTRS_o_ai vec_extract(vector unsigned short __a,
+                                               int __b) {
+  return __a[__b];
+}
+
+static unsigned short __ATTRS_o_ai vec_extract(vector bool short __a,
+                                               int __b) {
+  return __a[__b];
+}
+
+static signed int __ATTRS_o_ai vec_extract(vector signed int __a, int __b) {
+  return __a[__b];
+}
+
+static unsigned int __ATTRS_o_ai vec_extract(vector unsigned int __a, int __b) {
+  return __a[__b];
+}
+
+static unsigned int __ATTRS_o_ai vec_extract(vector bool int __a, int __b) {
+  return __a[__b];
+}
+
+#ifdef __VSX__
+static signed long long __ATTRS_o_ai vec_extract(vector signed long long __a,
+                                                 int __b) {
+  return __a[__b];
+}
+
+static unsigned long long __ATTRS_o_ai
+vec_extract(vector unsigned long long __a, int __b) {
+  return __a[__b];
+}
+
+static unsigned long long __ATTRS_o_ai vec_extract(vector bool long long __a,
+                                                   int __b) {
+  return __a[__b];
+}
+
+static double __ATTRS_o_ai vec_extract(vector double __a, int __b) {
+  return __a[__b];
+}
+#endif
+
+static float __ATTRS_o_ai vec_extract(vector float __a, int __b) {
+  return __a[__b];
+}
+
+/* vec_insert */
+
+static vector signed char __ATTRS_o_ai vec_insert(signed char __a,
+                                                  vector signed char __b,
+                                                  int __c) {
+  __b[__c] = __a;
+  return __b;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_insert(unsigned char __a,
+                                                    vector unsigned char __b,
+                                                    int __c) {
+  __b[__c] = __a;
+  return __b;
+}
+
+static vector bool char __ATTRS_o_ai vec_insert(unsigned char __a,
+                                                vector bool char __b,
+                                                int __c) {
+  __b[__c] = __a;
+  return __b;
+}
+
+static vector signed short __ATTRS_o_ai vec_insert(signed short __a,
+                                                   vector signed short __b,
+                                                   int __c) {
+  __b[__c] = __a;
+  return __b;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_insert(unsigned short __a,
+                                                     vector unsigned short __b,
+                                                     int __c) {
+  __b[__c] = __a;
+  return __b;
+}
+
+static vector bool short __ATTRS_o_ai vec_insert(unsigned short __a,
+                                                 vector bool short __b,
+                                                 int __c) {
+  __b[__c] = __a;
+  return __b;
+}
+
+static vector signed int __ATTRS_o_ai vec_insert(signed int __a,
+                                                 vector signed int __b,
+                                                 int __c) {
+  __b[__c] = __a;
+  return __b;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_insert(unsigned int __a,
+                                                   vector unsigned int __b,
+                                                   int __c) {
+  __b[__c] = __a;
+  return __b;
+}
+
+static vector bool int __ATTRS_o_ai vec_insert(unsigned int __a,
+                                               vector bool int __b,
+                                               int __c) {
+  __b[__c] = __a;
+  return __b;
+}
+
+#ifdef __VSX__
+static vector signed long long __ATTRS_o_ai
+vec_insert(signed long long __a, vector signed long long __b, int __c) {
+  __b[__c] = __a;
+  return __b;
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_insert(unsigned long long __a, vector unsigned long long __b, int __c) {
+  __b[__c] = __a;
+  return __b;
+}
+
+static vector bool long long __ATTRS_o_ai
+vec_insert(unsigned long long __a, vector bool long long __b, int __c) {
+  __b[__c] = __a;
+  return __b;
+}
+
+static vector double __ATTRS_o_ai vec_insert(double __a, vector double __b,
+                                             int __c) {
+  __b[__c] = __a;
+  return __b;
+}
+#endif
+
+static vector float __ATTRS_o_ai vec_insert(float __a, vector float __b,
+                                            int __c) {
+  __b[__c] = __a;
+  return __b;
+}
+
+/* vec_lvlx */
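+
+/* Note (illustrative, describing the intent of the idiom below): vec_lvlx
+   loads the 16-byte aligned block containing the effective address and
+   permutes the bytes at and beyond that address into the leftmost element
+   positions of the result, zero-filling the remainder; vec_lvrx, further
+   below, is the mirror image for the bytes preceding the address.  */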
+
+static vector signed char __ATTRS_o_ai vec_lvlx(int __a,
+                                                const signed char *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector signed char)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static vector signed char __ATTRS_o_ai vec_lvlx(int __a,
+                                                const vector signed char *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector signed char)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector unsigned char __ATTRS_o_ai vec_lvlx(int __a,
+                                                  const unsigned char *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector unsigned char)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvlx(int __a, const vector unsigned char *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector unsigned char)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector bool char __ATTRS_o_ai vec_lvlx(int __a,
+                                              const vector bool char *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector bool char)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector short __ATTRS_o_ai vec_lvlx(int __a, const short *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector short)(0), vec_lvsl(__a, __b));
+}
+
+static vector short __ATTRS_o_ai vec_lvlx(int __a, const vector short *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector short)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector unsigned short __ATTRS_o_ai vec_lvlx(int __a,
+                                                   const unsigned short *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector unsigned short)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_lvlx(int __a, const vector unsigned short *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector unsigned short)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector bool short __ATTRS_o_ai vec_lvlx(int __a,
+                                               const vector bool short *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector bool short)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector pixel __ATTRS_o_ai vec_lvlx(int __a, const vector pixel *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector pixel)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector int __ATTRS_o_ai vec_lvlx(int __a, const int *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector int)(0), vec_lvsl(__a, __b));
+}
+
+static vector int __ATTRS_o_ai vec_lvlx(int __a, const vector int *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector int)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector unsigned int __ATTRS_o_ai vec_lvlx(int __a,
+                                                 const unsigned int *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector unsigned int)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_lvlx(int __a, const vector unsigned int *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector unsigned int)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector bool int __ATTRS_o_ai vec_lvlx(int __a,
+                                             const vector bool int *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector bool int)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector float __ATTRS_o_ai vec_lvlx(int __a, const float *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector float)(0), vec_lvsl(__a, __b));
+}
+
+static vector float __ATTRS_o_ai vec_lvlx(int __a, const vector float *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector float)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+/* vec_lvlxl */
+
+static vector signed char __ATTRS_o_ai vec_lvlxl(int __a,
+                                                 const signed char *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector signed char)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static vector signed char __ATTRS_o_ai
+vec_lvlxl(int __a, const vector signed char *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector signed char)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector unsigned char __ATTRS_o_ai vec_lvlxl(int __a,
+                                                   const unsigned char *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector unsigned char)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvlxl(int __a, const vector unsigned char *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector unsigned char)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector bool char __ATTRS_o_ai vec_lvlxl(int __a,
+                                               const vector bool char *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector bool char)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector short __ATTRS_o_ai vec_lvlxl(int __a, const short *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector short)(0), vec_lvsl(__a, __b));
+}
+
+static vector short __ATTRS_o_ai vec_lvlxl(int __a, const vector short *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector short)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector unsigned short __ATTRS_o_ai vec_lvlxl(int __a,
+                                                    const unsigned short *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector unsigned short)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_lvlxl(int __a, const vector unsigned short *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector unsigned short)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector bool short __ATTRS_o_ai vec_lvlxl(int __a,
+                                                const vector bool short *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector bool short)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector pixel __ATTRS_o_ai vec_lvlxl(int __a, const vector pixel *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector pixel)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector int __ATTRS_o_ai vec_lvlxl(int __a, const int *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector int)(0), vec_lvsl(__a, __b));
+}
+
+static vector int __ATTRS_o_ai vec_lvlxl(int __a, const vector int *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector int)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector unsigned int __ATTRS_o_ai vec_lvlxl(int __a,
+                                                  const unsigned int *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector unsigned int)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_lvlxl(int __a, const vector unsigned int *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector unsigned int)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector bool int __ATTRS_o_ai vec_lvlxl(int __a,
+                                              const vector bool int *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector bool int)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector float __ATTRS_o_ai vec_lvlxl(int __a, const float *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector float)(0), vec_lvsl(__a, __b));
+}
+
+static vector float __ATTRS_o_ai vec_lvlxl(int __a, const vector float *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector float)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+/* vec_lvrx */
+
+static vector signed char __ATTRS_o_ai vec_lvrx(int __a,
+                                                const signed char *__b) {
+  return vec_perm((vector signed char)(0), vec_ld(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static vector signed char __ATTRS_o_ai vec_lvrx(int __a,
+                                                const vector signed char *__b) {
+  return vec_perm((vector signed char)(0), vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector unsigned char __ATTRS_o_ai vec_lvrx(int __a,
+                                                  const unsigned char *__b) {
+  return vec_perm((vector unsigned char)(0), vec_ld(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvrx(int __a, const vector unsigned char *__b) {
+  return vec_perm((vector unsigned char)(0), vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector bool char __ATTRS_o_ai vec_lvrx(int __a,
+                                              const vector bool char *__b) {
+  return vec_perm((vector bool char)(0), vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector short __ATTRS_o_ai vec_lvrx(int __a, const short *__b) {
+  return vec_perm((vector short)(0), vec_ld(__a, __b), vec_lvsl(__a, __b));
+}
+
+static vector short __ATTRS_o_ai vec_lvrx(int __a, const vector short *__b) {
+  return vec_perm((vector short)(0), vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector unsigned short __ATTRS_o_ai vec_lvrx(int __a,
+                                                   const unsigned short *__b) {
+  return vec_perm((vector unsigned short)(0), vec_ld(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_lvrx(int __a, const vector unsigned short *__b) {
+  return vec_perm((vector unsigned short)(0), vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector bool short __ATTRS_o_ai vec_lvrx(int __a,
+                                               const vector bool short *__b) {
+  return vec_perm((vector bool short)(0), vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector pixel __ATTRS_o_ai vec_lvrx(int __a, const vector pixel *__b) {
+  return vec_perm((vector pixel)(0), vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector int __ATTRS_o_ai vec_lvrx(int __a, const int *__b) {
+  return vec_perm((vector int)(0), vec_ld(__a, __b), vec_lvsl(__a, __b));
+}
+
+static vector int __ATTRS_o_ai vec_lvrx(int __a, const vector int *__b) {
+  return vec_perm((vector int)(0), vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector unsigned int __ATTRS_o_ai vec_lvrx(int __a,
+                                                 const unsigned int *__b) {
+  return vec_perm((vector unsigned int)(0), vec_ld(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_lvrx(int __a, const vector unsigned int *__b) {
+  return vec_perm((vector unsigned int)(0), vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector bool int __ATTRS_o_ai vec_lvrx(int __a,
+                                             const vector bool int *__b) {
+  return vec_perm((vector bool int)(0), vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector float __ATTRS_o_ai vec_lvrx(int __a, const float *__b) {
+  return vec_perm((vector float)(0), vec_ld(__a, __b), vec_lvsl(__a, __b));
+}
+
+static vector float __ATTRS_o_ai vec_lvrx(int __a, const vector float *__b) {
+  return vec_perm((vector float)(0), vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+/* vec_lvrxl */
+
+static vector signed char __ATTRS_o_ai vec_lvrxl(int __a,
+                                                 const signed char *__b) {
+  return vec_perm((vector signed char)(0), vec_ldl(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static vector signed char __ATTRS_o_ai
+vec_lvrxl(int __a, const vector signed char *__b) {
+  return vec_perm((vector signed char)(0), vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector unsigned char __ATTRS_o_ai vec_lvrxl(int __a,
+                                                   const unsigned char *__b) {
+  return vec_perm((vector unsigned char)(0), vec_ldl(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static vector unsigned char __ATTRS_o_ai
+vec_lvrxl(int __a, const vector unsigned char *__b) {
+  return vec_perm((vector unsigned char)(0), vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector bool char __ATTRS_o_ai vec_lvrxl(int __a,
+                                               const vector bool char *__b) {
+  return vec_perm((vector bool char)(0), vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector short __ATTRS_o_ai vec_lvrxl(int __a, const short *__b) {
+  return vec_perm((vector short)(0), vec_ldl(__a, __b), vec_lvsl(__a, __b));
+}
+
+static vector short __ATTRS_o_ai vec_lvrxl(int __a, const vector short *__b) {
+  return vec_perm((vector short)(0), vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector unsigned short __ATTRS_o_ai vec_lvrxl(int __a,
+                                                    const unsigned short *__b) {
+  return vec_perm((vector unsigned short)(0), vec_ldl(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static vector unsigned short __ATTRS_o_ai
+vec_lvrxl(int __a, const vector unsigned short *__b) {
+  return vec_perm((vector unsigned short)(0), vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector bool short __ATTRS_o_ai vec_lvrxl(int __a,
+                                                const vector bool short *__b) {
+  return vec_perm((vector bool short)(0), vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector pixel __ATTRS_o_ai vec_lvrxl(int __a, const vector pixel *__b) {
+  return vec_perm((vector pixel)(0), vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector int __ATTRS_o_ai vec_lvrxl(int __a, const int *__b) {
+  return vec_perm((vector int)(0), vec_ldl(__a, __b), vec_lvsl(__a, __b));
+}
+
+static vector int __ATTRS_o_ai vec_lvrxl(int __a, const vector int *__b) {
+  return vec_perm((vector int)(0), vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector unsigned int __ATTRS_o_ai vec_lvrxl(int __a,
+                                                  const unsigned int *__b) {
+  return vec_perm((vector unsigned int)(0), vec_ldl(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static vector unsigned int __ATTRS_o_ai
+vec_lvrxl(int __a, const vector unsigned int *__b) {
+  return vec_perm((vector unsigned int)(0), vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector bool int __ATTRS_o_ai vec_lvrxl(int __a,
+                                              const vector bool int *__b) {
+  return vec_perm((vector bool int)(0), vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static vector float __ATTRS_o_ai vec_lvrxl(int __a, const float *__b) {
+  return vec_perm((vector float)(0), vec_ldl(__a, __b), vec_lvsl(__a, __b));
+}
+
+static vector float __ATTRS_o_ai vec_lvrxl(int __a, const vector float *__b) {
+  return vec_perm((vector float)(0), vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+/* vec_stvlx */
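+
+/* Note (illustrative): vec_stvlx and the related stores below are
+   implemented as a read-merge-write of the aligned 16-byte block containing
+   the effective address, so bytes outside the stored portion are preserved
+   but the update is not atomic.  */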
+
+static void __ATTRS_o_ai vec_stvlx(vector signed char __a, int __b,
+                                   signed char *__c) {
+  return vec_st(vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, __c)), __b,
+                __c);
+}
+
+static void __ATTRS_o_ai vec_stvlx(vector signed char __a, int __b,
+                                   vector signed char *__c) {
+  return vec_st(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvlx(vector unsigned char __a, int __b,
+                                   unsigned char *__c) {
+  return vec_st(vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, __c)), __b,
+                __c);
+}
+
+static void __ATTRS_o_ai vec_stvlx(vector unsigned char __a, int __b,
+                                   vector unsigned char *__c) {
+  return vec_st(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvlx(vector bool char __a, int __b,
+                                   vector bool char *__c) {
+  return vec_st(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvlx(vector short __a, int __b, short *__c) {
+  return vec_st(vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, __c)), __b,
+                __c);
+}
+
+static void __ATTRS_o_ai vec_stvlx(vector short __a, int __b,
+                                   vector short *__c) {
+  return vec_st(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvlx(vector unsigned short __a, int __b,
+                                   unsigned short *__c) {
+  return vec_st(vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, __c)), __b,
+                __c);
+}
+
+static void __ATTRS_o_ai vec_stvlx(vector unsigned short __a, int __b,
+                                   vector unsigned short *__c) {
+  return vec_st(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvlx(vector bool short __a, int __b,
+                                   vector bool short *__c) {
+  return vec_st(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvlx(vector pixel __a, int __b,
+                                   vector pixel *__c) {
+  return vec_st(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvlx(vector int __a, int __b, int *__c) {
+  return vec_st(vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, __c)), __b,
+                __c);
+}
+
+static void __ATTRS_o_ai vec_stvlx(vector int __a, int __b, vector int *__c) {
+  return vec_st(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvlx(vector unsigned int __a, int __b,
+                                   unsigned int *__c) {
+  return vec_st(vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, __c)), __b,
+                __c);
+}
+
+static void __ATTRS_o_ai vec_stvlx(vector unsigned int __a, int __b,
+                                   vector unsigned int *__c) {
+  return vec_st(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvlx(vector bool int __a, int __b,
+                                   vector bool int *__c) {
+  return vec_st(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvlx(vector float __a, int __b,
+                                   vector float *__c) {
+  return vec_st(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+/* vec_stvlxl */
+
+static void __ATTRS_o_ai vec_stvlxl(vector signed char __a, int __b,
+                                    signed char *__c) {
+  return vec_stl(vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, __c)), __b,
+                 __c);
+}
+
+static void __ATTRS_o_ai vec_stvlxl(vector signed char __a, int __b,
+                                    vector signed char *__c) {
+  return vec_stl(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvlxl(vector unsigned char __a, int __b,
+                                    unsigned char *__c) {
+  return vec_stl(vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, __c)), __b,
+                 __c);
+}
+
+static void __ATTRS_o_ai vec_stvlxl(vector unsigned char __a, int __b,
+                                    vector unsigned char *__c) {
+  return vec_stl(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvlxl(vector bool char __a, int __b,
+                                    vector bool char *__c) {
+  return vec_stl(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvlxl(vector short __a, int __b, short *__c) {
+  return vec_stl(vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, __c)), __b,
+                 __c);
+}
+
+static void __ATTRS_o_ai vec_stvlxl(vector short __a, int __b,
+                                    vector short *__c) {
+  return vec_stl(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvlxl(vector unsigned short __a, int __b,
+                                    unsigned short *__c) {
+  return vec_stl(vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, __c)), __b,
+                 __c);
+}
+
+static void __ATTRS_o_ai vec_stvlxl(vector unsigned short __a, int __b,
+                                    vector unsigned short *__c) {
+  return vec_stl(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvlxl(vector bool short __a, int __b,
+                                    vector bool short *__c) {
+  return vec_stl(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvlxl(vector pixel __a, int __b,
+                                    vector pixel *__c) {
+  return vec_stl(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvlxl(vector int __a, int __b, int *__c) {
+  return vec_stl(vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, __c)), __b,
+                 __c);
+}
+
+static void __ATTRS_o_ai vec_stvlxl(vector int __a, int __b, vector int *__c) {
+  return vec_stl(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvlxl(vector unsigned int __a, int __b,
+                                    unsigned int *__c) {
+  return vec_stl(vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, __c)), __b,
+                 __c);
+}
+
+static void __ATTRS_o_ai vec_stvlxl(vector unsigned int __a, int __b,
+                                    vector unsigned int *__c) {
+  return vec_stl(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvlxl(vector bool int __a, int __b,
+                                    vector bool int *__c) {
+  return vec_stl(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvlxl(vector float __a, int __b,
+                                    vector float *__c) {
+  return vec_stl(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+/* vec_stvrx */
+
+static void __ATTRS_o_ai vec_stvrx(vector signed char __a, int __b,
+                                   signed char *__c) {
+  return vec_st(vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, __c)), __b,
+                __c);
+}
+
+static void __ATTRS_o_ai vec_stvrx(vector signed char __a, int __b,
+                                   vector signed char *__c) {
+  return vec_st(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvrx(vector unsigned char __a, int __b,
+                                   unsigned char *__c) {
+  return vec_st(vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, __c)), __b,
+                __c);
+}
+
+static void __ATTRS_o_ai vec_stvrx(vector unsigned char __a, int __b,
+                                   vector unsigned char *__c) {
+  return vec_st(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvrx(vector bool char __a, int __b,
+                                   vector bool char *__c) {
+  return vec_st(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvrx(vector short __a, int __b, short *__c) {
+  return vec_st(vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, __c)), __b,
+                __c);
+}
+
+static void __ATTRS_o_ai vec_stvrx(vector short __a, int __b,
+                                   vector short *__c) {
+  return vec_st(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvrx(vector unsigned short __a, int __b,
+                                   unsigned short *__c) {
+  return vec_st(vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, __c)), __b,
+                __c);
+}
+
+static void __ATTRS_o_ai vec_stvrx(vector unsigned short __a, int __b,
+                                   vector unsigned short *__c) {
+  return vec_st(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvrx(vector bool short __a, int __b,
+                                   vector bool short *__c) {
+  return vec_st(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvrx(vector pixel __a, int __b,
+                                   vector pixel *__c) {
+  return vec_st(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvrx(vector int __a, int __b, int *__c) {
+  return vec_st(vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, __c)), __b,
+                __c);
+}
+
+static void __ATTRS_o_ai vec_stvrx(vector int __a, int __b, vector int *__c) {
+  return vec_st(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvrx(vector unsigned int __a, int __b,
+                                   unsigned int *__c) {
+  return vec_st(vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, __c)), __b,
+                __c);
+}
+
+static void __ATTRS_o_ai vec_stvrx(vector unsigned int __a, int __b,
+                                   vector unsigned int *__c) {
+  return vec_st(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvrx(vector bool int __a, int __b,
+                                   vector bool int *__c) {
+  return vec_st(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvrx(vector float __a, int __b,
+                                   vector float *__c) {
+  return vec_st(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+/* vec_stvrxl */
+
+static void __ATTRS_o_ai vec_stvrxl(vector signed char __a, int __b,
+                                    signed char *__c) {
+  return vec_stl(vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, __c)), __b,
+                 __c);
+}
+
+static void __ATTRS_o_ai vec_stvrxl(vector signed char __a, int __b,
+                                    vector signed char *__c) {
+  return vec_stl(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvrxl(vector unsigned char __a, int __b,
+                                    unsigned char *__c) {
+  return vec_stl(vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, __c)), __b,
+                 __c);
+}
+
+static void __ATTRS_o_ai vec_stvrxl(vector unsigned char __a, int __b,
+                                    vector unsigned char *__c) {
+  return vec_stl(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvrxl(vector bool char __a, int __b,
+                                    vector bool char *__c) {
+  return vec_stl(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvrxl(vector short __a, int __b, short *__c) {
+  return vec_stl(vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, __c)), __b,
+                 __c);
+}
+
+static void __ATTRS_o_ai vec_stvrxl(vector short __a, int __b,
+                                    vector short *__c) {
+  return vec_stl(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvrxl(vector unsigned short __a, int __b,
+                                    unsigned short *__c) {
+  return vec_stl(vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, __c)), __b,
+                 __c);
+}
+
+static void __ATTRS_o_ai vec_stvrxl(vector unsigned short __a, int __b,
+                                    vector unsigned short *__c) {
+  return vec_stl(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvrxl(vector bool short __a, int __b,
+                                    vector bool short *__c) {
+  return vec_stl(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvrxl(vector pixel __a, int __b,
+                                    vector pixel *__c) {
+  return vec_stl(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvrxl(vector int __a, int __b, int *__c) {
+  return vec_stl(vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, __c)), __b,
+                 __c);
+}
+
+static void __ATTRS_o_ai vec_stvrxl(vector int __a, int __b, vector int *__c) {
+  return vec_stl(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvrxl(vector unsigned int __a, int __b,
+                                    unsigned int *__c) {
+  return vec_stl(vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, __c)), __b,
+                 __c);
+}
+
+static void __ATTRS_o_ai vec_stvrxl(vector unsigned int __a, int __b,
+                                    vector unsigned int *__c) {
+  return vec_stl(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvrxl(vector bool int __a, int __b,
+                                    vector bool int *__c) {
+  return vec_stl(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static void __ATTRS_o_ai vec_stvrxl(vector float __a, int __b,
+                                    vector float *__c) {
+  return vec_stl(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+/* vec_promote */
+
+static vector signed char __ATTRS_o_ai vec_promote(signed char __a, int __b) {
+  vector signed char __res = (vector signed char)(0);
+  __res[__b] = __a;
+  return __res;
+}
+
+static vector unsigned char __ATTRS_o_ai vec_promote(unsigned char __a,
+                                                     int __b) {
+  vector unsigned char __res = (vector unsigned char)(0);
+  __res[__b] = __a;
+  return __res;
+}
+
+static vector short __ATTRS_o_ai vec_promote(short __a, int __b) {
+  vector short __res = (vector short)(0);
+  __res[__b] = __a;
+  return __res;
+}
+
+static vector unsigned short __ATTRS_o_ai vec_promote(unsigned short __a,
+                                                      int __b) {
+  vector unsigned short __res = (vector unsigned short)(0);
+  __res[__b] = __a;
+  return __res;
+}
+
+static vector int __ATTRS_o_ai vec_promote(int __a, int __b) {
+  vector int __res = (vector int)(0);
+  __res[__b] = __a;
+  return __res;
+}
+
+static vector unsigned int __ATTRS_o_ai vec_promote(unsigned int __a, int __b) {
+  vector unsigned int __res = (vector unsigned int)(0);
+  __res[__b] = __a;
+  return __res;
+}
+
+static vector float __ATTRS_o_ai vec_promote(float __a, int __b) {
+  vector float __res = (vector float)(0);
+  __res[__b] = __a;
+  return __res;
+}
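+
+/* Illustrative usage of vec_promote (a minimal sketch; the literal values
+ * below are only hypothetical examples): the scalar is written into element
+ * __b of an otherwise zeroed vector, e.g.
+ *   vector int v = vec_promote(42, 3);   // v = {0, 0, 0, 42}
+ */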
+
+/* vec_splats */
+
+static vector signed char __ATTRS_o_ai vec_splats(signed char __a) {
+  return (vector signed char)(__a);
+}
+
+static vector unsigned char __ATTRS_o_ai vec_splats(unsigned char __a) {
+  return (vector unsigned char)(__a);
+}
+
+static vector short __ATTRS_o_ai vec_splats(short __a) {
+  return (vector short)(__a);
+}
+
+static vector unsigned short __ATTRS_o_ai vec_splats(unsigned short __a) {
+  return (vector unsigned short)(__a);
+}
+
+static vector int __ATTRS_o_ai vec_splats(int __a) { return (vector int)(__a); }
+
+static vector unsigned int __ATTRS_o_ai vec_splats(unsigned int __a) {
+  return (vector unsigned int)(__a);
+}
+
+#ifdef __VSX__
+static vector signed long long __ATTRS_o_ai vec_splats(signed long long __a) {
+  return (vector signed long long)(__a);
+}
+
+static vector unsigned long long __ATTRS_o_ai
+vec_splats(unsigned long long __a) {
+  return (vector unsigned long long)(__a);
+}
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static vector signed __int128 __ATTRS_o_ai vec_splats(signed __int128 __a) {
+  return (vector signed __int128)(__a);
+}
+
+static vector unsigned __int128 __ATTRS_o_ai
+vec_splats(unsigned __int128 __a) {
+  return (vector unsigned __int128)(__a);
+}
+
+#endif
+
+static vector double __ATTRS_o_ai vec_splats(double __a) {
+  return (vector double)(__a);
+}
+#endif
+
+static vector float __ATTRS_o_ai vec_splats(float __a) {
+  return (vector float)(__a);
+}
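+
+/* Illustrative usage of vec_splats (a minimal sketch; the literal value below
+ * is only a hypothetical example): the scalar is replicated into every
+ * element of the result, e.g.
+ *   vector short s = vec_splats((short)7);   // s = {7, 7, 7, 7, 7, 7, 7, 7}
+ * Note the cast selects the short overload; a plain 7 would pick the int one.
+ */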
+
+/* ----------------------------- predicates --------------------------------- */
+
+/* vec_all_eq */
+
+static int __ATTRS_o_ai vec_all_eq(vector signed char __a,
+                                   vector signed char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_LT, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static int __ATTRS_o_ai vec_all_eq(vector signed char __a,
+                                   vector bool char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_LT, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static int __ATTRS_o_ai vec_all_eq(vector unsigned char __a,
+                                   vector unsigned char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_LT, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static int __ATTRS_o_ai vec_all_eq(vector unsigned char __a,
+                                   vector bool char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_LT, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static int __ATTRS_o_ai vec_all_eq(vector bool char __a,
+                                   vector signed char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_LT, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static int __ATTRS_o_ai vec_all_eq(vector bool char __a,
+                                   vector unsigned char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_LT, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static int __ATTRS_o_ai vec_all_eq(vector bool char __a, vector bool char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_LT, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static int __ATTRS_o_ai vec_all_eq(vector short __a, vector short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_LT, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_all_eq(vector short __a, vector bool short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_LT, __a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai vec_all_eq(vector unsigned short __a,
+                                   vector unsigned short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_LT, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai vec_all_eq(vector unsigned short __a,
+                                   vector bool short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_LT, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai vec_all_eq(vector bool short __a, vector short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_LT, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai vec_all_eq(vector bool short __a,
+                                   vector unsigned short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_LT, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai vec_all_eq(vector bool short __a,
+                                   vector bool short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_LT, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai vec_all_eq(vector pixel __a, vector pixel __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_LT, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai vec_all_eq(vector int __a, vector int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_LT, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_all_eq(vector int __a, vector bool int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_LT, __a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai vec_all_eq(vector unsigned int __a,
+                                   vector unsigned int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_LT, (vector int)__a,
+                                      (vector int)__b);
+}
+
+static int __ATTRS_o_ai vec_all_eq(vector unsigned int __a,
+                                   vector bool int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_LT, (vector int)__a,
+                                      (vector int)__b);
+}
+
+static int __ATTRS_o_ai vec_all_eq(vector bool int __a, vector int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_LT, (vector int)__a,
+                                      (vector int)__b);
+}
+
+static int __ATTRS_o_ai vec_all_eq(vector bool int __a,
+                                   vector unsigned int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_LT, (vector int)__a,
+                                      (vector int)__b);
+}
+
+static int __ATTRS_o_ai vec_all_eq(vector bool int __a, vector bool int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_LT, (vector int)__a,
+                                      (vector int)__b);
+}
+
+#ifdef __POWER8_VECTOR__
+static int __ATTRS_o_ai vec_all_eq(vector signed long long __a,
+                                   vector signed long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_LT, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_all_eq(vector long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_LT, __a, (vector long long)__b);
+}
+
+static int __ATTRS_o_ai vec_all_eq(vector unsigned long long __a,
+                                   vector unsigned long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_LT, (vector long long)__a,
+                                      (vector long long)__b);
+}
+
+static int __ATTRS_o_ai vec_all_eq(vector unsigned long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_LT, (vector long long)__a,
+                                      (vector long long)__b);
+}
+
+static int __ATTRS_o_ai vec_all_eq(vector bool long long __a,
+                                   vector long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_LT, (vector long long)__a,
+                                      (vector long long)__b);
+}
+
+static int __ATTRS_o_ai vec_all_eq(vector bool long long __a,
+                                   vector unsigned long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_LT, (vector long long)__a,
+                                      (vector long long)__b);
+}
+
+static int __ATTRS_o_ai vec_all_eq(vector bool long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_LT, (vector long long)__a,
+                                      (vector long long)__b);
+}
+#endif
+
+static int __ATTRS_o_ai vec_all_eq(vector float __a, vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpeqsp_p(__CR6_LT, __a, __b);
+#else
+  return __builtin_altivec_vcmpeqfp_p(__CR6_LT, __a, __b);
+#endif
+}
+
+#ifdef __VSX__
+static int __ATTRS_o_ai vec_all_eq(vector double __a, vector double __b) {
+  return __builtin_vsx_xvcmpeqdp_p(__CR6_LT, __a, __b);
+}
+#endif
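+
+/* Illustrative usage of the vec_all_* predicates (a minimal sketch; the
+ * values below are only hypothetical examples): each predicate returns a
+ * nonzero int only when the relation holds for every element, while the
+ * corresponding vec_any_* form is nonzero when it holds for at least one:
+ *   vector int x = vec_splats(1);
+ *   int r = vec_all_eq(x, x);   // nonzero: every element of x equals itself
+ */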
+
+/* vec_all_ge */
+
+static int __ATTRS_o_ai vec_all_ge(vector signed char __a,
+                                   vector signed char __b) {
+  return __builtin_altivec_vcmpgtsb_p(__CR6_EQ, __b, __a);
+}
+
+static int __ATTRS_o_ai vec_all_ge(vector signed char __a,
+                                   vector bool char __b) {
+  return __builtin_altivec_vcmpgtsb_p(__CR6_EQ, (vector signed char)__b, __a);
+}
+
+static int __ATTRS_o_ai vec_all_ge(vector unsigned char __a,
+                                   vector unsigned char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ, __b, __a);
+}
+
+static int __ATTRS_o_ai vec_all_ge(vector unsigned char __a,
+                                   vector bool char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ, (vector unsigned char)__b, __a);
+}
+
+static int __ATTRS_o_ai vec_all_ge(vector bool char __a,
+                                   vector signed char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ, (vector unsigned char)__b,
+                                      (vector unsigned char)__a);
+}
+
+static int __ATTRS_o_ai vec_all_ge(vector bool char __a,
+                                   vector unsigned char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ, __b, (vector unsigned char)__a);
+}
+
+static int __ATTRS_o_ai vec_all_ge(vector bool char __a, vector bool char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ, (vector unsigned char)__b,
+                                      (vector unsigned char)__a);
+}
+
+static int __ATTRS_o_ai vec_all_ge(vector short __a, vector short __b) {
+  return __builtin_altivec_vcmpgtsh_p(__CR6_EQ, __b, __a);
+}
+
+static int __ATTRS_o_ai vec_all_ge(vector short __a, vector bool short __b) {
+  return __builtin_altivec_vcmpgtsh_p(__CR6_EQ, (vector short)__b, __a);
+}
+
+static int __ATTRS_o_ai vec_all_ge(vector unsigned short __a,
+                                   vector unsigned short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ, __b, __a);
+}
+
+static int __ATTRS_o_ai vec_all_ge(vector unsigned short __a,
+                                   vector bool short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ, (vector unsigned short)__b,
+                                      __a);
+}
+
+static int __ATTRS_o_ai vec_all_ge(vector bool short __a, vector short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ, (vector unsigned short)__b,
+                                      (vector unsigned short)__a);
+}
+
+static int __ATTRS_o_ai vec_all_ge(vector bool short __a,
+                                   vector unsigned short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ, __b,
+                                      (vector unsigned short)__a);
+}
+
+static int __ATTRS_o_ai vec_all_ge(vector bool short __a,
+                                   vector bool short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ, (vector unsigned short)__b,
+                                      (vector unsigned short)__a);
+}
+
+static int __ATTRS_o_ai vec_all_ge(vector int __a, vector int __b) {
+  return __builtin_altivec_vcmpgtsw_p(__CR6_EQ, __b, __a);
+}
+
+static int __ATTRS_o_ai vec_all_ge(vector int __a, vector bool int __b) {
+  return __builtin_altivec_vcmpgtsw_p(__CR6_EQ, (vector int)__b, __a);
+}
+
+static int __ATTRS_o_ai vec_all_ge(vector unsigned int __a,
+                                   vector unsigned int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ, __b, __a);
+}
+
+static int __ATTRS_o_ai vec_all_ge(vector unsigned int __a,
+                                   vector bool int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ, (vector unsigned int)__b, __a);
+}
+
+static int __ATTRS_o_ai vec_all_ge(vector bool int __a, vector int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ, (vector unsigned int)__b,
+                                      (vector unsigned int)__a);
+}
+
+static int __ATTRS_o_ai vec_all_ge(vector bool int __a,
+                                   vector unsigned int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ, __b, (vector unsigned int)__a);
+}
+
+static int __ATTRS_o_ai vec_all_ge(vector bool int __a, vector bool int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ, (vector unsigned int)__b,
+                                      (vector unsigned int)__a);
+}
+
+#ifdef __POWER8_VECTOR__
+static int __ATTRS_o_ai vec_all_ge(vector signed long long __a,
+                                   vector signed long long __b) {
+  return __builtin_altivec_vcmpgtsd_p(__CR6_EQ, __b, __a);
+}
+static int __ATTRS_o_ai vec_all_ge(vector signed long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpgtsd_p(__CR6_EQ, (vector signed long long)__b,
+                                      __a);
+}
+
+static int __ATTRS_o_ai vec_all_ge(vector unsigned long long __a,
+                                   vector unsigned long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ, __b, __a);
+}
+
+static int __ATTRS_o_ai vec_all_ge(vector unsigned long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ, (vector unsigned long long)__b,
+                                      __a);
+}
+
+static int __ATTRS_o_ai vec_all_ge(vector bool long long __a,
+                                   vector signed long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ, (vector unsigned long long)__b,
+                                      (vector unsigned long long)__a);
+}
+
+static int __ATTRS_o_ai vec_all_ge(vector bool long long __a,
+                                   vector unsigned long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ, __b,
+                                      (vector unsigned long long)__a);
+}
+
+static int __ATTRS_o_ai vec_all_ge(vector bool long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ, (vector unsigned long long)__b,
+                                      (vector unsigned long long)__a);
+}
+#endif
+
+static int __ATTRS_o_ai vec_all_ge(vector float __a, vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpgesp_p(__CR6_LT, __a, __b);
+#else
+  return __builtin_altivec_vcmpgefp_p(__CR6_LT, __a, __b);
+#endif
+}
+
+#ifdef __VSX__
+static int __ATTRS_o_ai vec_all_ge(vector double __a, vector double __b) {
+  return __builtin_vsx_xvcmpgedp_p(__CR6_LT, __a, __b);
+}
+#endif
+
+/* vec_all_gt */
+
+static int __ATTRS_o_ai vec_all_gt(vector signed char __a,
+                                   vector signed char __b) {
+  return __builtin_altivec_vcmpgtsb_p(__CR6_LT, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_all_gt(vector signed char __a,
+                                   vector bool char __b) {
+  return __builtin_altivec_vcmpgtsb_p(__CR6_LT, __a, (vector signed char)__b);
+}
+
+static int __ATTRS_o_ai vec_all_gt(vector unsigned char __a,
+                                   vector unsigned char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_all_gt(vector unsigned char __a,
+                                   vector bool char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT, __a, (vector unsigned char)__b);
+}
+
+static int __ATTRS_o_ai vec_all_gt(vector bool char __a,
+                                   vector signed char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT, (vector unsigned char)__a,
+                                      (vector unsigned char)__b);
+}
+
+static int __ATTRS_o_ai vec_all_gt(vector bool char __a,
+                                   vector unsigned char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT, (vector unsigned char)__a, __b);
+}
+
+static int __ATTRS_o_ai vec_all_gt(vector bool char __a, vector bool char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT, (vector unsigned char)__a,
+                                      (vector unsigned char)__b);
+}
+
+static int __ATTRS_o_ai vec_all_gt(vector short __a, vector short __b) {
+  return __builtin_altivec_vcmpgtsh_p(__CR6_LT, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_all_gt(vector short __a, vector bool short __b) {
+  return __builtin_altivec_vcmpgtsh_p(__CR6_LT, __a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai vec_all_gt(vector unsigned short __a,
+                                   vector unsigned short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_all_gt(vector unsigned short __a,
+                                   vector bool short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT, __a,
+                                      (vector unsigned short)__b);
+}
+
+static int __ATTRS_o_ai vec_all_gt(vector bool short __a, vector short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT, (vector unsigned short)__a,
+                                      (vector unsigned short)__b);
+}
+
+static int __ATTRS_o_ai vec_all_gt(vector bool short __a,
+                                   vector unsigned short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT, (vector unsigned short)__a,
+                                      __b);
+}
+
+static int __ATTRS_o_ai vec_all_gt(vector bool short __a,
+                                   vector bool short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT, (vector unsigned short)__a,
+                                      (vector unsigned short)__b);
+}
+
+static int __ATTRS_o_ai vec_all_gt(vector int __a, vector int __b) {
+  return __builtin_altivec_vcmpgtsw_p(__CR6_LT, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_all_gt(vector int __a, vector bool int __b) {
+  return __builtin_altivec_vcmpgtsw_p(__CR6_LT, __a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai vec_all_gt(vector unsigned int __a,
+                                   vector unsigned int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_all_gt(vector unsigned int __a,
+                                   vector bool int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT, __a, (vector unsigned int)__b);
+}
+
+static int __ATTRS_o_ai vec_all_gt(vector bool int __a, vector int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT, (vector unsigned int)__a,
+                                      (vector unsigned int)__b);
+}
+
+static int __ATTRS_o_ai vec_all_gt(vector bool int __a,
+                                   vector unsigned int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT, (vector unsigned int)__a, __b);
+}
+
+static int __ATTRS_o_ai vec_all_gt(vector bool int __a, vector bool int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT, (vector unsigned int)__a,
+                                      (vector unsigned int)__b);
+}
+
+#ifdef __POWER8_VECTOR__
+static int __ATTRS_o_ai vec_all_gt(vector signed long long __a,
+                                   vector signed long long __b) {
+  return __builtin_altivec_vcmpgtsd_p(__CR6_LT, __a, __b);
+}
+static int __ATTRS_o_ai vec_all_gt(vector signed long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpgtsd_p(__CR6_LT, __a,
+                                      (vector signed long long)__b);
+}
+
+static int __ATTRS_o_ai vec_all_gt(vector unsigned long long __a,
+                                   vector unsigned long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_all_gt(vector unsigned long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT, __a,
+                                      (vector unsigned long long)__b);
+}
+
+static int __ATTRS_o_ai vec_all_gt(vector bool long long __a,
+                                   vector signed long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT, (vector unsigned long long)__a,
+                                      (vector unsigned long long)__b);
+}
+
+static int __ATTRS_o_ai vec_all_gt(vector bool long long __a,
+                                   vector unsigned long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT, (vector unsigned long long)__a,
+                                      __b);
+}
+
+static int __ATTRS_o_ai vec_all_gt(vector bool long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT, (vector unsigned long long)__a,
+                                      (vector unsigned long long)__b);
+}
+#endif
+
+static int __ATTRS_o_ai vec_all_gt(vector float __a, vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpgtsp_p(__CR6_LT, __a, __b);
+#else
+  return __builtin_altivec_vcmpgtfp_p(__CR6_LT, __a, __b);
+#endif
+}
+
+#ifdef __VSX__
+static int __ATTRS_o_ai vec_all_gt(vector double __a, vector double __b) {
+  return __builtin_vsx_xvcmpgtdp_p(__CR6_LT, __a, __b);
+}
+#endif
+
+/* vec_all_in */
+
+static int __attribute__((__always_inline__))
+vec_all_in(vector float __a, vector float __b) {
+  return __builtin_altivec_vcmpbfp_p(__CR6_EQ, __a, __b);
+}
+
+/* vec_all_le */
+
+static int __ATTRS_o_ai vec_all_le(vector signed char __a,
+                                   vector signed char __b) {
+  return __builtin_altivec_vcmpgtsb_p(__CR6_EQ, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_all_le(vector signed char __a,
+                                   vector bool char __b) {
+  return __builtin_altivec_vcmpgtsb_p(__CR6_EQ, __a, (vector signed char)__b);
+}
+
+static int __ATTRS_o_ai vec_all_le(vector unsigned char __a,
+                                   vector unsigned char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_all_le(vector unsigned char __a,
+                                   vector bool char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ, __a, (vector unsigned char)__b);
+}
+
+static int __ATTRS_o_ai vec_all_le(vector bool char __a,
+                                   vector signed char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ, (vector unsigned char)__a,
+                                      (vector unsigned char)__b);
+}
+
+static int __ATTRS_o_ai vec_all_le(vector bool char __a,
+                                   vector unsigned char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ, (vector unsigned char)__a, __b);
+}
+
+static int __ATTRS_o_ai vec_all_le(vector bool char __a, vector bool char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ, (vector unsigned char)__a,
+                                      (vector unsigned char)__b);
+}
+
+static int __ATTRS_o_ai vec_all_le(vector short __a, vector short __b) {
+  return __builtin_altivec_vcmpgtsh_p(__CR6_EQ, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_all_le(vector short __a, vector bool short __b) {
+  return __builtin_altivec_vcmpgtsh_p(__CR6_EQ, __a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai vec_all_le(vector unsigned short __a,
+                                   vector unsigned short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_all_le(vector unsigned short __a,
+                                   vector bool short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ, __a,
+                                      (vector unsigned short)__b);
+}
+
+static int __ATTRS_o_ai vec_all_le(vector bool short __a, vector short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ, (vector unsigned short)__a,
+                                      (vector unsigned short)__b);
+}
+
+static int __ATTRS_o_ai vec_all_le(vector bool short __a,
+                                   vector unsigned short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ, (vector unsigned short)__a,
+                                      __b);
+}
+
+static int __ATTRS_o_ai vec_all_le(vector bool short __a,
+                                   vector bool short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ, (vector unsigned short)__a,
+                                      (vector unsigned short)__b);
+}
+
+static int __ATTRS_o_ai vec_all_le(vector int __a, vector int __b) {
+  return __builtin_altivec_vcmpgtsw_p(__CR6_EQ, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_all_le(vector int __a, vector bool int __b) {
+  return __builtin_altivec_vcmpgtsw_p(__CR6_EQ, __a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai vec_all_le(vector unsigned int __a,
+                                   vector unsigned int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_all_le(vector unsigned int __a,
+                                   vector bool int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ, __a, (vector unsigned int)__b);
+}
+
+static int __ATTRS_o_ai vec_all_le(vector bool int __a, vector int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ, (vector unsigned int)__a,
+                                      (vector unsigned int)__b);
+}
+
+static int __ATTRS_o_ai vec_all_le(vector bool int __a,
+                                   vector unsigned int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ, (vector unsigned int)__a, __b);
+}
+
+static int __ATTRS_o_ai vec_all_le(vector bool int __a, vector bool int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ, (vector unsigned int)__a,
+                                      (vector unsigned int)__b);
+}
+
+#ifdef __POWER8_VECTOR__
+static int __ATTRS_o_ai vec_all_le(vector signed long long __a,
+                                   vector signed long long __b) {
+  return __builtin_altivec_vcmpgtsd_p(__CR6_EQ, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_all_le(vector unsigned long long __a,
+                                   vector unsigned long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_all_le(vector signed long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpgtsd_p(__CR6_EQ, __a,
+                                      (vector signed long long)__b);
+}
+
+static int __ATTRS_o_ai vec_all_le(vector unsigned long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ, __a,
+                                      (vector unsigned long long)__b);
+}
+
+static int __ATTRS_o_ai vec_all_le(vector bool long long __a,
+                                   vector signed long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ, (vector unsigned long long)__a,
+                                      (vector unsigned long long)__b);
+}
+
+static int __ATTRS_o_ai vec_all_le(vector bool long long __a,
+                                   vector unsigned long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ, (vector unsigned long long)__a,
+                                      __b);
+}
+
+static int __ATTRS_o_ai vec_all_le(vector bool long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ, (vector unsigned long long)__a,
+                                      (vector unsigned long long)__b);
+}
+#endif
+
+static int __ATTRS_o_ai vec_all_le(vector float __a, vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpgesp_p(__CR6_LT, __b, __a);
+#else
+  return __builtin_altivec_vcmpgefp_p(__CR6_LT, __b, __a);
+#endif
+}
+
+#ifdef __VSX__
+static int __ATTRS_o_ai vec_all_le(vector double __a, vector double __b) {
+  return __builtin_vsx_xvcmpgedp_p(__CR6_LT, __b, __a);
+}
+#endif
+
+/* vec_all_lt */
+
+static int __ATTRS_o_ai vec_all_lt(vector signed char __a,
+                                   vector signed char __b) {
+  return __builtin_altivec_vcmpgtsb_p(__CR6_LT, __b, __a);
+}
+
+static int __ATTRS_o_ai vec_all_lt(vector signed char __a,
+                                   vector bool char __b) {
+  return __builtin_altivec_vcmpgtsb_p(__CR6_LT, (vector signed char)__b, __a);
+}
+
+static int __ATTRS_o_ai vec_all_lt(vector unsigned char __a,
+                                   vector unsigned char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT, __b, __a);
+}
+
+static int __ATTRS_o_ai vec_all_lt(vector unsigned char __a,
+                                   vector bool char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT, (vector unsigned char)__b, __a);
+}
+
+static int __ATTRS_o_ai vec_all_lt(vector bool char __a,
+                                   vector signed char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT, (vector unsigned char)__b,
+                                      (vector unsigned char)__a);
+}
+
+static int __ATTRS_o_ai vec_all_lt(vector bool char __a,
+                                   vector unsigned char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT, __b, (vector unsigned char)__a);
+}
+
+static int __ATTRS_o_ai vec_all_lt(vector bool char __a, vector bool char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT, (vector unsigned char)__b,
+                                      (vector unsigned char)__a);
+}
+
+static int __ATTRS_o_ai vec_all_lt(vector short __a, vector short __b) {
+  return __builtin_altivec_vcmpgtsh_p(__CR6_LT, __b, __a);
+}
+
+static int __ATTRS_o_ai vec_all_lt(vector short __a, vector bool short __b) {
+  return __builtin_altivec_vcmpgtsh_p(__CR6_LT, (vector short)__b, __a);
+}
+
+static int __ATTRS_o_ai vec_all_lt(vector unsigned short __a,
+                                   vector unsigned short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT, __b, __a);
+}
+
+static int __ATTRS_o_ai vec_all_lt(vector unsigned short __a,
+                                   vector bool short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT, (vector unsigned short)__b,
+                                      __a);
+}
+
+static int __ATTRS_o_ai vec_all_lt(vector bool short __a, vector short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT, (vector unsigned short)__b,
+                                      (vector unsigned short)__a);
+}
+
+static int __ATTRS_o_ai vec_all_lt(vector bool short __a,
+                                   vector unsigned short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT, __b,
+                                      (vector unsigned short)__a);
+}
+
+static int __ATTRS_o_ai vec_all_lt(vector bool short __a,
+                                   vector bool short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT, (vector unsigned short)__b,
+                                      (vector unsigned short)__a);
+}
+
+static int __ATTRS_o_ai vec_all_lt(vector int __a, vector int __b) {
+  return __builtin_altivec_vcmpgtsw_p(__CR6_LT, __b, __a);
+}
+
+static int __ATTRS_o_ai vec_all_lt(vector int __a, vector bool int __b) {
+  return __builtin_altivec_vcmpgtsw_p(__CR6_LT, (vector int)__b, __a);
+}
+
+static int __ATTRS_o_ai vec_all_lt(vector unsigned int __a,
+                                   vector unsigned int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT, __b, __a);
+}
+
+static int __ATTRS_o_ai vec_all_lt(vector unsigned int __a,
+                                   vector bool int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT, (vector unsigned int)__b, __a);
+}
+
+static int __ATTRS_o_ai vec_all_lt(vector bool int __a, vector int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT, (vector unsigned int)__b,
+                                      (vector unsigned int)__a);
+}
+
+static int __ATTRS_o_ai vec_all_lt(vector bool int __a,
+                                   vector unsigned int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT, __b, (vector unsigned int)__a);
+}
+
+static int __ATTRS_o_ai vec_all_lt(vector bool int __a, vector bool int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT, (vector unsigned int)__b,
+                                      (vector unsigned int)__a);
+}
+
+#ifdef __POWER8_VECTOR__
+static int __ATTRS_o_ai vec_all_lt(vector signed long long __a,
+                                   vector signed long long __b) {
+  return __builtin_altivec_vcmpgtsd_p(__CR6_LT, __b, __a);
+}
+
+static int __ATTRS_o_ai vec_all_lt(vector unsigned long long __a,
+                                   vector unsigned long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT, __b, __a);
+}
+
+static int __ATTRS_o_ai vec_all_lt(vector signed long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpgtsd_p(__CR6_LT, (vector signed long long)__b,
+                                      __a);
+}
+
+static int __ATTRS_o_ai vec_all_lt(vector unsigned long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT, (vector unsigned long long)__b,
+                                      __a);
+}
+
+static int __ATTRS_o_ai vec_all_lt(vector bool long long __a,
+                                   vector signed long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT, (vector unsigned long long)__b,
+                                      (vector unsigned long long)__a);
+}
+
+static int __ATTRS_o_ai vec_all_lt(vector bool long long __a,
+                                   vector unsigned long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT, __b,
+                                      (vector unsigned long long)__a);
+}
+
+static int __ATTRS_o_ai vec_all_lt(vector bool long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT, (vector unsigned long long)__b,
+                                      (vector unsigned long long)__a);
+}
+#endif
+
+static int __ATTRS_o_ai vec_all_lt(vector float __a, vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpgtsp_p(__CR6_LT, __b, __a);
+#else
+  return __builtin_altivec_vcmpgtfp_p(__CR6_LT, __b, __a);
+#endif
+}
+
+#ifdef __VSX__
+static int __ATTRS_o_ai vec_all_lt(vector double __a, vector double __b) {
+  return __builtin_vsx_xvcmpgtdp_p(__CR6_LT, __b, __a);
+}
+#endif
+
+/* vec_all_nan */
+
+static int __ATTRS_o_ai vec_all_nan(vector float __a) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpeqsp_p(__CR6_EQ, __a, __a);
+#else
+  return __builtin_altivec_vcmpeqfp_p(__CR6_EQ, __a, __a);
+#endif
+}
+
+#ifdef __VSX__
+static int __ATTRS_o_ai vec_all_nan(vector double __a) {
+  return __builtin_vsx_xvcmpeqdp_p(__CR6_EQ, __a, __a);
+}
+#endif
+
+/* vec_all_ne */
+
+static int __ATTRS_o_ai vec_all_ne(vector signed char __a,
+                                   vector signed char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static int __ATTRS_o_ai vec_all_ne(vector signed char __a,
+                                   vector bool char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static int __ATTRS_o_ai vec_all_ne(vector unsigned char __a,
+                                   vector unsigned char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static int __ATTRS_o_ai vec_all_ne(vector unsigned char __a,
+                                   vector bool char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static int __ATTRS_o_ai vec_all_ne(vector bool char __a,
+                                   vector signed char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static int __ATTRS_o_ai vec_all_ne(vector bool char __a,
+                                   vector unsigned char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static int __ATTRS_o_ai vec_all_ne(vector bool char __a, vector bool char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static int __ATTRS_o_ai vec_all_ne(vector short __a, vector short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_all_ne(vector short __a, vector bool short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ, __a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai vec_all_ne(vector unsigned short __a,
+                                   vector unsigned short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai vec_all_ne(vector unsigned short __a,
+                                   vector bool short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai vec_all_ne(vector bool short __a, vector short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai vec_all_ne(vector bool short __a,
+                                   vector unsigned short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai vec_all_ne(vector bool short __a,
+                                   vector bool short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai vec_all_ne(vector pixel __a, vector pixel __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai vec_all_ne(vector int __a, vector int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_all_ne(vector int __a, vector bool int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ, __a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai vec_all_ne(vector unsigned int __a,
+                                   vector unsigned int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ, (vector int)__a,
+                                      (vector int)__b);
+}
+
+static int __ATTRS_o_ai vec_all_ne(vector unsigned int __a,
+                                   vector bool int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ, (vector int)__a,
+                                      (vector int)__b);
+}
+
+static int __ATTRS_o_ai vec_all_ne(vector bool int __a, vector int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ, (vector int)__a,
+                                      (vector int)__b);
+}
+
+static int __ATTRS_o_ai vec_all_ne(vector bool int __a,
+                                   vector unsigned int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ, (vector int)__a,
+                                      (vector int)__b);
+}
+
+static int __ATTRS_o_ai vec_all_ne(vector bool int __a, vector bool int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ, (vector int)__a,
+                                      (vector int)__b);
+}
+
+#ifdef __POWER8_VECTOR__
+static int __ATTRS_o_ai vec_all_ne(vector signed long long __a,
+                                   vector signed long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_EQ, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_all_ne(vector unsigned long long __a,
+                                   vector unsigned long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_EQ, (vector long long)__a,
+                                      (vector long long)__b);
+}
+
+static int __ATTRS_o_ai vec_all_ne(vector signed long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_EQ, __a,
+                                      (vector signed long long)__b);
+}
+
+static int __ATTRS_o_ai vec_all_ne(vector unsigned long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_EQ, (vector signed long long)__a,
+                                      (vector signed long long)__b);
+}
+
+static int __ATTRS_o_ai vec_all_ne(vector bool long long __a,
+                                   vector signed long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_EQ, (vector signed long long)__a,
+                                      (vector signed long long)__b);
+}
+
+static int __ATTRS_o_ai vec_all_ne(vector bool long long __a,
+                                   vector unsigned long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_EQ, (vector signed long long)__a,
+                                      (vector signed long long)__b);
+}
+
+static int __ATTRS_o_ai vec_all_ne(vector bool long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_EQ, (vector signed long long)__a,
+                                      (vector signed long long)__b);
+}
+#endif
+
+static int __ATTRS_o_ai vec_all_ne(vector float __a, vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpeqsp_p(__CR6_EQ, __a, __b);
+#else
+  return __builtin_altivec_vcmpeqfp_p(__CR6_EQ, __a, __b);
+#endif
+}
+
+#ifdef __VSX__
+static int __ATTRS_o_ai vec_all_ne(vector double __a, vector double __b) {
+  return __builtin_vsx_xvcmpeqdp_p(__CR6_EQ, __a, __b);
+}
+#endif
+
+/* vec_all_nge */
+
+static int __ATTRS_o_ai
+vec_all_nge(vector float __a, vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpgesp_p(__CR6_EQ, __a, __b);
+#else
+  return __builtin_altivec_vcmpgefp_p(__CR6_EQ, __a, __b);
+#endif
+}
+
+#ifdef __VSX__
+static int __ATTRS_o_ai
+vec_all_nge(vector double __a, vector double __b) {
+  return __builtin_vsx_xvcmpgedp_p(__CR6_EQ, __a, __b);
+}
+#endif
+
+/* vec_all_ngt */
+
+static int __ATTRS_o_ai
+vec_all_ngt(vector float __a, vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpgtsp_p(__CR6_EQ, __a, __b);
+#else
+  return __builtin_altivec_vcmpgtfp_p(__CR6_EQ, __a, __b);
+#endif
+}
+
+#ifdef __VSX__
+static int __ATTRS_o_ai
+vec_all_ngt(vector double __a, vector double __b) {
+  return __builtin_vsx_xvcmpgtdp_p(__CR6_EQ, __a, __b);
+}
+#endif
+
+/* vec_all_nle */
+
+static int __attribute__((__always_inline__))
+vec_all_nle(vector float __a, vector float __b) {
+  return __builtin_altivec_vcmpgefp_p(__CR6_EQ, __b, __a);
+}
+
+/* vec_all_nlt */
+
+static int __attribute__((__always_inline__))
+vec_all_nlt(vector float __a, vector float __b) {
+  return __builtin_altivec_vcmpgtfp_p(__CR6_EQ, __b, __a);
+}
+
+/* vec_all_numeric */
+
+static int __attribute__((__always_inline__))
+vec_all_numeric(vector float __a) {
+  return __builtin_altivec_vcmpeqfp_p(__CR6_LT, __a, __a);
+}
+
+/* vec_any_eq */
+
+static int __ATTRS_o_ai vec_any_eq(vector signed char __a,
+                                   vector signed char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ_REV, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static int __ATTRS_o_ai vec_any_eq(vector signed char __a,
+                                   vector bool char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ_REV, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static int __ATTRS_o_ai vec_any_eq(vector unsigned char __a,
+                                   vector unsigned char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ_REV, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static int __ATTRS_o_ai vec_any_eq(vector unsigned char __a,
+                                   vector bool char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ_REV, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static int __ATTRS_o_ai vec_any_eq(vector bool char __a,
+                                   vector signed char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ_REV, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static int __ATTRS_o_ai vec_any_eq(vector bool char __a,
+                                   vector unsigned char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ_REV, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static int __ATTRS_o_ai vec_any_eq(vector bool char __a, vector bool char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ_REV, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static int __ATTRS_o_ai vec_any_eq(vector short __a, vector short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_any_eq(vector short __a, vector bool short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ_REV, __a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai vec_any_eq(vector unsigned short __a,
+                                   vector unsigned short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ_REV, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai vec_any_eq(vector unsigned short __a,
+                                   vector bool short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ_REV, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai vec_any_eq(vector bool short __a, vector short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ_REV, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai vec_any_eq(vector bool short __a,
+                                   vector unsigned short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ_REV, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai vec_any_eq(vector bool short __a,
+                                   vector bool short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ_REV, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai vec_any_eq(vector pixel __a, vector pixel __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ_REV, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai vec_any_eq(vector int __a, vector int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_any_eq(vector int __a, vector bool int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ_REV, __a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai vec_any_eq(vector unsigned int __a,
+                                   vector unsigned int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ_REV, (vector int)__a,
+                                      (vector int)__b);
+}
+
+static int __ATTRS_o_ai vec_any_eq(vector unsigned int __a,
+                                   vector bool int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ_REV, (vector int)__a,
+                                      (vector int)__b);
+}
+
+static int __ATTRS_o_ai vec_any_eq(vector bool int __a, vector int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ_REV, (vector int)__a,
+                                      (vector int)__b);
+}
+
+static int __ATTRS_o_ai vec_any_eq(vector bool int __a,
+                                   vector unsigned int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ_REV, (vector int)__a,
+                                      (vector int)__b);
+}
+
+static int __ATTRS_o_ai vec_any_eq(vector bool int __a, vector bool int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ_REV, (vector int)__a,
+                                      (vector int)__b);
+}
+
+#ifdef __POWER8_VECTOR__
+static int __ATTRS_o_ai vec_any_eq(vector signed long long __a,
+                                   vector signed long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_EQ_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_any_eq(vector unsigned long long __a,
+                                   vector unsigned long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_EQ_REV, (vector long long)__a,
+                                      (vector long long)__b);
+}
+
+static int __ATTRS_o_ai vec_any_eq(vector signed long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_EQ_REV, __a,
+                                      (vector signed long long)__b);
+}
+
+static int __ATTRS_o_ai vec_any_eq(vector unsigned long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpequd_p(
+      __CR6_EQ_REV, (vector signed long long)__a, (vector signed long long)__b);
+}
+
+static int __ATTRS_o_ai vec_any_eq(vector bool long long __a,
+                                   vector signed long long __b) {
+  return __builtin_altivec_vcmpequd_p(
+      __CR6_EQ_REV, (vector signed long long)__a, (vector signed long long)__b);
+}
+
+static int __ATTRS_o_ai vec_any_eq(vector bool long long __a,
+                                   vector unsigned long long __b) {
+  return __builtin_altivec_vcmpequd_p(
+      __CR6_EQ_REV, (vector signed long long)__a, (vector signed long long)__b);
+}
+
+static int __ATTRS_o_ai vec_any_eq(vector bool long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpequd_p(
+      __CR6_EQ_REV, (vector signed long long)__a, (vector signed long long)__b);
+}
+#endif
+
+static int __ATTRS_o_ai vec_any_eq(vector float __a, vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpeqsp_p(__CR6_EQ_REV, __a, __b);
+#else
+  return __builtin_altivec_vcmpeqfp_p(__CR6_EQ_REV, __a, __b);
+#endif
+}
+
+#ifdef __VSX__
+static int __ATTRS_o_ai vec_any_eq(vector double __a, vector double __b) {
+  return __builtin_vsx_xvcmpeqdp_p(__CR6_EQ_REV, __a, __b);
+}
+#endif
+
+/* vec_any_ge */
+
+static int __ATTRS_o_ai vec_any_ge(vector signed char __a,
+                                   vector signed char __b) {
+  return __builtin_altivec_vcmpgtsb_p(__CR6_LT_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai vec_any_ge(vector signed char __a,
+                                   vector bool char __b) {
+  return __builtin_altivec_vcmpgtsb_p(__CR6_LT_REV, (vector signed char)__b,
+                                      __a);
+}
+
+static int __ATTRS_o_ai vec_any_ge(vector unsigned char __a,
+                                   vector unsigned char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai vec_any_ge(vector unsigned char __a,
+                                   vector bool char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV, (vector unsigned char)__b,
+                                      __a);
+}
+
+static int __ATTRS_o_ai vec_any_ge(vector bool char __a,
+                                   vector signed char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV, (vector unsigned char)__b,
+                                      (vector unsigned char)__a);
+}
+
+static int __ATTRS_o_ai vec_any_ge(vector bool char __a,
+                                   vector unsigned char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV, __b,
+                                      (vector unsigned char)__a);
+}
+
+static int __ATTRS_o_ai vec_any_ge(vector bool char __a, vector bool char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV, (vector unsigned char)__b,
+                                      (vector unsigned char)__a);
+}
+
+static int __ATTRS_o_ai vec_any_ge(vector short __a, vector short __b) {
+  return __builtin_altivec_vcmpgtsh_p(__CR6_LT_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai vec_any_ge(vector short __a, vector bool short __b) {
+  return __builtin_altivec_vcmpgtsh_p(__CR6_LT_REV, (vector short)__b, __a);
+}
+
+static int __ATTRS_o_ai vec_any_ge(vector unsigned short __a,
+                                   vector unsigned short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai vec_any_ge(vector unsigned short __a,
+                                   vector bool short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV, (vector unsigned short)__b,
+                                      __a);
+}
+
+static int __ATTRS_o_ai vec_any_ge(vector bool short __a, vector short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV, (vector unsigned short)__b,
+                                      (vector unsigned short)__a);
+}
+
+static int __ATTRS_o_ai vec_any_ge(vector bool short __a,
+                                   vector unsigned short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV, __b,
+                                      (vector unsigned short)__a);
+}
+
+static int __ATTRS_o_ai vec_any_ge(vector bool short __a,
+                                   vector bool short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV, (vector unsigned short)__b,
+                                      (vector unsigned short)__a);
+}
+
+static int __ATTRS_o_ai vec_any_ge(vector int __a, vector int __b) {
+  return __builtin_altivec_vcmpgtsw_p(__CR6_LT_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai vec_any_ge(vector int __a, vector bool int __b) {
+  return __builtin_altivec_vcmpgtsw_p(__CR6_LT_REV, (vector int)__b, __a);
+}
+
+static int __ATTRS_o_ai vec_any_ge(vector unsigned int __a,
+                                   vector unsigned int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai vec_any_ge(vector unsigned int __a,
+                                   vector bool int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV, (vector unsigned int)__b,
+                                      __a);
+}
+
+static int __ATTRS_o_ai vec_any_ge(vector bool int __a, vector int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV, (vector unsigned int)__b,
+                                      (vector unsigned int)__a);
+}
+
+static int __ATTRS_o_ai vec_any_ge(vector bool int __a,
+                                   vector unsigned int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV, __b,
+                                      (vector unsigned int)__a);
+}
+
+static int __ATTRS_o_ai vec_any_ge(vector bool int __a, vector bool int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV, (vector unsigned int)__b,
+                                      (vector unsigned int)__a);
+}
+
+#ifdef __POWER8_VECTOR__
+static int __ATTRS_o_ai vec_any_ge(vector signed long long __a,
+                                   vector signed long long __b) {
+  return __builtin_altivec_vcmpgtsd_p(__CR6_LT_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai vec_any_ge(vector unsigned long long __a,
+                                   vector unsigned long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai vec_any_ge(vector signed long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpgtsd_p(__CR6_LT_REV,
+                                      (vector signed long long)__b, __a);
+}
+
+static int __ATTRS_o_ai vec_any_ge(vector unsigned long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV,
+                                      (vector unsigned long long)__b, __a);
+}
+
+static int __ATTRS_o_ai vec_any_ge(vector bool long long __a,
+                                   vector signed long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV,
+                                      (vector unsigned long long)__b,
+                                      (vector unsigned long long)__a);
+}
+
+static int __ATTRS_o_ai vec_any_ge(vector bool long long __a,
+                                   vector unsigned long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV, __b,
+                                      (vector unsigned long long)__a);
+}
+
+static int __ATTRS_o_ai vec_any_ge(vector bool long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV,
+                                      (vector unsigned long long)__b,
+                                      (vector unsigned long long)__a);
+}
+#endif
+
+static int __ATTRS_o_ai vec_any_ge(vector float __a, vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpgesp_p(__CR6_EQ_REV, __a, __b);
+#else
+  return __builtin_altivec_vcmpgefp_p(__CR6_EQ_REV, __a, __b);
+#endif
+}
+
+#ifdef __VSX__
+static int __ATTRS_o_ai vec_any_ge(vector double __a, vector double __b) {
+  return __builtin_vsx_xvcmpgedp_p(__CR6_EQ_REV, __a, __b);
+}
+#endif
+
+/* vec_any_gt */
+
+static int __ATTRS_o_ai vec_any_gt(vector signed char __a,
+                                   vector signed char __b) {
+  return __builtin_altivec_vcmpgtsb_p(__CR6_EQ_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_any_gt(vector signed char __a,
+                                   vector bool char __b) {
+  return __builtin_altivec_vcmpgtsb_p(__CR6_EQ_REV, __a,
+                                      (vector signed char)__b);
+}
+
+static int __ATTRS_o_ai vec_any_gt(vector unsigned char __a,
+                                   vector unsigned char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_any_gt(vector unsigned char __a,
+                                   vector bool char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV, __a,
+                                      (vector unsigned char)__b);
+}
+
+static int __ATTRS_o_ai vec_any_gt(vector bool char __a,
+                                   vector signed char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV, (vector unsigned char)__a,
+                                      (vector unsigned char)__b);
+}
+
+static int __ATTRS_o_ai vec_any_gt(vector bool char __a,
+                                   vector unsigned char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV, (vector unsigned char)__a,
+                                      __b);
+}
+
+static int __ATTRS_o_ai vec_any_gt(vector bool char __a, vector bool char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV, (vector unsigned char)__a,
+                                      (vector unsigned char)__b);
+}
+
+static int __ATTRS_o_ai vec_any_gt(vector short __a, vector short __b) {
+  return __builtin_altivec_vcmpgtsh_p(__CR6_EQ_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_any_gt(vector short __a, vector bool short __b) {
+  return __builtin_altivec_vcmpgtsh_p(__CR6_EQ_REV, __a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai vec_any_gt(vector unsigned short __a,
+                                   vector unsigned short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_any_gt(vector unsigned short __a,
+                                   vector bool short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV, __a,
+                                      (vector unsigned short)__b);
+}
+
+static int __ATTRS_o_ai vec_any_gt(vector bool short __a, vector short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV, (vector unsigned short)__a,
+                                      (vector unsigned short)__b);
+}
+
+static int __ATTRS_o_ai vec_any_gt(vector bool short __a,
+                                   vector unsigned short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV, (vector unsigned short)__a,
+                                      __b);
+}
+
+static int __ATTRS_o_ai vec_any_gt(vector bool short __a,
+                                   vector bool short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV, (vector unsigned short)__a,
+                                      (vector unsigned short)__b);
+}
+
+static int __ATTRS_o_ai vec_any_gt(vector int __a, vector int __b) {
+  return __builtin_altivec_vcmpgtsw_p(__CR6_EQ_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_any_gt(vector int __a, vector bool int __b) {
+  return __builtin_altivec_vcmpgtsw_p(__CR6_EQ_REV, __a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai vec_any_gt(vector unsigned int __a,
+                                   vector unsigned int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_any_gt(vector unsigned int __a,
+                                   vector bool int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV, __a,
+                                      (vector unsigned int)__b);
+}
+
+static int __ATTRS_o_ai vec_any_gt(vector bool int __a, vector int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV, (vector unsigned int)__a,
+                                      (vector unsigned int)__b);
+}
+
+static int __ATTRS_o_ai vec_any_gt(vector bool int __a,
+                                   vector unsigned int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV, (vector unsigned int)__a,
+                                      __b);
+}
+
+static int __ATTRS_o_ai vec_any_gt(vector bool int __a, vector bool int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV, (vector unsigned int)__a,
+                                      (vector unsigned int)__b);
+}
+
+#ifdef __POWER8_VECTOR__
+static int __ATTRS_o_ai vec_any_gt(vector signed long long __a,
+                                   vector signed long long __b) {
+  return __builtin_altivec_vcmpgtsd_p(__CR6_EQ_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_any_gt(vector unsigned long long __a,
+                                   vector unsigned long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_any_gt(vector signed long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpgtsd_p(__CR6_EQ_REV, __a,
+                                      (vector signed long long)__b);
+}
+
+static int __ATTRS_o_ai vec_any_gt(vector unsigned long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV, __a,
+                                      (vector unsigned long long)__b);
+}
+
+static int __ATTRS_o_ai vec_any_gt(vector bool long long __a,
+                                   vector signed long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV,
+                                      (vector unsigned long long)__a,
+                                      (vector unsigned long long)__b);
+}
+
+static int __ATTRS_o_ai vec_any_gt(vector bool long long __a,
+                                   vector unsigned long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV,
+                                      (vector unsigned long long)__a, __b);
+}
+
+static int __ATTRS_o_ai vec_any_gt(vector bool long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV,
+                                      (vector unsigned long long)__a,
+                                      (vector unsigned long long)__b);
+}
+#endif
+
+static int __ATTRS_o_ai vec_any_gt(vector float __a, vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpgtsp_p(__CR6_EQ_REV, __a, __b);
+#else
+  return __builtin_altivec_vcmpgtfp_p(__CR6_EQ_REV, __a, __b);
+#endif
+}
+
+#ifdef __VSX__
+static int __ATTRS_o_ai vec_any_gt(vector double __a, vector double __b) {
+  return __builtin_vsx_xvcmpgtdp_p(__CR6_EQ_REV, __a, __b);
+}
+#endif
+
+/* vec_any_le */
+
+static int __ATTRS_o_ai vec_any_le(vector signed char __a,
+                                   vector signed char __b) {
+  return __builtin_altivec_vcmpgtsb_p(__CR6_LT_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_any_le(vector signed char __a,
+                                   vector bool char __b) {
+  return __builtin_altivec_vcmpgtsb_p(__CR6_LT_REV, __a,
+                                      (vector signed char)__b);
+}
+
+static int __ATTRS_o_ai vec_any_le(vector unsigned char __a,
+                                   vector unsigned char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_any_le(vector unsigned char __a,
+                                   vector bool char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV, __a,
+                                      (vector unsigned char)__b);
+}
+
+static int __ATTRS_o_ai vec_any_le(vector bool char __a,
+                                   vector signed char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV, (vector unsigned char)__a,
+                                      (vector unsigned char)__b);
+}
+
+static int __ATTRS_o_ai vec_any_le(vector bool char __a,
+                                   vector unsigned char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV, (vector unsigned char)__a,
+                                      __b);
+}
+
+static int __ATTRS_o_ai vec_any_le(vector bool char __a, vector bool char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV, (vector unsigned char)__a,
+                                      (vector unsigned char)__b);
+}
+
+static int __ATTRS_o_ai vec_any_le(vector short __a, vector short __b) {
+  return __builtin_altivec_vcmpgtsh_p(__CR6_LT_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_any_le(vector short __a, vector bool short __b) {
+  return __builtin_altivec_vcmpgtsh_p(__CR6_LT_REV, __a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai vec_any_le(vector unsigned short __a,
+                                   vector unsigned short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_any_le(vector unsigned short __a,
+                                   vector bool short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV, __a,
+                                      (vector unsigned short)__b);
+}
+
+static int __ATTRS_o_ai vec_any_le(vector bool short __a, vector short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV, (vector unsigned short)__a,
+                                      (vector unsigned short)__b);
+}
+
+static int __ATTRS_o_ai vec_any_le(vector bool short __a,
+                                   vector unsigned short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV, (vector unsigned short)__a,
+                                      __b);
+}
+
+static int __ATTRS_o_ai vec_any_le(vector bool short __a,
+                                   vector bool short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV, (vector unsigned short)__a,
+                                      (vector unsigned short)__b);
+}
+
+static int __ATTRS_o_ai vec_any_le(vector int __a, vector int __b) {
+  return __builtin_altivec_vcmpgtsw_p(__CR6_LT_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_any_le(vector int __a, vector bool int __b) {
+  return __builtin_altivec_vcmpgtsw_p(__CR6_LT_REV, __a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai vec_any_le(vector unsigned int __a,
+                                   vector unsigned int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_any_le(vector unsigned int __a,
+                                   vector bool int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV, __a,
+                                      (vector unsigned int)__b);
+}
+
+static int __ATTRS_o_ai vec_any_le(vector bool int __a, vector int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV, (vector unsigned int)__a,
+                                      (vector unsigned int)__b);
+}
+
+static int __ATTRS_o_ai vec_any_le(vector bool int __a,
+                                   vector unsigned int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV, (vector unsigned int)__a,
+                                      __b);
+}
+
+static int __ATTRS_o_ai vec_any_le(vector bool int __a, vector bool int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV, (vector unsigned int)__a,
+                                      (vector unsigned int)__b);
+}
+
+#ifdef __POWER8_VECTOR__
+static int __ATTRS_o_ai vec_any_le(vector signed long long __a,
+                                   vector signed long long __b) {
+  return __builtin_altivec_vcmpgtsd_p(__CR6_LT_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_any_le(vector unsigned long long __a,
+                                   vector unsigned long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_any_le(vector signed long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpgtsd_p(__CR6_LT_REV, __a,
+                                      (vector signed long long)__b);
+}
+
+static int __ATTRS_o_ai vec_any_le(vector unsigned long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV, __a,
+                                      (vector unsigned long long)__b);
+}
+
+static int __ATTRS_o_ai vec_any_le(vector bool long long __a,
+                                   vector signed long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV,
+                                      (vector unsigned long long)__a,
+                                      (vector unsigned long long)__b);
+}
+
+static int __ATTRS_o_ai vec_any_le(vector bool long long __a,
+                                   vector unsigned long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV,
+                                      (vector unsigned long long)__a, __b);
+}
+
+static int __ATTRS_o_ai vec_any_le(vector bool long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV,
+                                      (vector unsigned long long)__a,
+                                      (vector unsigned long long)__b);
+}
+#endif
+
+static int __ATTRS_o_ai vec_any_le(vector float __a, vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpgesp_p(__CR6_EQ_REV, __b, __a);
+#else
+  return __builtin_altivec_vcmpgefp_p(__CR6_EQ_REV, __b, __a);
+#endif
+}
+
+#ifdef __VSX__
+static int __ATTRS_o_ai vec_any_le(vector double __a, vector double __b) {
+  return __builtin_vsx_xvcmpgedp_p(__CR6_EQ_REV, __b, __a);
+}
+#endif
+
+/* vec_any_lt */
+
+static int __ATTRS_o_ai vec_any_lt(vector signed char __a,
+                                   vector signed char __b) {
+  return __builtin_altivec_vcmpgtsb_p(__CR6_EQ_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai vec_any_lt(vector signed char __a,
+                                   vector bool char __b) {
+  return __builtin_altivec_vcmpgtsb_p(__CR6_EQ_REV, (vector signed char)__b,
+                                      __a);
+}
+
+static int __ATTRS_o_ai vec_any_lt(vector unsigned char __a,
+                                   vector unsigned char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai vec_any_lt(vector unsigned char __a,
+                                   vector bool char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV, (vector unsigned char)__b,
+                                      __a);
+}
+
+static int __ATTRS_o_ai vec_any_lt(vector bool char __a,
+                                   vector signed char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV, (vector unsigned char)__b,
+                                      (vector unsigned char)__a);
+}
+
+static int __ATTRS_o_ai vec_any_lt(vector bool char __a,
+                                   vector unsigned char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV, __b,
+                                      (vector unsigned char)__a);
+}
+
+static int __ATTRS_o_ai vec_any_lt(vector bool char __a, vector bool char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV, (vector unsigned char)__b,
+                                      (vector unsigned char)__a);
+}
+
+static int __ATTRS_o_ai vec_any_lt(vector short __a, vector short __b) {
+  return __builtin_altivec_vcmpgtsh_p(__CR6_EQ_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai vec_any_lt(vector short __a, vector bool short __b) {
+  return __builtin_altivec_vcmpgtsh_p(__CR6_EQ_REV, (vector short)__b, __a);
+}
+
+static int __ATTRS_o_ai vec_any_lt(vector unsigned short __a,
+                                   vector unsigned short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai vec_any_lt(vector unsigned short __a,
+                                   vector bool short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV, (vector unsigned short)__b,
+                                      __a);
+}
+
+static int __ATTRS_o_ai vec_any_lt(vector bool short __a, vector short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV, (vector unsigned short)__b,
+                                      (vector unsigned short)__a);
+}
+
+static int __ATTRS_o_ai vec_any_lt(vector bool short __a,
+                                   vector unsigned short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV, __b,
+                                      (vector unsigned short)__a);
+}
+
+static int __ATTRS_o_ai vec_any_lt(vector bool short __a,
+                                   vector bool short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV, (vector unsigned short)__b,
+                                      (vector unsigned short)__a);
+}
+
+static int __ATTRS_o_ai vec_any_lt(vector int __a, vector int __b) {
+  return __builtin_altivec_vcmpgtsw_p(__CR6_EQ_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai vec_any_lt(vector int __a, vector bool int __b) {
+  return __builtin_altivec_vcmpgtsw_p(__CR6_EQ_REV, (vector int)__b, __a);
+}
+
+static int __ATTRS_o_ai vec_any_lt(vector unsigned int __a,
+                                   vector unsigned int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai vec_any_lt(vector unsigned int __a,
+                                   vector bool int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV, (vector unsigned int)__b,
+                                      __a);
+}
+
+static int __ATTRS_o_ai vec_any_lt(vector bool int __a, vector int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV, (vector unsigned int)__b,
+                                      (vector unsigned int)__a);
+}
+
+static int __ATTRS_o_ai vec_any_lt(vector bool int __a,
+                                   vector unsigned int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV, __b,
+                                      (vector unsigned int)__a);
+}
+
+static int __ATTRS_o_ai vec_any_lt(vector bool int __a, vector bool int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV, (vector unsigned int)__b,
+                                      (vector unsigned int)__a);
+}
+
+#ifdef __POWER8_VECTOR__
+static int __ATTRS_o_ai vec_any_lt(vector signed long long __a,
+                                   vector signed long long __b) {
+  return __builtin_altivec_vcmpgtsd_p(__CR6_EQ_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai vec_any_lt(vector unsigned long long __a,
+                                   vector unsigned long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV, __b, __a);
+}
+
+static int __ATTRS_o_ai vec_any_lt(vector signed long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpgtsd_p(__CR6_EQ_REV,
+                                      (vector signed long long)__b, __a);
+}
+
+static int __ATTRS_o_ai vec_any_lt(vector unsigned long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV,
+                                      (vector unsigned long long)__b, __a);
+}
+
+static int __ATTRS_o_ai vec_any_lt(vector bool long long __a,
+                                   vector signed long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV,
+                                      (vector unsigned long long)__b,
+                                      (vector unsigned long long)__a);
+}
+
+static int __ATTRS_o_ai vec_any_lt(vector bool long long __a,
+                                   vector unsigned long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV, __b,
+                                      (vector unsigned long long)__a);
+}
+
+static int __ATTRS_o_ai vec_any_lt(vector bool long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV,
+                                      (vector unsigned long long)__b,
+                                      (vector unsigned long long)__a);
+}
+#endif
+
+static int __ATTRS_o_ai vec_any_lt(vector float __a, vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpgtsp_p(__CR6_EQ_REV, __b, __a);
+#else
+  return __builtin_altivec_vcmpgtfp_p(__CR6_EQ_REV, __b, __a);
+#endif
+}
+
+#ifdef __VSX__
+static int __ATTRS_o_ai vec_any_lt(vector double __a, vector double __b) {
+  return __builtin_vsx_xvcmpgtdp_p(__CR6_EQ_REV, __b, __a);
+}
+#endif
+
+/* vec_any_nan */
+
+static int __attribute__((__always_inline__)) vec_any_nan(vector float __a) {
+  return __builtin_altivec_vcmpeqfp_p(__CR6_LT_REV, __a, __a);
+}
+
+/* vec_any_ne */
+
+static int __ATTRS_o_ai vec_any_ne(vector signed char __a,
+                                   vector signed char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_LT_REV, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static int __ATTRS_o_ai vec_any_ne(vector signed char __a,
+                                   vector bool char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_LT_REV, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static int __ATTRS_o_ai vec_any_ne(vector unsigned char __a,
+                                   vector unsigned char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_LT_REV, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static int __ATTRS_o_ai vec_any_ne(vector unsigned char __a,
+                                   vector bool char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_LT_REV, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static int __ATTRS_o_ai vec_any_ne(vector bool char __a,
+                                   vector signed char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_LT_REV, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static int __ATTRS_o_ai vec_any_ne(vector bool char __a,
+                                   vector unsigned char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_LT_REV, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static int __ATTRS_o_ai vec_any_ne(vector bool char __a, vector bool char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_LT_REV, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static int __ATTRS_o_ai vec_any_ne(vector short __a, vector short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_LT_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_any_ne(vector short __a, vector bool short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_LT_REV, __a, (vector short)__b);
+}
+
+static int __ATTRS_o_ai vec_any_ne(vector unsigned short __a,
+                                   vector unsigned short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_LT_REV, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai vec_any_ne(vector unsigned short __a,
+                                   vector bool short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_LT_REV, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai vec_any_ne(vector bool short __a, vector short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_LT_REV, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai vec_any_ne(vector bool short __a,
+                                   vector unsigned short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_LT_REV, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai vec_any_ne(vector bool short __a,
+                                   vector bool short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_LT_REV, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai vec_any_ne(vector pixel __a, vector pixel __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_LT_REV, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static int __ATTRS_o_ai vec_any_ne(vector int __a, vector int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_LT_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_any_ne(vector int __a, vector bool int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_LT_REV, __a, (vector int)__b);
+}
+
+static int __ATTRS_o_ai vec_any_ne(vector unsigned int __a,
+                                   vector unsigned int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_LT_REV, (vector int)__a,
+                                      (vector int)__b);
+}
+
+static int __ATTRS_o_ai vec_any_ne(vector unsigned int __a,
+                                   vector bool int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_LT_REV, (vector int)__a,
+                                      (vector int)__b);
+}
+
+static int __ATTRS_o_ai vec_any_ne(vector bool int __a, vector int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_LT_REV, (vector int)__a,
+                                      (vector int)__b);
+}
+
+static int __ATTRS_o_ai vec_any_ne(vector bool int __a,
+                                   vector unsigned int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_LT_REV, (vector int)__a,
+                                      (vector int)__b);
+}
+
+static int __ATTRS_o_ai vec_any_ne(vector bool int __a, vector bool int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_LT_REV, (vector int)__a,
+                                      (vector int)__b);
+}
+
+#ifdef __POWER8_VECTOR__
+static int __ATTRS_o_ai vec_any_ne(vector signed long long __a,
+                                   vector signed long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_LT_REV, __a, __b);
+}
+
+static int __ATTRS_o_ai vec_any_ne(vector unsigned long long __a,
+                                   vector unsigned long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_LT_REV, (vector long long)__a,
+                                      (vector long long)__b);
+}
+
+static int __ATTRS_o_ai vec_any_ne(vector signed long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_LT_REV, __a,
+                                      (vector signed long long)__b);
+}
+
+static int __ATTRS_o_ai vec_any_ne(vector unsigned long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpequd_p(
+      __CR6_LT_REV, (vector signed long long)__a, (vector signed long long)__b);
+}
+
+static int __ATTRS_o_ai vec_any_ne(vector bool long long __a,
+                                   vector signed long long __b) {
+  return __builtin_altivec_vcmpequd_p(
+      __CR6_LT_REV, (vector signed long long)__a, (vector signed long long)__b);
+}
+
+static int __ATTRS_o_ai vec_any_ne(vector bool long long __a,
+                                   vector unsigned long long __b) {
+  return __builtin_altivec_vcmpequd_p(
+      __CR6_LT_REV, (vector signed long long)__a, (vector signed long long)__b);
+}
+
+static int __ATTRS_o_ai vec_any_ne(vector bool long long __a,
+                                   vector bool long long __b) {
+  return __builtin_altivec_vcmpequd_p(
+      __CR6_LT_REV, (vector signed long long)__a, (vector signed long long)__b);
+}
+#endif
+
+static int __ATTRS_o_ai vec_any_ne(vector float __a, vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpeqsp_p(__CR6_LT_REV, __a, __b);
+#else
+  return __builtin_altivec_vcmpeqfp_p(__CR6_LT_REV, __a, __b);
+#endif
+}
+
+#ifdef __VSX__
+static int __ATTRS_o_ai vec_any_ne(vector double __a, vector double __b) {
+  return __builtin_vsx_xvcmpeqdp_p(__CR6_LT_REV, __a, __b);
+}
+#endif
+
+/* vec_any_nge */
+
+static int __attribute__((__always_inline__))
+vec_any_nge(vector float __a, vector float __b) {
+  return __builtin_altivec_vcmpgefp_p(__CR6_LT_REV, __a, __b);
+}
+
+/* vec_any_ngt */
+
+static int __attribute__((__always_inline__))
+vec_any_ngt(vector float __a, vector float __b) {
+  return __builtin_altivec_vcmpgtfp_p(__CR6_LT_REV, __a, __b);
+}
+
+/* vec_any_nle */
+
+static int __attribute__((__always_inline__))
+vec_any_nle(vector float __a, vector float __b) {
+  return __builtin_altivec_vcmpgefp_p(__CR6_LT_REV, __b, __a);
+}
+
+/* vec_any_nlt */
+
+static int __attribute__((__always_inline__))
+vec_any_nlt(vector float __a, vector float __b) {
+  return __builtin_altivec_vcmpgtfp_p(__CR6_LT_REV, __b, __a);
+}
+
+/* vec_any_numeric */
+
+static int __attribute__((__always_inline__))
+vec_any_numeric(vector float __a) {
+  return __builtin_altivec_vcmpeqfp_p(__CR6_EQ_REV, __a, __a);
+}
+
+/* vec_any_out */
+
+static int __attribute__((__always_inline__))
+vec_any_out(vector float __a, vector float __b) {
+  return __builtin_altivec_vcmpbfp_p(__CR6_EQ_REV, __a, __b);
+}
+
+/* Power 8 Crypto functions
+Note: We diverge from the current GCC implementation with regard
+to cryptography and related functions as follows:
+- Only the SHA and AES instructions and builtins are disabled by -mno-crypto
+- The remaining ones are only available on Power8 and up so
+  require -mpower8-vector
+The justification for this is that export requirements mandate that
+Category:Vector.Crypto be optional (i.e. compliant hardware may not provide
+support). As a result, we need to be able to turn off support for those.
+The remaining ones (currently controlled by -mcrypto for GCC) still
+need to be provided on compliant hardware even if Vector.Crypto is not
+provided.
+*/
+#ifdef __CRYPTO__
+#define vec_sbox_be __builtin_altivec_crypto_vsbox
+#define vec_cipher_be __builtin_altivec_crypto_vcipher
+#define vec_cipherlast_be __builtin_altivec_crypto_vcipherlast
+#define vec_ncipher_be __builtin_altivec_crypto_vncipher
+#define vec_ncipherlast_be __builtin_altivec_crypto_vncipherlast
+
+static vector unsigned long long __attribute__((__always_inline__))
+__builtin_crypto_vsbox(vector unsigned long long __a) {
+  return __builtin_altivec_crypto_vsbox(__a);
+}
+
+static vector unsigned long long __attribute__((__always_inline__))
+__builtin_crypto_vcipher(vector unsigned long long __a,
+                         vector unsigned long long __b) {
+  return __builtin_altivec_crypto_vcipher(__a, __b);
+}
+
+static vector unsigned long long __attribute__((__always_inline__))
+__builtin_crypto_vcipherlast(vector unsigned long long __a,
+                             vector unsigned long long __b) {
+  return __builtin_altivec_crypto_vcipherlast(__a, __b);
+}
+
+static vector unsigned long long __attribute__((__always_inline__))
+__builtin_crypto_vncipher(vector unsigned long long __a,
+                          vector unsigned long long __b) {
+  return __builtin_altivec_crypto_vncipher(__a, __b);
+}
+
+static vector unsigned long long __attribute__((__always_inline__))
+__builtin_crypto_vncipherlast(vector unsigned long long __a,
+                              vector unsigned long long __b) {
+  return __builtin_altivec_crypto_vncipherlast(__a, __b);
+}
+
+#define __builtin_crypto_vshasigmad __builtin_altivec_crypto_vshasigmad
+#define __builtin_crypto_vshasigmaw __builtin_altivec_crypto_vshasigmaw
+
+#define vec_shasigma_be(X, Y, Z) \
+  _Generic((X), vector unsigned int: __builtin_crypto_vshasigmaw, \
+                vector unsigned long long: __builtin_crypto_vshasigmad) \
+((X), (Y), (Z))
+#endif
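+
+/* Usage sketch (illustrative only; `w` and `d` are hypothetical SHA state
+ * vectors, and -mcrypto is assumed to be enabled): vec_shasigma_be picks the
+ * word or doubleword builtin via _Generic on its first argument.
+ *
+ *   vector unsigned int w;            // SHA-256 working state
+ *   vector unsigned long long d;      // SHA-512 working state
+ *   vector unsigned int s32 = vec_shasigma_be(w, 1, 0xf);       // vshasigmaw
+ *   vector unsigned long long s64 = vec_shasigma_be(d, 1, 0xf); // vshasigmad
+ */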
+
+#ifdef __POWER8_VECTOR__
+static vector unsigned char __ATTRS_o_ai
+__builtin_crypto_vpermxor(vector unsigned char __a, vector unsigned char __b,
+                          vector unsigned char __c) {
+  return __builtin_altivec_crypto_vpermxor(__a, __b, __c);
+}
+
+static vector unsigned short __ATTRS_o_ai
+__builtin_crypto_vpermxor(vector unsigned short __a, vector unsigned short __b,
+                          vector unsigned short __c) {
+  return (vector unsigned short)__builtin_altivec_crypto_vpermxor(
+      (vector unsigned char)__a, (vector unsigned char)__b,
+      (vector unsigned char)__c);
+}
+
+static vector unsigned int __ATTRS_o_ai __builtin_crypto_vpermxor(
+    vector unsigned int __a, vector unsigned int __b, vector unsigned int __c) {
+  return (vector unsigned int)__builtin_altivec_crypto_vpermxor(
+      (vector unsigned char)__a, (vector unsigned char)__b,
+      (vector unsigned char)__c);
+}
+
+static vector unsigned long long __ATTRS_o_ai __builtin_crypto_vpermxor(
+    vector unsigned long long __a, vector unsigned long long __b,
+    vector unsigned long long __c) {
+  return (vector unsigned long long)__builtin_altivec_crypto_vpermxor(
+      (vector unsigned char)__a, (vector unsigned char)__b,
+      (vector unsigned char)__c);
+}
+
+static vector unsigned char __ATTRS_o_ai
+__builtin_crypto_vpmsumb(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_altivec_crypto_vpmsumb(__a, __b);
+}
+
+static vector unsigned short __ATTRS_o_ai
+__builtin_crypto_vpmsumb(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_altivec_crypto_vpmsumh(__a, __b);
+}
+
+static vector unsigned int __ATTRS_o_ai
+__builtin_crypto_vpmsumb(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_altivec_crypto_vpmsumw(__a, __b);
+}
+
+static vector unsigned long long __ATTRS_o_ai
+__builtin_crypto_vpmsumb(vector unsigned long long __a,
+                         vector unsigned long long __b) {
+  return __builtin_altivec_crypto_vpmsumd(__a, __b);
+}
+
+static vector signed char __ATTRS_o_ai vec_vgbbd (vector signed char __a)
+{
+  return __builtin_altivec_vgbbd((vector unsigned char) __a);
+}
+
+#define vec_pmsum_be __builtin_crypto_vpmsumb
+#define vec_gb __builtin_altivec_vgbbd
+
+static vector unsigned char __ATTRS_o_ai vec_vgbbd (vector unsigned char __a)
+{
+  return __builtin_altivec_vgbbd(__a);
+}
+
+static vector long long __ATTRS_o_ai
+vec_vbpermq (vector signed char __a, vector signed char __b)
+{
+  return __builtin_altivec_vbpermq((vector unsigned char) __a,
+                                   (vector unsigned char) __b);
+}
+
+static vector long long __ATTRS_o_ai
+vec_vbpermq (vector unsigned char __a, vector unsigned char __b)
+{
+  return __builtin_altivec_vbpermq(__a, __b);
+}
+
+#ifdef __powerpc64__
+static vector unsigned long long __attribute__((__always_inline__))
+vec_bperm (vector unsigned __int128 __a, vector unsigned char __b) {
+  return __builtin_altivec_vbpermq((vector unsigned char) __a,
+                                   (vector unsigned char) __b);
+}
+#endif
+#endif
+
+#undef __ATTRS_o_ai
+
+#endif /* __ALTIVEC_H */
diff --git a/25.0.2/clang-include/ammintrin.h b/25.0.2/clang-include/ammintrin.h
new file mode 100644
index 0000000..4880fd7
--- /dev/null
+++ b/25.0.2/clang-include/ammintrin.h
@@ -0,0 +1,209 @@
+/*===---- ammintrin.h - SSE4a intrinsics -----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __AMMINTRIN_H
+#define __AMMINTRIN_H
+
+#include <pmmintrin.h>
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4a")))
+
+/// \brief Extracts the specified bits from the lower 64 bits of the 128-bit
+///    integer vector operand at the index idx and of the length len.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_extracti_si64(__m128i x, const int len, const int idx);
+/// \endcode
+///
+/// \code
+/// This intrinsic corresponds to the \c EXTRQ instruction.
+/// \endcode
+///
+/// \param x
+///    The value from which bits are extracted.
+/// \param len
+///    Bits [5:0] specify the length; the other bits are ignored. If bits [5:0]
+///    are zero, the length is interpreted as 64.
+/// \param idx
+///    Bits [5:0] specify the index of the least significant bit; the other
+///    bits are ignored. If the sum of the index and length is greater than
+///    64, the result is undefined. If the length and index are both zero,
+///    bits [63:0] of parameter x are extracted. If the length is zero
+///    but the index is non-zero, the result is undefined.
+/// \returns A 128-bit integer vector whose lower 64 bits contain the bits
+///    extracted from the source operand.
+#define _mm_extracti_si64(x, len, idx) \
+  ((__m128i)__builtin_ia32_extrqi((__v2di)(__m128i)(x), \
+                                  (char)(len), (char)(idx)))
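+
+/* Usage sketch (illustrative only; the value below is hypothetical): extract
+ * the 16 bits starting at bit 8 of the lower quadword.  Both immediates must
+ * be compile-time constants in this macro form.
+ *
+ *   __m128i v  = _mm_set_epi64x(0, 0x1122334455667788LL);
+ *   __m128i lo = _mm_extracti_si64(v, 16, 8);   // lower 64 bits hold 0x6677
+ */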
+
+/// \brief Extracts the specified bits from the lower 64 bits of the 128-bit
+///    integer vector operand at the index and of the length specified by __y.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// This intrinsic corresponds to the \c EXTRQ instruction.
+/// \endcode
+///
+/// \param __x
+///    The value from which bits are extracted.
+/// \param __y
+///    Specifies the index of the least significant bit at [13:8]
+///    and the length at [5:0]; all other bits are ignored.
+///    If bits [5:0] are zero, the length is interpreted as 64.
+///    If the sum of the index and length is greater than 64, the result is
+///    undefined. If the length and index are both zero, bits [63:0] of
+///    parameter __x are extracted. If the length is zero but the index is
+///    non-zero, the result is undefined.
+/// \returns A 128-bit vector whose lower 64 bits contain the bits extracted
+///    from the source operand.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_extract_si64(__m128i __x, __m128i __y)
+{
+  return (__m128i)__builtin_ia32_extrq((__v2di)__x, (__v16qi)__y);
+}
+
+/// \brief Inserts bits of a specified length from the source integer vector
+///    y into the lower 64 bits of the destination integer vector x at the
+///    index idx and of the length len.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_inserti_si64(__m128i x, __m128i y, const int len,
+/// const int idx);
+/// \endcode
+///
+/// \code
+/// This intrinsic corresponds to the \c INSERTQ instruction.
+/// \endcode
+///
+/// \param x
+///    The destination operand where bits will be inserted. The inserted bits
+///    are defined by the length len and by the index idx specifying the least
+///    significant bit.
+/// \param y
+///    The source operand containing the bits to be extracted. The extracted
+///    bits are the least significant bits of operand y of length len.
+/// \param len
+///    Bits [5:0] specify the length; the other bits are ignored. If bits [5:0]
+///    are zero, the length is interpreted as 64.
+/// \param idx
+///    Bits [5:0] specify the index of the least significant bit; the other
+///    bits are ignored. If the sum of the index and length is greater than
+///    64, the result is undefined. If the length and index are both zero,
+///    bits [63:0] of parameter y are inserted into parameter x. If the
+///    length is zero but the index is non-zero, the result is undefined.
+/// \returns A 128-bit integer vector containing the original lower 64-bits
+///    of destination operand x with the specified bitfields replaced by the
+///    lower bits of source operand y. The upper 64 bits of the return value
+///    are undefined.
+
+#define _mm_inserti_si64(x, y, len, idx) \
+  ((__m128i)__builtin_ia32_insertqi((__v2di)(__m128i)(x), \
+                                    (__v2di)(__m128i)(y), \
+                                    (char)(len), (char)(idx)))
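+
+/* Usage sketch (illustrative only; x and y are hypothetical vectors): insert
+ * the low 8 bits of y into x at bit offset 16 of the lower quadword, leaving
+ * the surrounding bits of x unchanged.
+ *
+ *   __m128i merged = _mm_inserti_si64(x, y, 8, 16);
+ */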
+
+/// \brief Inserts bits of a specified length from the source integer vector
+///    __y into the lower 64 bits of the destination integer vector __x at
+///    the index and of the length specified by __y.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// This intrinsic corresponds to the \c INSERTQ instruction.
+/// \endcode
+///
+/// \param __x
+///    The destination operand where bits will be inserted. The inserted bits
+///    are defined by the length and by the index of the least significant bit
+///    specified by operand __y.
+/// \param __y
+///    The source operand containing the bits to be extracted. The extracted
+///    bits are the least significant bits of operand __y with length specified
+///    by bits [69:64]. These are inserted into the destination at the index
+///    specified by bits [77:72]; all other bits are ignored.
+///    If bits [69:64] are zero, the length is interpreted as 64.
+///    If the sum of the index and length is greater than 64, the result is
+///    undefined. If the length and index are both zero, bits [63:0] of
+///    parameter __y are inserted into parameter __x. If the length
+///    is zero but the index is non-zero, the result is undefined.
+/// \returns A 128-bit integer vector containing the original lower 64-bits
+///    of destination operand __x with the specified bitfields replaced by the
+///    lower bits of source operand __y. The upper 64 bits of the return value
+///    are undefined.
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_insert_si64(__m128i __x, __m128i __y)
+{
+  return (__m128i)__builtin_ia32_insertq((__v2di)__x, (__v2di)__y);
+}
+
+/// \brief Stores a 64-bit double-precision value in a 64-bit memory location.
+///    To minimize caching, the data is flagged as non-temporal (unlikely to be
+///    used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// This intrinsic corresponds to the \c MOVNTSD instruction.
+/// \endcode
+///
+/// \param __p
+///    The 64-bit memory location used to store the register value.
+/// \param __a
+///    The 64-bit double-precision floating-point register value to
+///    be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_stream_sd(double *__p, __m128d __a)
+{
+  __builtin_ia32_movntsd(__p, (__v2df)__a);
+}
+
+/// \brief Stores a 32-bit single-precision floating-point value in a 32-bit
+///    memory location. To minimize caching, the data is flagged as
+///    non-temporal (unlikely to be used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// This intrinsic corresponds to the \c MOVNTSS instruction.
+/// \endcode
+///
+/// \param __p
+///    The 32-bit memory location used to store the register value.
+/// \param __a
+///    The 32-bit single-precision floating-point register value to
+///    be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_stream_ss(float *__p, __m128 __a)
+{
+  __builtin_ia32_movntss(__p, (__v4sf)__a);
+}
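+
+/* Usage sketch (illustrative only; src, dst and n are hypothetical): stream
+ * floats to memory with a non-temporal hint, then order the stores with an
+ * sfence before the data is read elsewhere.
+ *
+ *   for (int i = 0; i < n; ++i)
+ *     _mm_stream_ss(&dst[i], _mm_set_ss(src[i]));
+ *   _mm_sfence();
+ */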
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __AMMINTRIN_H */
diff --git a/25.0.2/clang-include/arm_acle.h b/25.0.2/clang-include/arm_acle.h
new file mode 100644
index 0000000..4be1d09
--- /dev/null
+++ b/25.0.2/clang-include/arm_acle.h
@@ -0,0 +1,308 @@
+/*===---- arm_acle.h - ARM Non-Neon intrinsics -----------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __ARM_ACLE_H
+#define __ARM_ACLE_H
+
+#ifndef __ARM_ACLE
+#error "ACLE intrinsics support not enabled."
+#endif
+
+#include <stdint.h>
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/* 8 SYNCHRONIZATION, BARRIER AND HINT INTRINSICS */
+/* 8.3 Memory barriers */
+#if !defined(_MSC_VER)
+#define __dmb(i) __builtin_arm_dmb(i)
+#define __dsb(i) __builtin_arm_dsb(i)
+#define __isb(i) __builtin_arm_isb(i)
+#endif
+
+/* 8.4 Hints */
+
+#if !defined(_MSC_VER)
+static __inline__ void __attribute__((__always_inline__, __nodebug__)) __wfi(void) {
+  __builtin_arm_wfi();
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__)) __wfe(void) {
+  __builtin_arm_wfe();
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__)) __sev(void) {
+  __builtin_arm_sev();
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__)) __sevl(void) {
+  __builtin_arm_sevl();
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__)) __yield(void) {
+  __builtin_arm_yield();
+}
+#endif
+
+#if __ARM_32BIT_STATE
+#define __dbg(t) __builtin_arm_dbg(t)
+#endif
+
+/* 8.5 Swap */
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+  __swp(uint32_t x, volatile uint32_t *p) {
+  uint32_t v;
+  do v = __builtin_arm_ldrex(p); while (__builtin_arm_strex(x, p));
+  return v;
+}
+
+/* 8.6 Memory prefetch intrinsics */
+/* 8.6.1 Data prefetch */
+#define __pld(addr) __pldx(0, 0, 0, addr)
+
+#if __ARM_32BIT_STATE
+#define __pldx(access_kind, cache_level, retention_policy, addr) \
+  __builtin_arm_prefetch(addr, access_kind, 1)
+#else
+#define __pldx(access_kind, cache_level, retention_policy, addr) \
+  __builtin_arm_prefetch(addr, access_kind, cache_level, retention_policy, 1)
+#endif
+
+/* 8.6.2 Instruction prefetch */
+#define __pli(addr) __plix(0, 0, addr)
+
+#if __ARM_32BIT_STATE
+#define __plix(cache_level, retention_policy, addr) \
+  __builtin_arm_prefetch(addr, 0, 0)
+#else
+#define __plix(cache_level, retention_policy, addr) \
+  __builtin_arm_prefetch(addr, 0, cache_level, retention_policy, 0)
+#endif
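+
+/* Usage sketch (illustrative only; p is a hypothetical pointer): __pld(p)
+ * issues a plain read prefetch, while __pldx lets the access kind, cache
+ * level and retention policy be specified (the latter two are ignored in
+ * 32-bit state, as above).
+ *
+ *   __pld(p);            // prefetch for read
+ *   __pldx(1, 0, 0, p);  // prefetch for write, L1, temporal
+ */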
+
+/* 8.7 NOP */
+static __inline__ void __attribute__((__always_inline__, __nodebug__)) __nop(void) {
+  __builtin_arm_nop();
+}
+
+/* 9 DATA-PROCESSING INTRINSICS */
+/* 9.2 Miscellaneous data-processing intrinsics */
+/* ROR */
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+  __ror(uint32_t x, uint32_t y) {
+  y %= 32;
+  if (y == 0)  return x;
+  return (x >> y) | (x << (32 - y));
+}
+
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
+  __rorll(uint64_t x, uint32_t y) {
+  y %= 64;
+  if (y == 0)  return x;
+  return (x >> y) | (x << (64 - y));
+}
+
+static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
+  __rorl(unsigned long x, uint32_t y) {
+#if __SIZEOF_LONG__ == 4
+  return __ror(x, y);
+#else
+  return __rorll(x, y);
+#endif
+}
+
+
+/* CLZ */
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+  __clz(uint32_t t) {
+  return __builtin_clz(t);
+}
+
+static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
+  __clzl(unsigned long t) {
+  return __builtin_clzl(t);
+}
+
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
+  __clzll(uint64_t t) {
+  return __builtin_clzll(t);
+}
+
+/* REV */
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+  __rev(uint32_t t) {
+  return __builtin_bswap32(t);
+}
+
+static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
+  __revl(unsigned long t) {
+#if __SIZEOF_LONG__ == 4
+  return __builtin_bswap32(t);
+#else
+  return __builtin_bswap64(t);
+#endif
+}
+
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
+  __revll(uint64_t t) {
+  return __builtin_bswap64(t);
+}
+
+/* REV16 */
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+  __rev16(uint32_t t) {
+  return __ror(__rev(t), 16);
+}
+
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
+  __rev16ll(uint64_t t) {
+  return (((uint64_t)__rev16(t >> 32)) << 32) | __rev16(t);
+}
+
+static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
+  __rev16l(unsigned long t) {
+#if __SIZEOF_LONG__ == 4
+    return __rev16(t);
+#else
+    return __rev16ll(t);
+#endif
+}
+
+/* REVSH */
+static __inline__ int16_t __attribute__((__always_inline__, __nodebug__))
+  __revsh(int16_t t) {
+  return __builtin_bswap16(t);
+}
+
+/* RBIT */
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+  __rbit(uint32_t t) {
+  return __builtin_arm_rbit(t);
+}
+
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
+  __rbitll(uint64_t t) {
+#if __ARM_32BIT_STATE
+  return (((uint64_t) __builtin_arm_rbit(t)) << 32) |
+    __builtin_arm_rbit(t >> 32);
+#else
+  return __builtin_arm_rbit64(t);
+#endif
+}
+
+static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
+  __rbitl(unsigned long t) {
+#if __SIZEOF_LONG__ == 4
+  return __rbit(t);
+#else
+  return __rbitll(t);
+#endif
+}
+
+/*
+ * 9.4 Saturating intrinsics
+ *
+ * FIXME: Change the guard to its corresponding __ARM_FEATURE flag when the
+ * Q-flag intrinsics are implemented and the flag is enabled.
+ */
+/* 9.4.1 Width-specified saturation intrinsics */
+#if __ARM_32BIT_STATE
+#define __ssat(x, y) __builtin_arm_ssat(x, y)
+#define __usat(x, y) __builtin_arm_usat(x, y)
+#endif
+
+/* 9.4.2 Saturating addition and subtraction intrinsics */
+#if __ARM_32BIT_STATE
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+  __qadd(int32_t t, int32_t v) {
+  return __builtin_arm_qadd(t, v);
+}
+
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+  __qsub(int32_t t, int32_t v) {
+  return __builtin_arm_qsub(t, v);
+}
+
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__qdbl(int32_t t) {
+  return __builtin_arm_qadd(t, t);
+}
+#endif
+
+/* 9.7 CRC32 intrinsics */
+#if __ARM_FEATURE_CRC32
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+  __crc32b(uint32_t a, uint8_t b) {
+  return __builtin_arm_crc32b(a, b);
+}
+
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+  __crc32h(uint32_t a, uint16_t b) {
+  return __builtin_arm_crc32h(a, b);
+}
+
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+  __crc32w(uint32_t a, uint32_t b) {
+  return __builtin_arm_crc32w(a, b);
+}
+
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+  __crc32d(uint32_t a, uint64_t b) {
+  return __builtin_arm_crc32d(a, b);
+}
+
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+  __crc32cb(uint32_t a, uint8_t b) {
+  return __builtin_arm_crc32cb(a, b);
+}
+
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+  __crc32ch(uint32_t a, uint16_t b) {
+  return __builtin_arm_crc32ch(a, b);
+}
+
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+  __crc32cw(uint32_t a, uint32_t b) {
+  return __builtin_arm_crc32cw(a, b);
+}
+
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+  __crc32cd(uint32_t a, uint64_t b) {
+  return __builtin_arm_crc32cd(a, b);
+}
+#endif
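+
+/* Usage sketch (illustrative only; p and n are hypothetical): fold a byte
+ * buffer into a running CRC-32C using the customary all-ones seed and final
+ * inversion.
+ *
+ *   uint32_t crc = 0xFFFFFFFFu;
+ *   for (size_t i = 0; i < n; ++i)
+ *     crc = __crc32cb(crc, p[i]);
+ *   crc = ~crc;
+ */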
+
+/* 10.1 Special register intrinsics */
+#define __arm_rsr(sysreg) __builtin_arm_rsr(sysreg)
+#define __arm_rsr64(sysreg) __builtin_arm_rsr64(sysreg)
+#define __arm_rsrp(sysreg) __builtin_arm_rsrp(sysreg)
+#define __arm_wsr(sysreg, v) __builtin_arm_wsr(sysreg, v)
+#define __arm_wsr64(sysreg, v) __builtin_arm_wsr64(sysreg, v)
+#define __arm_wsrp(sysreg, v) __builtin_arm_wsrp(sysreg, v)
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /* __ARM_ACLE_H */
diff --git a/25.0.2/clang-include/avx2intrin.h b/25.0.2/clang-include/avx2intrin.h
new file mode 100644
index 0000000..f786572
--- /dev/null
+++ b/25.0.2/clang-include/avx2intrin.h
@@ -0,0 +1,1215 @@
+/*===---- avx2intrin.h - AVX2 intrinsics -----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <avx2intrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX2INTRIN_H
+#define __AVX2INTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx2")))
+
+/* SSE4 Multiple Packed Sums of Absolute Difference.  */
+#define _mm256_mpsadbw_epu8(X, Y, M) __builtin_ia32_mpsadbw256((X), (Y), (M))
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_abs_epi8(__m256i __a)
+{
+    return (__m256i)__builtin_ia32_pabsb256((__v32qi)__a);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_abs_epi16(__m256i __a)
+{
+    return (__m256i)__builtin_ia32_pabsw256((__v16hi)__a);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_abs_epi32(__m256i __a)
+{
+    return (__m256i)__builtin_ia32_pabsd256((__v8si)__a);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_packs_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_packsswb256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_packs_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_packssdw256((__v8si)__a, (__v8si)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_packus_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_packuswb256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_packus_epi32(__m256i __V1, __m256i __V2)
+{
+  return (__m256i) __builtin_ia32_packusdw256((__v8si)__V1, (__v8si)__V2);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_add_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v32qi)__a + (__v32qi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_add_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v16hi)__a + (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_add_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v8si)__a + (__v8si)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_add_epi64(__m256i __a, __m256i __b)
+{
+  return __a + __b;
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_adds_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_paddsb256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_adds_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_paddsw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_adds_epu8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_paddusb256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_adds_epu16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_paddusw256((__v16hi)__a, (__v16hi)__b);
+}
+
+#define _mm256_alignr_epi8(a, b, n) __extension__ ({        \
+  (__m256i)__builtin_ia32_palignr256((__v32qi)(__m256i)(a), \
+                                     (__v32qi)(__m256i)(b), (n)); })
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_and_si256(__m256i __a, __m256i __b)
+{
+  return __a & __b;
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_andnot_si256(__m256i __a, __m256i __b)
+{
+  return ~__a & __b;
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_avg_epu8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pavgb256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_avg_epu16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pavgw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M)
+{
+  return (__m256i)__builtin_ia32_pblendvb256((__v32qi)__V1, (__v32qi)__V2,
+                                              (__v32qi)__M);
+}
+
+#define _mm256_blend_epi16(V1, V2, M) __extension__ ({       \
+  (__m256i)__builtin_shufflevector((__v16hi)(__m256i)(V1),   \
+                                   (__v16hi)(__m256i)(V2),   \
+                                   (((M) & 0x01) ? 16 : 0),  \
+                                   (((M) & 0x02) ? 17 : 1),  \
+                                   (((M) & 0x04) ? 18 : 2),  \
+                                   (((M) & 0x08) ? 19 : 3),  \
+                                   (((M) & 0x10) ? 20 : 4),  \
+                                   (((M) & 0x20) ? 21 : 5),  \
+                                   (((M) & 0x40) ? 22 : 6),  \
+                                   (((M) & 0x80) ? 23 : 7),  \
+                                   (((M) & 0x01) ? 24 : 8),  \
+                                   (((M) & 0x02) ? 25 : 9),  \
+                                   (((M) & 0x04) ? 26 : 10), \
+                                   (((M) & 0x08) ? 27 : 11), \
+                                   (((M) & 0x10) ? 28 : 12), \
+                                   (((M) & 0x20) ? 29 : 13), \
+                                   (((M) & 0x40) ? 30 : 14), \
+                                   (((M) & 0x80) ? 31 : 15)); })
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cmpeq_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v32qi)__a == (__v32qi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cmpeq_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v16hi)__a == (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cmpeq_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v8si)__a == (__v8si)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cmpeq_epi64(__m256i __a, __m256i __b)
+{
+  return (__m256i)(__a == __b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cmpgt_epi8(__m256i __a, __m256i __b)
+{
+  /* This function always performs a signed comparison, but __v32qi is a char
+     which may be signed or unsigned, so use __v32qs. */
+  return (__m256i)((__v32qs)__a > (__v32qs)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cmpgt_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v16hi)__a > (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cmpgt_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v8si)__a > (__v8si)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cmpgt_epi64(__m256i __a, __m256i __b)
+{
+  return (__m256i)(__a > __b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_hadd_epi16(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_phaddw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_hadd_epi32(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_hadds_epi16(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_hsub_epi16(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_phsubw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_hsub_epi32(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_hsubs_epi16(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maddubs_epi16(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_pmaddubsw256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_madd_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmaddwd256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_max_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmaxsb256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_max_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmaxsw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_max_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmaxsd256((__v8si)__a, (__v8si)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_max_epu8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmaxub256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_max_epu16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmaxuw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_max_epu32(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmaxud256((__v8si)__a, (__v8si)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_min_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pminsb256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_min_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pminsw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_min_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pminsd256((__v8si)__a, (__v8si)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_min_epu8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pminub256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_min_epu16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pminuw256 ((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_min_epu32(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pminud256((__v8si)__a, (__v8si)__b);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm256_movemask_epi8(__m256i __a)
+{
+  return __builtin_ia32_pmovmskb256((__v32qi)__a);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtepi8_epi16(__m128i __V)
+{
+  return (__m256i)__builtin_ia32_pmovsxbw256((__v16qi)__V);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtepi8_epi32(__m128i __V)
+{
+  return (__m256i)__builtin_ia32_pmovsxbd256((__v16qi)__V);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtepi8_epi64(__m128i __V)
+{
+  return (__m256i)__builtin_ia32_pmovsxbq256((__v16qi)__V);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtepi16_epi32(__m128i __V)
+{
+  return (__m256i)__builtin_ia32_pmovsxwd256((__v8hi)__V);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtepi16_epi64(__m128i __V)
+{
+  return (__m256i)__builtin_ia32_pmovsxwq256((__v8hi)__V);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtepi32_epi64(__m128i __V)
+{
+  return (__m256i)__builtin_ia32_pmovsxdq256((__v4si)__V);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtepu8_epi16(__m128i __V)
+{
+  return (__m256i)__builtin_ia32_pmovzxbw256((__v16qi)__V);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtepu8_epi32(__m128i __V)
+{
+  return (__m256i)__builtin_ia32_pmovzxbd256((__v16qi)__V);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtepu8_epi64(__m128i __V)
+{
+  return (__m256i)__builtin_ia32_pmovzxbq256((__v16qi)__V);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtepu16_epi32(__m128i __V)
+{
+  return (__m256i)__builtin_ia32_pmovzxwd256((__v8hi)__V);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtepu16_epi64(__m128i __V)
+{
+  return (__m256i)__builtin_ia32_pmovzxwq256((__v8hi)__V);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtepu32_epi64(__m128i __V)
+{
+  return (__m256i)__builtin_ia32_pmovzxdq256((__v4si)__V);
+}
+
+static __inline__  __m256i __DEFAULT_FN_ATTRS
+_mm256_mul_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmuldq256((__v8si)__a, (__v8si)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mulhrs_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmulhrsw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mulhi_epu16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmulhuw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mulhi_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmulhw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mullo_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v16hi)__a * (__v16hi)__b);
+}
+
+static __inline__  __m256i __DEFAULT_FN_ATTRS
+_mm256_mullo_epi32 (__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v8si)__a * (__v8si)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mul_epu32(__m256i __a, __m256i __b)
+{
+  return __builtin_ia32_pmuludq256((__v8si)__a, (__v8si)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_or_si256(__m256i __a, __m256i __b)
+{
+  return __a | __b;
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_sad_epu8(__m256i __a, __m256i __b)
+{
+  return __builtin_ia32_psadbw256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_shuffle_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pshufb256((__v32qi)__a, (__v32qi)__b);
+}
+
+#define _mm256_shuffle_epi32(a, imm) __extension__ ({ \
+  (__m256i)__builtin_shufflevector((__v8si)(__m256i)(a), \
+                                   (__v8si)_mm256_setzero_si256(), \
+                                   (imm) & 0x3, ((imm) & 0xc) >> 2, \
+                                   ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \
+                                   4 + (((imm) & 0x03) >> 0), \
+                                   4 + (((imm) & 0x0c) >> 2), \
+                                   4 + (((imm) & 0x30) >> 4), \
+                                   4 + (((imm) & 0xc0) >> 6)); })
+
+#define _mm256_shufflehi_epi16(a, imm) __extension__ ({ \
+  (__m256i)__builtin_shufflevector((__v16hi)(__m256i)(a), \
+                                   (__v16hi)_mm256_setzero_si256(), \
+                                   0, 1, 2, 3, \
+                                   4 + (((imm) & 0x03) >> 0), \
+                                   4 + (((imm) & 0x0c) >> 2), \
+                                   4 + (((imm) & 0x30) >> 4), \
+                                   4 + (((imm) & 0xc0) >> 6), \
+                                   8, 9, 10, 11, \
+                                   12 + (((imm) & 0x03) >> 0), \
+                                   12 + (((imm) & 0x0c) >> 2), \
+                                   12 + (((imm) & 0x30) >> 4), \
+                                   12 + (((imm) & 0xc0) >> 6)); })
+
+#define _mm256_shufflelo_epi16(a, imm) __extension__ ({ \
+  (__m256i)__builtin_shufflevector((__v16hi)(__m256i)(a), \
+                                   (__v16hi)_mm256_setzero_si256(), \
+                                   (imm) & 0x3,((imm) & 0xc) >> 2, \
+                                   ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \
+                                   4, 5, 6, 7, \
+                                   8 + (((imm) & 0x03) >> 0), \
+                                   8 + (((imm) & 0x0c) >> 2), \
+                                   8 + (((imm) & 0x30) >> 4), \
+                                   8 + (((imm) & 0xc0) >> 6), \
+                                   12, 13, 14, 15); })
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_sign_epi8(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_psignb256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_sign_epi16(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_psignw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_sign_epi32(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_psignd256((__v8si)__a, (__v8si)__b);
+}
+
+#define _mm256_slli_si256(a, count) __extension__ ({ \
+  (__m256i)__builtin_ia32_pslldqi256((__m256i)(a), (count)*8); })
+
+#define _mm256_bslli_epi128(a, count) _mm256_slli_si256((a), (count))
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_slli_epi16(__m256i __a, int __count)
+{
+  return (__m256i)__builtin_ia32_psllwi256((__v16hi)__a, __count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_sll_epi16(__m256i __a, __m128i __count)
+{
+  return (__m256i)__builtin_ia32_psllw256((__v16hi)__a, (__v8hi)__count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_slli_epi32(__m256i __a, int __count)
+{
+  return (__m256i)__builtin_ia32_pslldi256((__v8si)__a, __count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_sll_epi32(__m256i __a, __m128i __count)
+{
+  return (__m256i)__builtin_ia32_pslld256((__v8si)__a, (__v4si)__count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_slli_epi64(__m256i __a, int __count)
+{
+  return __builtin_ia32_psllqi256(__a, __count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_sll_epi64(__m256i __a, __m128i __count)
+{
+  return __builtin_ia32_psllq256(__a, __count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_srai_epi16(__m256i __a, int __count)
+{
+  return (__m256i)__builtin_ia32_psrawi256((__v16hi)__a, __count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_sra_epi16(__m256i __a, __m128i __count)
+{
+  return (__m256i)__builtin_ia32_psraw256((__v16hi)__a, (__v8hi)__count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_srai_epi32(__m256i __a, int __count)
+{
+  return (__m256i)__builtin_ia32_psradi256((__v8si)__a, __count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_sra_epi32(__m256i __a, __m128i __count)
+{
+  return (__m256i)__builtin_ia32_psrad256((__v8si)__a, (__v4si)__count);
+}
+
+#define _mm256_srli_si256(a, count) __extension__ ({ \
+  (__m256i)__builtin_ia32_psrldqi256((__m256i)(a), (count)*8); })
+
+#define _mm256_bsrli_epi128(a, count) _mm256_srli_si256((a), (count))
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_srli_epi16(__m256i __a, int __count)
+{
+  return (__m256i)__builtin_ia32_psrlwi256((__v16hi)__a, __count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_srl_epi16(__m256i __a, __m128i __count)
+{
+  return (__m256i)__builtin_ia32_psrlw256((__v16hi)__a, (__v8hi)__count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_srli_epi32(__m256i __a, int __count)
+{
+  return (__m256i)__builtin_ia32_psrldi256((__v8si)__a, __count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_srl_epi32(__m256i __a, __m128i __count)
+{
+  return (__m256i)__builtin_ia32_psrld256((__v8si)__a, (__v4si)__count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_srli_epi64(__m256i __a, int __count)
+{
+  return __builtin_ia32_psrlqi256(__a, __count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_srl_epi64(__m256i __a, __m128i __count)
+{
+  return __builtin_ia32_psrlq256(__a, __count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_sub_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v32qi)__a - (__v32qi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_sub_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v16hi)__a - (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_sub_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v8si)__a - (__v8si)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_sub_epi64(__m256i __a, __m256i __b)
+{
+  return __a - __b;
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_subs_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_psubsb256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_subs_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_psubsw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_subs_epu8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_psubusb256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_subs_epu16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_psubusw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_unpackhi_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 8, 32+8, 9, 32+9, 10, 32+10, 11, 32+11, 12, 32+12, 13, 32+13, 14, 32+14, 15, 32+15, 24, 32+24, 25, 32+25, 26, 32+26, 27, 32+27, 28, 32+28, 29, 32+29, 30, 32+30, 31, 32+31);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_unpackhi_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_unpackhi_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 2, 8+2, 3, 8+3, 6, 8+6, 7, 8+7);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_unpackhi_epi64(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_shufflevector(__a, __b, 1, 4+1, 3, 4+3);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_unpacklo_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 0, 32+0, 1, 32+1, 2, 32+2, 3, 32+3, 4, 32+4, 5, 32+5, 6, 32+6, 7, 32+7, 16, 32+16, 17, 32+17, 18, 32+18, 19, 32+19, 20, 32+20, 21, 32+21, 22, 32+22, 23, 32+23);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_unpacklo_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_unpacklo_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 0, 8+0, 1, 8+1, 4, 8+4, 5, 8+5);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_unpacklo_epi64(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_shufflevector(__a, __b, 0, 4+0, 2, 4+2);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_xor_si256(__m256i __a, __m256i __b)
+{
+  return __a ^ __b;
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_stream_load_si256(__m256i const *__V)
+{
+  return (__m256i)__builtin_ia32_movntdqa256((const __v4di *)__V);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_broadcastss_ps(__m128 __X)
+{
+  return (__m128)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_broadcastsd_pd(__m128d __a)
+{
+  return __builtin_shufflevector(__a, __a, 0, 0);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_broadcastss_ps(__m128 __X)
+{
+  return (__m256)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_broadcastsd_pd(__m128d __X)
+{
+  return (__m256d)__builtin_shufflevector((__v2df)__X, (__v2df)__X, 0, 0, 0, 0);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_broadcastsi128_si256(__m128i __X)
+{
+  return (__m256i)__builtin_shufflevector(__X, __X, 0, 1, 0, 1);
+}
+
+#define _mm_blend_epi32(V1, V2, M) __extension__ ({ \
+  (__m128i)__builtin_shufflevector((__v4si)(__m128i)(V1),  \
+                                   (__v4si)(__m128i)(V2),  \
+                                   (((M) & 0x01) ? 4 : 0), \
+                                   (((M) & 0x02) ? 5 : 1), \
+                                   (((M) & 0x04) ? 6 : 2), \
+                                   (((M) & 0x08) ? 7 : 3)); })
+
+#define _mm256_blend_epi32(V1, V2, M) __extension__ ({ \
+  (__m256i)__builtin_shufflevector((__v8si)(__m256i)(V1),   \
+                                   (__v8si)(__m256i)(V2),   \
+                                   (((M) & 0x01) ?  8 : 0), \
+                                   (((M) & 0x02) ?  9 : 1), \
+                                   (((M) & 0x04) ? 10 : 2), \
+                                   (((M) & 0x08) ? 11 : 3), \
+                                   (((M) & 0x10) ? 12 : 4), \
+                                   (((M) & 0x20) ? 13 : 5), \
+                                   (((M) & 0x40) ? 14 : 6), \
+                                   (((M) & 0x80) ? 15 : 7)); })
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_broadcastb_epi8(__m128i __X)
+{
+  return (__m256i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_broadcastw_epi16(__m128i __X)
+{
+  return (__m256i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_broadcastd_epi32(__m128i __X)
+{
+  return (__m256i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_broadcastq_epi64(__m128i __X)
+{
+  return (__m256i)__builtin_shufflevector(__X, __X, 0, 0, 0, 0);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_broadcastb_epi8(__m128i __X)
+{
+  return (__m128i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_broadcastw_epi16(__m128i __X)
+{
+  return (__m128i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_broadcastd_epi32(__m128i __X)
+{
+  return (__m128i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_broadcastq_epi64(__m128i __X)
+{
+  return (__m128i)__builtin_shufflevector(__X, __X, 0, 0);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_permutevar8x32_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_permvarsi256((__v8si)__a, (__v8si)__b);
+}
+
+#define _mm256_permute4x64_pd(V, M) __extension__ ({ \
+  (__m256d)__builtin_shufflevector((__v4df)(__m256d)(V), \
+                                   (__v4df)_mm256_setzero_pd(), \
+                                   (M) & 0x3, ((M) & 0xc) >> 2, \
+                                   ((M) & 0x30) >> 4, ((M) & 0xc0) >> 6); })
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_permutevar8x32_ps(__m256 __a, __m256i __b)
+{
+  return (__m256)__builtin_ia32_permvarsf256((__v8sf)__a, (__v8si)__b);
+}
+
+#define _mm256_permute4x64_epi64(V, M) __extension__ ({ \
+  (__m256i)__builtin_shufflevector((__v4di)(__m256i)(V), \
+                                   (__v4di)_mm256_setzero_si256(), \
+                                   (M) & 0x3, ((M) & 0xc) >> 2, \
+                                   ((M) & 0x30) >> 4, ((M) & 0xc0) >> 6); })
+
+#define _mm256_permute2x128_si256(V1, V2, M) __extension__ ({ \
+  (__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (M)); })
+
+#define _mm256_extracti128_si256(V, M) __extension__ ({ \
+  (__m128i)__builtin_shufflevector((__v4di)(__m256i)(V), \
+                                   (__v4di)_mm256_setzero_si256(), \
+                                   (((M) & 1) ? 2 : 0), \
+                                   (((M) & 1) ? 3 : 1) ); })
+
+#define _mm256_inserti128_si256(V1, V2, M) __extension__ ({ \
+  (__m256i)__builtin_shufflevector((__v4di)(__m256i)(V1), \
+                                   (__v4di)_mm256_castsi128_si256((__m128i)(V2)), \
+                                   (((M) & 1) ? 0 : 4), \
+                                   (((M) & 1) ? 1 : 5), \
+                                   (((M) & 1) ? 4 : 2), \
+                                   (((M) & 1) ? 5 : 3) ); })
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskload_epi32(int const *__X, __m256i __M)
+{
+  return (__m256i)__builtin_ia32_maskloadd256((const __v8si *)__X, (__v8si)__M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskload_epi64(long long const *__X, __m256i __M)
+{
+  return (__m256i)__builtin_ia32_maskloadq256((const __v4di *)__X, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskload_epi32(int const *__X, __m128i __M)
+{
+  return (__m128i)__builtin_ia32_maskloadd((const __v4si *)__X, (__v4si)__M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskload_epi64(long long const *__X, __m128i __M)
+{
+  return (__m128i)__builtin_ia32_maskloadq((const __v2di *)__X, (__v2di)__M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_maskstore_epi32(int *__X, __m256i __M, __m256i __Y)
+{
+  __builtin_ia32_maskstored256((__v8si *)__X, (__v8si)__M, (__v8si)__Y);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_maskstore_epi64(long long *__X, __m256i __M, __m256i __Y)
+{
+  __builtin_ia32_maskstoreq256((__v4di *)__X, __M, __Y);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_maskstore_epi32(int *__X, __m128i __M, __m128i __Y)
+{
+  __builtin_ia32_maskstored((__v4si *)__X, (__v4si)__M, (__v4si)__Y);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y)
+{
+  __builtin_ia32_maskstoreq(( __v2di *)__X, __M, __Y);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_sllv_epi32(__m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_psllv8si((__v8si)__X, (__v8si)__Y);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sllv_epi32(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_psllv4si((__v4si)__X, (__v4si)__Y);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_sllv_epi64(__m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_psllv4di(__X, __Y);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sllv_epi64(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_psllv2di(__X, __Y);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_srav_epi32(__m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_psrav8si((__v8si)__X, (__v8si)__Y);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_srav_epi32(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_psrav4si((__v4si)__X, (__v4si)__Y);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_srlv_epi32(__m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_psrlv8si((__v8si)__X, (__v8si)__Y);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_srlv_epi32(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_psrlv4si((__v4si)__X, (__v4si)__Y);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_srlv_epi64(__m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_psrlv4di(__X, __Y);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_srlv_epi64(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_psrlv2di(__X, __Y);
+}
+
+#define _mm_mask_i32gather_pd(a, m, i, mask, s) __extension__ ({ \
+  (__m128d)__builtin_ia32_gatherd_pd((__v2df)(__m128i)(a), \
+                                     (double const *)(m), \
+                                     (__v4si)(__m128i)(i), \
+                                     (__v2df)(__m128d)(mask), (s)); })
+
+#define _mm256_mask_i32gather_pd(a, m, i, mask, s) __extension__ ({ \
+  (__m256d)__builtin_ia32_gatherd_pd256((__v4df)(__m256d)(a), \
+                                        (double const *)(m), \
+                                        (__v4si)(__m128i)(i), \
+                                        (__v4df)(__m256d)(mask), (s)); })
+
+#define _mm_mask_i64gather_pd(a, m, i, mask, s) __extension__ ({ \
+  (__m128d)__builtin_ia32_gatherq_pd((__v2df)(__m128d)(a), \
+                                     (double const *)(m), \
+                                     (__v2di)(__m128i)(i), \
+                                     (__v2df)(__m128d)(mask), (s)); })
+
+#define _mm256_mask_i64gather_pd(a, m, i, mask, s) __extension__ ({ \
+  (__m256d)__builtin_ia32_gatherq_pd256((__v4df)(__m256d)(a), \
+                                        (double const *)(m), \
+                                        (__v4di)(__m256i)(i), \
+                                        (__v4df)(__m256d)(mask), (s)); })
+
+#define _mm_mask_i32gather_ps(a, m, i, mask, s) __extension__ ({ \
+  (__m128)__builtin_ia32_gatherd_ps((__v4sf)(__m128)(a), \
+                                    (float const *)(m), \
+                                    (__v4si)(__m128i)(i), \
+                                    (__v4sf)(__m128)(mask), (s)); })
+
+#define _mm256_mask_i32gather_ps(a, m, i, mask, s) __extension__ ({ \
+  (__m256)__builtin_ia32_gatherd_ps256((__v8sf)(__m256)(a), \
+                                       (float const *)(m), \
+                                       (__v8si)(__m256i)(i), \
+                                       (__v8sf)(__m256)(mask), (s)); })
+
+#define _mm_mask_i64gather_ps(a, m, i, mask, s) __extension__ ({ \
+  (__m128)__builtin_ia32_gatherq_ps((__v4sf)(__m128)(a), \
+                                    (float const *)(m), \
+                                    (__v2di)(__m128i)(i), \
+                                    (__v4sf)(__m128)(mask), (s)); })
+
+#define _mm256_mask_i64gather_ps(a, m, i, mask, s) __extension__ ({ \
+  (__m128)__builtin_ia32_gatherq_ps256((__v4sf)(__m128)(a), \
+                                       (float const *)(m), \
+                                       (__v4di)(__m256i)(i), \
+                                       (__v4sf)(__m128)(mask), (s)); })
+
+#define _mm_mask_i32gather_epi32(a, m, i, mask, s) __extension__ ({ \
+  (__m128i)__builtin_ia32_gatherd_d((__v4si)(__m128i)(a), \
+                                    (int const *)(m), \
+                                    (__v4si)(__m128i)(i), \
+                                    (__v4si)(__m128i)(mask), (s)); })
+
+#define _mm256_mask_i32gather_epi32(a, m, i, mask, s) __extension__ ({ \
+  (__m256i)__builtin_ia32_gatherd_d256((__v8si)(__m256i)(a), \
+                                       (int const *)(m), \
+                                       (__v8si)(__m256i)(i), \
+                                       (__v8si)(__m256i)(mask), (s)); })
+
+#define _mm_mask_i64gather_epi32(a, m, i, mask, s) __extension__ ({ \
+  (__m128i)__builtin_ia32_gatherq_d((__v4si)(__m128i)(a), \
+                                    (int const *)(m), \
+                                    (__v2di)(__m128i)(i), \
+                                    (__v4si)(__m128i)(mask), (s)); })
+
+#define _mm256_mask_i64gather_epi32(a, m, i, mask, s) __extension__ ({ \
+  (__m128i)__builtin_ia32_gatherq_d256((__v4si)(__m128i)(a), \
+                                       (int const *)(m), \
+                                       (__v4di)(__m256i)(i), \
+                                       (__v4si)(__m128i)(mask), (s)); })
+
+#define _mm_mask_i32gather_epi64(a, m, i, mask, s) __extension__ ({ \
+  (__m128i)__builtin_ia32_gatherd_q((__v2di)(__m128i)(a), \
+                                    (long long const *)(m), \
+                                    (__v4si)(__m128i)(i), \
+                                    (__v2di)(__m128i)(mask), (s)); })
+
+#define _mm256_mask_i32gather_epi64(a, m, i, mask, s) __extension__ ({ \
+  (__m256i)__builtin_ia32_gatherd_q256((__v4di)(__m256i)(a), \
+                                       (long long const *)(m), \
+                                       (__v4si)(__m128i)(i), \
+                                       (__v4di)(__m256i)(mask), (s)); })
+
+#define _mm_mask_i64gather_epi64(a, m, i, mask, s) __extension__ ({ \
+  (__m128i)__builtin_ia32_gatherq_q((__v2di)(__m128i)(a), \
+                                    (long long const *)(m), \
+                                    (__v2di)(__m128i)(i), \
+                                    (__v2di)(__m128i)(mask), (s)); })
+
+#define _mm256_mask_i64gather_epi64(a, m, i, mask, s) __extension__ ({ \
+  (__m256i)__builtin_ia32_gatherq_q256((__v4di)(__m256i)(a), \
+                                       (long long const *)(m), \
+                                       (__v4di)(__m256i)(i), \
+                                       (__v4di)(__m256i)(mask), (s)); })
+
+#define _mm_i32gather_pd(m, i, s) __extension__ ({ \
+  (__m128d)__builtin_ia32_gatherd_pd((__v2df)_mm_undefined_pd(), \
+                                     (double const *)(m), \
+                                     (__v4si)(__m128i)(i), \
+                                     (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
+                                                          _mm_setzero_pd()), \
+                                     (s)); })
+
+#define _mm256_i32gather_pd(m, i, s) __extension__ ({ \
+  (__m256d)__builtin_ia32_gatherd_pd256((__v4df)_mm256_undefined_pd(), \
+                                        (double const *)(m), \
+                                        (__v4si)(__m128i)(i), \
+                                        (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
+                                                              _mm256_setzero_pd(), \
+                                                              _CMP_EQ_OQ), \
+                                        (s)); })
+
+#define _mm_i64gather_pd(m, i, s) __extension__ ({ \
+  (__m128d)__builtin_ia32_gatherq_pd((__v2df)_mm_undefined_pd(), \
+                                     (double const *)(m), \
+                                     (__v2di)(__m128i)(i), \
+                                     (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
+                                                          _mm_setzero_pd()), \
+                                     (s)); })
+
+#define _mm256_i64gather_pd(m, i, s) __extension__ ({ \
+  (__m256d)__builtin_ia32_gatherq_pd256((__v4df)_mm256_undefined_pd(), \
+                                        (double const *)(m), \
+                                        (__v4di)(__m256i)(i), \
+                                        (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
+                                                              _mm256_setzero_pd(), \
+                                                              _CMP_EQ_OQ), \
+                                        (s)); })
+
+#define _mm_i32gather_ps(m, i, s) __extension__ ({ \
+  (__m128)__builtin_ia32_gatherd_ps((__v4sf)_mm_undefined_ps(), \
+                                    (float const *)(m), \
+                                    (__v4si)(__m128i)(i), \
+                                    (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
+                                                         _mm_setzero_ps()), \
+                                    (s)); })
+
+#define _mm256_i32gather_ps(m, i, s) __extension__ ({ \
+  (__m256)__builtin_ia32_gatherd_ps256((__v8sf)_mm256_undefined_ps(), \
+                                       (float const *)(m), \
+                                       (__v8si)(__m256i)(i), \
+                                       (__v8sf)_mm256_cmp_ps(_mm256_setzero_ps(), \
+                                                             _mm256_setzero_ps(), \
+                                                             _CMP_EQ_OQ), \
+                                       (s)); })
+
+#define _mm_i64gather_ps(m, i, s) __extension__ ({ \
+  (__m128)__builtin_ia32_gatherq_ps((__v4sf)_mm_undefined_ps(), \
+                                    (float const *)(m), \
+                                    (__v2di)(__m128i)(i), \
+                                    (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
+                                                         _mm_setzero_ps()), \
+                                    (s)); })
+
+#define _mm256_i64gather_ps(m, i, s) __extension__ ({ \
+  (__m128)__builtin_ia32_gatherq_ps256((__v4sf)_mm_undefined_ps(), \
+                                       (float const *)(m), \
+                                       (__v4di)(__m256i)(i), \
+                                       (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
+                                                            _mm_setzero_ps()), \
+                                       (s)); })
+
+#define _mm_i32gather_epi32(m, i, s) __extension__ ({ \
+  (__m128i)__builtin_ia32_gatherd_d((__v4si)_mm_undefined_si128(), \
+                                    (int const *)(m), (__v4si)(__m128i)(i), \
+                                    (__v4si)_mm_set1_epi32(-1), (s)); })
+
+#define _mm256_i32gather_epi32(m, i, s) __extension__ ({ \
+  (__m256i)__builtin_ia32_gatherd_d256((__v8si)_mm256_undefined_si256(), \
+                                       (int const *)(m), (__v8si)(__m256i)(i), \
+                                       (__v8si)_mm256_set1_epi32(-1), (s)); })
+
+#define _mm_i64gather_epi32(m, i, s) __extension__ ({ \
+  (__m128i)__builtin_ia32_gatherq_d((__v4si)_mm_undefined_si128(), \
+                                    (int const *)(m), (__v2di)(__m128i)(i), \
+                                    (__v4si)_mm_set1_epi32(-1), (s)); })
+
+#define _mm256_i64gather_epi32(m, i, s) __extension__ ({ \
+  (__m128i)__builtin_ia32_gatherq_d256((__v4si)_mm_undefined_si128(), \
+                                       (int const *)(m), (__v4di)(__m256i)(i), \
+                                       (__v4si)_mm_set1_epi32(-1), (s)); })
+
+#define _mm_i32gather_epi64(m, i, s) __extension__ ({ \
+  (__m128i)__builtin_ia32_gatherd_q((__v2di)_mm_undefined_si128(), \
+                                    (long long const *)(m), \
+                                    (__v4si)(__m128i)(i), \
+                                    (__v2di)_mm_set1_epi64x(-1), (s)); })
+
+#define _mm256_i32gather_epi64(m, i, s) __extension__ ({ \
+  (__m256i)__builtin_ia32_gatherd_q256((__v4di)_mm256_undefined_si256(), \
+                                       (long long const *)(m), \
+                                       (__v4si)(__m128i)(i), \
+                                       (__v4di)_mm256_set1_epi64x(-1), (s)); })
+
+#define _mm_i64gather_epi64(m, i, s) __extension__ ({ \
+  (__m128i)__builtin_ia32_gatherq_q((__v2di)_mm_undefined_si128(), \
+                                    (long long const *)(m), \
+                                    (__v2di)(__m128i)(i), \
+                                    (__v2di)_mm_set1_epi64x(-1), (s)); })
+
+#define _mm256_i64gather_epi64(m, i, s) __extension__ ({ \
+  (__m256i)__builtin_ia32_gatherq_q256((__v4di)_mm256_undefined_si256(), \
+                                       (long long const *)(m), \
+                                       (__v4di)(__m256i)(i), \
+                                       (__v4di)_mm256_set1_epi64x(-1), (s)); })
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __AVX2INTRIN_H */
diff --git a/25.0.2/clang-include/avx512bwintrin.h b/25.0.2/clang-include/avx512bwintrin.h
new file mode 100644
index 0000000..f289ed7
--- /dev/null
+++ b/25.0.2/clang-include/avx512bwintrin.h
@@ -0,0 +1,1542 @@
+/*===------------- avx512bwintrin.h - AVX512BW intrinsics ------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512bwintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512BWINTRIN_H
+#define __AVX512BWINTRIN_H
+
+typedef unsigned int __mmask32;
+typedef unsigned long long __mmask64;
+typedef char __v64qi __attribute__ ((__vector_size__ (64)));
+typedef short __v32hi __attribute__ ((__vector_size__ (64)));
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512bw")))
+
+static  __inline __v64qi __DEFAULT_FN_ATTRS
+_mm512_setzero_qi(void) {
+  return (__v64qi){ 0, 0, 0, 0, 0, 0, 0, 0,
+                       0, 0, 0, 0, 0, 0, 0, 0,
+                       0, 0, 0, 0, 0, 0, 0, 0,
+                       0, 0, 0, 0, 0, 0, 0, 0,
+                       0, 0, 0, 0, 0, 0, 0, 0,
+                       0, 0, 0, 0, 0, 0, 0, 0,
+                       0, 0, 0, 0, 0, 0, 0, 0,
+                       0, 0, 0, 0, 0, 0, 0, 0 };
+}
+
+static  __inline __v32hi __DEFAULT_FN_ATTRS
+_mm512_setzero_hi(void) {
+  return (__v32hi){ 0, 0, 0, 0, 0, 0, 0, 0,
+                       0, 0, 0, 0, 0, 0, 0, 0,
+                       0, 0, 0, 0, 0, 0, 0, 0,
+                       0, 0, 0, 0, 0, 0, 0, 0 };
+}
+
+/* Integer compare */
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_cmpeq_epi8_mask(__m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_pcmpeqb512_mask((__v64qi)__a, (__v64qi)__b,
+                                                   (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpeq_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_pcmpeqb512_mask((__v64qi)__a, (__v64qi)__b,
+                                                   __u);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_cmpeq_epu8_mask(__m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 0,
+                                                 (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpeq_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 0,
+                                                 __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_cmpeq_epi16_mask(__m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_pcmpeqw512_mask((__v32hi)__a, (__v32hi)__b,
+                                                   (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpeq_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_pcmpeqw512_mask((__v32hi)__a, (__v32hi)__b,
+                                                   __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_cmpeq_epu16_mask(__m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 0,
+                                                 (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpeq_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 0,
+                                                 __u);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_cmpge_epi8_mask(__m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 5,
+                                                (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpge_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 5,
+                                                __u);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_cmpge_epu8_mask(__m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 5,
+                                                 (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpge_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 5,
+                                                 __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_cmpge_epi16_mask(__m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 5,
+                                                (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpge_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 5,
+                                                __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_cmpge_epu16_mask(__m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 5,
+                                                 (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpge_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 5,
+                                                 __u);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_cmpgt_epi8_mask(__m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_pcmpgtb512_mask((__v64qi)__a, (__v64qi)__b,
+                                                   (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpgt_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_pcmpgtb512_mask((__v64qi)__a, (__v64qi)__b,
+                                                   __u);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_cmpgt_epu8_mask(__m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 6,
+                                                 (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpgt_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 6,
+                                                 __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_cmpgt_epi16_mask(__m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_pcmpgtw512_mask((__v32hi)__a, (__v32hi)__b,
+                                                   (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpgt_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_pcmpgtw512_mask((__v32hi)__a, (__v32hi)__b,
+                                                   __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_cmpgt_epu16_mask(__m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 6,
+                                                 (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpgt_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 6,
+                                                 __u);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_cmple_epi8_mask(__m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 2,
+                                                (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_mask_cmple_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 2,
+                                                __u);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_cmple_epu8_mask(__m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 2,
+                                                 (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_mask_cmple_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 2,
+                                                 __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_cmple_epi16_mask(__m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 2,
+                                                (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_mask_cmple_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 2,
+                                                __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_cmple_epu16_mask(__m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 2,
+                                                 (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_mask_cmple_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 2,
+                                                 __u);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_cmplt_epi8_mask(__m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 1,
+                                                (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_mask_cmplt_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 1,
+                                                __u);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_cmplt_epu8_mask(__m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 1,
+                                                 (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_mask_cmplt_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 1,
+                                                 __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_cmplt_epi16_mask(__m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 1,
+                                                (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_mask_cmplt_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 1,
+                                                __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_cmplt_epu16_mask(__m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 1,
+                                                 (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_mask_cmplt_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 1,
+                                                 __u);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_cmpneq_epi8_mask(__m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 4,
+                                                (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpneq_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 4,
+                                                __u);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_cmpneq_epu8_mask(__m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 4,
+                                                 (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpneq_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 4,
+                                                 __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_cmpneq_epi16_mask(__m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 4,
+                                                (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpneq_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 4,
+                                                __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_cmpneq_epu16_mask(__m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 4,
+                                                 (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpneq_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 4,
+                                                 __u);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_add_epi8 (__m512i __A, __m512i __B) {
+  return (__m512i) ((__v64qi) __A + (__v64qi) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_add_epi8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
+  return (__m512i) __builtin_ia32_paddb512_mask ((__v64qi) __A,
+             (__v64qi) __B,
+             (__v64qi) __W,
+             (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_add_epi8 (__mmask64 __U, __m512i __A, __m512i __B) {
+  return (__m512i) __builtin_ia32_paddb512_mask ((__v64qi) __A,
+             (__v64qi) __B,
+             (__v64qi) _mm512_setzero_qi(),
+             (__mmask64) __U);
+}
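+
+/* Masking convention used throughout this file: the _mm512_mask_* forms write
+ * the result only into lanes whose bit in __U is set and copy the matching lane
+ * of __W elsewhere; the _mm512_maskz_* forms zero the unselected lanes, and the
+ * unsuffixed forms operate on every lane. */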
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_sub_epi8 (__m512i __A, __m512i __B) {
+  return (__m512i) ((__v64qi) __A - (__v64qi) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_sub_epi8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
+  return (__m512i) __builtin_ia32_psubb512_mask ((__v64qi) __A,
+             (__v64qi) __B,
+             (__v64qi) __W,
+             (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_sub_epi8 (__mmask64 __U, __m512i __A, __m512i __B) {
+  return (__m512i) __builtin_ia32_psubb512_mask ((__v64qi) __A,
+             (__v64qi) __B,
+             (__v64qi) _mm512_setzero_qi(),
+             (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_add_epi16 (__m512i __A, __m512i __B) {
+  return (__m512i) ((__v32hi) __A + (__v32hi) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_add_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
+  return (__m512i) __builtin_ia32_paddw512_mask ((__v32hi) __A,
+             (__v32hi) __B,
+             (__v32hi) __W,
+             (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_add_epi16 (__mmask32 __U, __m512i __A, __m512i __B) {
+  return (__m512i) __builtin_ia32_paddw512_mask ((__v32hi) __A,
+             (__v32hi) __B,
+             (__v32hi) _mm512_setzero_hi(),
+             (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_sub_epi16 (__m512i __A, __m512i __B) {
+  return (__m512i) ((__v32hi) __A - (__v32hi) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_sub_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
+  return (__m512i) __builtin_ia32_psubw512_mask ((__v32hi) __A,
+             (__v32hi) __B,
+             (__v32hi) __W,
+             (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_sub_epi16 (__mmask32 __U, __m512i __A, __m512i __B) {
+  return (__m512i) __builtin_ia32_psubw512_mask ((__v32hi) __A,
+             (__v32hi) __B,
+             (__v32hi) _mm512_setzero_hi(),
+             (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mullo_epi16 (__m512i __A, __m512i __B) {
+  return (__m512i) ((__v32hi) __A * (__v32hi) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_mullo_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
+  return (__m512i) __builtin_ia32_pmullw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) __W,
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_mullo_epi16 (__mmask32 __U, __m512i __A, __m512i __B) {
+  return (__m512i) __builtin_ia32_pmullw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_blend_epi8 (__mmask64 __U, __m512i __A, __m512i __W)
+{
+  return (__m512i) __builtin_ia32_blendmb_512_mask ((__v64qi) __A,
+              (__v64qi) __W,
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_blend_epi16 (__mmask32 __U, __m512i __A, __m512i __W)
+{
+  return (__m512i) __builtin_ia32_blendmw_512_mask ((__v32hi) __A,
+              (__v32hi) __W,
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_abs_epi8 (__m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsb512_mask ((__v64qi) __A,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_abs_epi8 (__m512i __W, __mmask64 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsb512_mask ((__v64qi) __A,
+              (__v64qi) __W,
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_abs_epi8 (__mmask64 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsb512_mask ((__v64qi) __A,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_abs_epi16 (__m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsw512_mask ((__v32hi) __A,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_abs_epi16 (__m512i __W, __mmask32 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsw512_mask ((__v32hi) __A,
+              (__v32hi) __W,
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_abs_epi16 (__mmask32 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsw512_mask ((__v32hi) __A,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_packs_epi32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packssdw512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_packs_epi32 (__mmask32 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packssdw512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_packs_epi32 (__m512i __W, __mmask32 __M, __m512i __A,
+       __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packssdw512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v32hi) __W,
+              __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_packs_epi16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packsswb512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_packs_epi16 (__m512i __W, __mmask64 __M, __m512i __A,
+       __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packsswb512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v64qi) __W,
+              (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_packs_epi16 (__mmask64 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packsswb512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_packus_epi32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packusdw512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_packus_epi32 (__mmask32 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packusdw512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_packus_epi32 (__m512i __W, __mmask32 __M, __m512i __A,
+        __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packusdw512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v32hi) __W,
+              __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_packus_epi16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packuswb512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_packus_epi16 (__m512i __W, __mmask64 __M, __m512i __A,
+        __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packuswb512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v64qi) __W,
+              (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_packus_epi16 (__mmask64 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packuswb512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) __M);
+}
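+
+/* The pack intrinsics above narrow with saturation: packs_epi32/packs_epi16 use
+ * signed saturation, while packus_epi32/packus_epi16 use unsigned saturation.
+ * As with the 128/256-bit forms, the two sources are interleaved within each
+ * 128-bit lane rather than across the full 512-bit vector. */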
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_adds_epi8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddsb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_adds_epi8 (__m512i __W, __mmask64 __U, __m512i __A,
+           __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddsb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) __W,
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_adds_epi8 (__mmask64 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddsb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_adds_epi16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddsw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_adds_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+      __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddsw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) __W,
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_adds_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddsw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_adds_epu8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddusb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_adds_epu8 (__m512i __W, __mmask64 __U, __m512i __A,
+           __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddusb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) __W,
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_adds_epu8 (__mmask64 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddusb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_adds_epu16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddusw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_adds_epu16 (__m512i __W, __mmask32 __U, __m512i __A,
+      __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddusw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) __W,
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_adds_epu16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddusw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) __U);
+}
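+
+/* adds_epi8/adds_epi16 clamp the sum to the signed range of the element type
+ * ([-128,127] and [-32768,32767]); adds_epu8/adds_epu16 clamp to the unsigned
+ * range ([0,255] and [0,65535]). The subs_* intrinsics further below saturate
+ * the difference in the same way. */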
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_avg_epu8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pavgb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_avg_epu8 (__m512i __W, __mmask64 __U, __m512i __A,
+          __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pavgb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) __W,
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_avg_epu8 (__mmask64 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pavgb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_avg_epu16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pavgw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_avg_epu16 (__m512i __W, __mmask32 __U, __m512i __A,
+           __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pavgw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) __W,
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_avg_epu16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pavgw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_max_epi8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_max_epi8 (__mmask64 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_max_epi8 (__m512i __W, __mmask64 __M, __m512i __A,
+          __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) __W,
+              (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_max_epi16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_max_epi16 (__mmask32 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_max_epi16 (__m512i __W, __mmask32 __M, __m512i __A,
+           __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) __W,
+              (__mmask32) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_max_epu8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxub512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_max_epu8 (__mmask64 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxub512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_max_epu8 (__m512i __W, __mmask64 __M, __m512i __A,
+          __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxub512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) __W,
+              (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_max_epu16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxuw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_max_epu16 (__mmask32 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxuw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_max_epu16 (__m512i __W, __mmask32 __M, __m512i __A,
+           __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxuw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) __W,
+              (__mmask32) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_min_epi8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_min_epi8 (__mmask64 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_min_epi8 (__m512i __W, __mmask64 __M, __m512i __A,
+          __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) __W,
+              (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_min_epi16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_min_epi16 (__mmask32 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_min_epi16 (__m512i __W, __mmask32 __M, __m512i __A,
+           __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) __W,
+              (__mmask32) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_min_epu8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminub512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_min_epu8 (__mmask64 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminub512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_min_epu8 (__m512i __W, __mmask64 __M, __m512i __A,
+          __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminub512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) __W,
+              (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_min_epu16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminuw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_min_epu16 (__mmask32 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminuw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_min_epu16 (__m512i __W, __mmask32 __M, __m512i __A,
+           __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminuw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) __W,
+              (__mmask32) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_shuffle_epi8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pshufb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_shuffle_epi8 (__m512i __W, __mmask64 __U, __m512i __A,
+        __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pshufb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) __W,
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_shuffle_epi8 (__mmask64 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pshufb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) __U);
+}
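+
+/* shuffle_epi8 applies PSHUFB independently to each 128-bit lane: output byte i
+ * is zero when bit 7 of the control byte __B[i] is set, and otherwise it is the
+ * byte of __A selected by the low four bits of __B[i] within the same lane. */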
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_subs_epi8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubsb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_subs_epi8 (__m512i __W, __mmask64 __U, __m512i __A,
+           __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubsb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) __W,
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_subs_epi8 (__mmask64 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubsb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_subs_epi16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubsw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_subs_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+      __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubsw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) __W,
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_subs_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubsw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_subs_epu8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubusb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_subs_epu8 (__m512i __W, __mmask64 __U, __m512i __A,
+           __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubusb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) __W,
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_subs_epu8 (__mmask64 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubusb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_subs_epu16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubusw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_subs_epu16 (__m512i __W, __mmask32 __U, __m512i __A,
+      __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubusw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) __W,
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_subs_epu16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubusw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask2_permutex2var_epi16 (__m512i __A, __m512i __I,
+         __mmask32 __U, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermi2varhi512_mask ((__v32hi) __A,
+              (__v32hi) __I /* idx */ ,
+              (__v32hi) __B,
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_permutex2var_epi16 (__m512i __A, __m512i __I, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermt2varhi512_mask ((__v32hi) __I /* idx */,
+              (__v32hi) __A,
+              (__v32hi) __B,
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_permutex2var_epi16 (__m512i __A, __mmask32 __U,
+        __m512i __I, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermt2varhi512_mask ((__v32hi) __I /* idx */,
+              (__v32hi) __A,
+              (__v32hi) __B,
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_permutex2var_epi16 (__mmask32 __U, __m512i __A,
+         __m512i __I, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermt2varhi512_maskz ((__v32hi) __I /* idx */,
+              (__v32hi) __A,
+              (__v32hi) __B,
+              (__mmask32) __U);
+}
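+
+/* permutex2var_epi16 treats {__A, __B} as a 64-entry table of 16-bit elements:
+ * result word i is the table entry selected by the low six bits of __I[i]
+ * (bit 5 picks __B over __A). In the mask form unselected lanes keep __A, and
+ * in the mask2 form above they keep the index vector __I. */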
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mulhrs_epi16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmulhrsw512_mask ((__v32hi) __A,
+                (__v32hi) __B,
+                (__v32hi) _mm512_setzero_hi(),
+                (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_mulhrs_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+        __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmulhrsw512_mask ((__v32hi) __A,
+                (__v32hi) __B,
+                (__v32hi) __W,
+                (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_mulhrs_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmulhrsw512_mask ((__v32hi) __A,
+                (__v32hi) __B,
+                (__v32hi) _mm512_setzero_hi(),
+                (__mmask32) __U);
+}
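+
+/* mulhrs_epi16 returns the rounded high half of each 16x16-bit product,
+ * roughly dst[i] = (__A[i] * __B[i] + 0x4000) >> 15, truncated to 16 bits. */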
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mulhi_epi16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmulhw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_mulhi_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+       __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmulhw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) __W,
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_mulhi_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmulhw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mulhi_epu16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmulhuw512_mask ((__v32hi) __A,
+               (__v32hi) __B,
+               (__v32hi) _mm512_setzero_hi(),
+               (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_mulhi_epu16 (__m512i __W, __mmask32 __U, __m512i __A,
+       __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmulhuw512_mask ((__v32hi) __A,
+               (__v32hi) __B,
+               (__v32hi) __W,
+               (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_mulhi_epu16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmulhuw512_mask ((__v32hi) __A,
+               (__v32hi) __B,
+               (__v32hi) _mm512_setzero_hi(),
+               (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maddubs_epi16 (__m512i __X, __m512i __Y) {
+  return (__m512i) __builtin_ia32_pmaddubsw512_mask ((__v64qi) __X,
+                 (__v64qi) __Y,
+                 (__v32hi) _mm512_setzero_hi(),
+                 (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_maddubs_epi16 (__m512i __W, __mmask32 __U, __m512i __X,
+         __m512i __Y) {
+  return (__m512i) __builtin_ia32_pmaddubsw512_mask ((__v64qi) __X,
+                 (__v64qi) __Y,
+                 (__v32hi) __W,
+                 (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_maddubs_epi16 (__mmask32 __U, __m512i __X, __m512i __Y) {
+  return (__m512i) __builtin_ia32_pmaddubsw512_mask ((__v64qi) __X,
+                 (__v64qi) __Y,
+                 (__v32hi) _mm512_setzero_hi(),
+                 (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_madd_epi16 (__m512i __A, __m512i __B) {
+  return (__m512i) __builtin_ia32_pmaddwd512_mask ((__v32hi) __A,
+               (__v32hi) __B,
+               (__v16si) _mm512_setzero_si512(),
+               (__mmask16) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_madd_epi16 (__m512i __W, __mmask16 __U, __m512i __A,
+      __m512i __B) {
+  return (__m512i) __builtin_ia32_pmaddwd512_mask ((__v32hi) __A,
+               (__v32hi) __B,
+               (__v16si) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_madd_epi16 (__mmask16 __U, __m512i __A, __m512i __B) {
+  return (__m512i) __builtin_ia32_pmaddwd512_mask ((__v32hi) __A,
+               (__v32hi) __B,
+               (__v16si) _mm512_setzero_si512(),
+               (__mmask16) __U);
+}
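+
+/* maddubs_epi16 multiplies unsigned bytes of __X by the corresponding signed
+ * bytes of __Y and adds adjacent products into 16-bit results with signed
+ * saturation; madd_epi16 multiplies signed 16-bit elements and adds adjacent
+ * products into 32-bit results without saturation. */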
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_cvtsepi16_epi8 (__m512i __A) {
+  return (__m256i) __builtin_ia32_pmovswb512_mask ((__v32hi) __A,
+               (__v32qi)_mm256_setzero_si256(),
+               (__mmask32) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtsepi16_epi8 (__m256i __O, __mmask32 __M, __m512i __A) {
+  return (__m256i) __builtin_ia32_pmovswb512_mask ((__v32hi) __A,
+               (__v32qi)__O,
+               __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtsepi16_epi8 (__mmask32 __M, __m512i __A) {
+  return (__m256i) __builtin_ia32_pmovswb512_mask ((__v32hi) __A,
+               (__v32qi) _mm256_setzero_si256(),
+               __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_cvtusepi16_epi8 (__m512i __A) {
+  return (__m256i) __builtin_ia32_pmovuswb512_mask ((__v32hi) __A,
+                (__v32qi) _mm256_setzero_si256(),
+                (__mmask32) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtusepi16_epi8 (__m256i __O, __mmask32 __M, __m512i __A) {
+  return (__m256i) __builtin_ia32_pmovuswb512_mask ((__v32hi) __A,
+                (__v32qi) __O,
+                __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtusepi16_epi8 (__mmask32 __M, __m512i __A) {
+  return (__m256i) __builtin_ia32_pmovuswb512_mask ((__v32hi) __A,
+                (__v32qi) _mm256_setzero_si256(),
+                __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_cvtepi16_epi8 (__m512i __A) {
+  return (__m256i) __builtin_ia32_pmovwb512_mask ((__v32hi) __A,
+              (__v32qi) _mm256_setzero_si256(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi16_epi8 (__m256i __O, __mmask32 __M, __m512i __A) {
+  return (__m256i) __builtin_ia32_pmovwb512_mask ((__v32hi) __A,
+              (__v32qi) __O,
+              __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepi16_epi8 (__mmask32 __M, __m512i __A) {
+  return (__m256i) __builtin_ia32_pmovwb512_mask ((__v32hi) __A,
+              (__v32qi) _mm256_setzero_si256(),
+              __M);
+}
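+
+/* The three 16-to-8 bit down-converts differ only in how they narrow:
+ * cvtsepi16_epi8 uses signed saturation, cvtusepi16_epi8 uses unsigned
+ * saturation, and cvtepi16_epi8 simply truncates each word to its low byte. */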
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_unpackhi_epi8 (__m512i __A, __m512i __B) {
+  return (__m512i) __builtin_ia32_punpckhbw512_mask ((__v64qi) __A,
+                 (__v64qi) __B,
+                 (__v64qi) _mm512_setzero_qi(),
+                 (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_unpackhi_epi8 (__m512i __W, __mmask64 __U, __m512i __A,
+         __m512i __B) {
+  return (__m512i) __builtin_ia32_punpckhbw512_mask ((__v64qi) __A,
+                 (__v64qi) __B,
+                 (__v64qi) __W,
+                 (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_unpackhi_epi8 (__mmask64 __U, __m512i __A, __m512i __B) {
+  return (__m512i) __builtin_ia32_punpckhbw512_mask ((__v64qi) __A,
+                 (__v64qi) __B,
+                 (__v64qi) _mm512_setzero_qi(),
+                 (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_unpackhi_epi16 (__m512i __A, __m512i __B) {
+  return (__m512i) __builtin_ia32_punpckhwd512_mask ((__v32hi) __A,
+                 (__v32hi) __B,
+                 (__v32hi) _mm512_setzero_hi(),
+                 (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_unpackhi_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+          __m512i __B) {
+  return (__m512i) __builtin_ia32_punpckhwd512_mask ((__v32hi) __A,
+                 (__v32hi) __B,
+                 (__v32hi) __W,
+                 (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_unpackhi_epi16 (__mmask32 __U, __m512i __A, __m512i __B) {
+  return (__m512i) __builtin_ia32_punpckhwd512_mask ((__v32hi) __A,
+                 (__v32hi) __B,
+                 (__v32hi) _mm512_setzero_hi(),
+                 (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_unpacklo_epi8 (__m512i __A, __m512i __B) {
+  return (__m512i) __builtin_ia32_punpcklbw512_mask ((__v64qi) __A,
+                 (__v64qi) __B,
+                 (__v64qi) _mm512_setzero_qi(),
+                 (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_unpacklo_epi8 (__m512i __W, __mmask64 __U, __m512i __A,
+         __m512i __B) {
+  return (__m512i) __builtin_ia32_punpcklbw512_mask ((__v64qi) __A,
+                 (__v64qi) __B,
+                 (__v64qi) __W,
+                 (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_unpacklo_epi8 (__mmask64 __U, __m512i __A, __m512i __B) {
+  return (__m512i) __builtin_ia32_punpcklbw512_mask ((__v64qi) __A,
+                 (__v64qi) __B,
+                 (__v64qi) _mm512_setzero_qi(),
+                 (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_unpacklo_epi16 (__m512i __A, __m512i __B) {
+  return (__m512i) __builtin_ia32_punpcklwd512_mask ((__v32hi) __A,
+                 (__v32hi) __B,
+                 (__v32hi) _mm512_setzero_hi(),
+                 (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_unpacklo_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+          __m512i __B) {
+  return (__m512i) __builtin_ia32_punpcklwd512_mask ((__v32hi) __A,
+                 (__v32hi) __B,
+                 (__v32hi) __W,
+                 (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_unpacklo_epi16 (__mmask32 __U, __m512i __A, __m512i __B) {
+  return (__m512i) __builtin_ia32_punpcklwd512_mask ((__v32hi) __A,
+                 (__v32hi) __B,
+                 (__v32hi) _mm512_setzero_hi(),
+                 (__mmask32) __U);
+}
+
+#define _mm512_cmp_epi8_mask(a, b, p) __extension__ ({ \
+  (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \
+                                         (__v64qi)(__m512i)(b), \
+                                         (p), (__mmask64)-1); })
+
+#define _mm512_mask_cmp_epi8_mask(m, a, b, p) __extension__ ({ \
+  (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \
+                                         (__v64qi)(__m512i)(b), \
+                                         (p), (__mmask64)(m)); })
+
+#define _mm512_cmp_epu8_mask(a, b, p) __extension__ ({ \
+  (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \
+                                          (__v64qi)(__m512i)(b), \
+                                          (p), (__mmask64)-1); })
+
+#define _mm512_mask_cmp_epu8_mask(m, a, b, p) __extension__ ({ \
+  (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \
+                                          (__v64qi)(__m512i)(b), \
+                                          (p), (__mmask64)(m)); })
+
+#define _mm512_cmp_epi16_mask(a, b, p) __extension__ ({ \
+  (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \
+                                         (__v32hi)(__m512i)(b), \
+                                         (p), (__mmask32)-1); })
+
+#define _mm512_mask_cmp_epi16_mask(m, a, b, p) __extension__ ({ \
+  (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \
+                                         (__v32hi)(__m512i)(b), \
+                                         (p), (__mmask32)(m)); })
+
+#define _mm512_cmp_epu16_mask(a, b, p) __extension__ ({ \
+  (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \
+                                          (__v32hi)(__m512i)(b), \
+                                          (p), (__mmask32)-1); })
+
+#define _mm512_mask_cmp_epu16_mask(m, a, b, p) __extension__ ({ \
+  (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \
+                                          (__v32hi)(__m512i)(b), \
+                                          (p), (__mmask32)(m)); })
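+
+/* The immediate p follows the _MM_CMPINT_* encoding (0=EQ, 1=LT, 2=LE, 4=NE,
+ * 5=NLT, 6=NLE), which is why the cmplt_* and cmpneq_* wrappers above pass 1
+ * and 4. Illustrative use (variable names are only examples):
+ *   __mmask64 lt = _mm512_cmp_epi8_mask(a, b, 1);  // bit i set iff a[i] < b[i]
+ */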
+
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
diff --git a/25.0.2/clang-include/avx512cdintrin.h b/25.0.2/clang-include/avx512cdintrin.h
new file mode 100644
index 0000000..3894b29
--- /dev/null
+++ b/25.0.2/clang-include/avx512cdintrin.h
@@ -0,0 +1,131 @@
+/*===------------- avx512cdintrin.h - AVX512CD intrinsics ------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512cdintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512CDINTRIN_H
+#define __AVX512CDINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512cd")))
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_conflict_epi64 (__m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A,
+                 (__v8di) _mm512_setzero_si512 (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_conflict_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A,
+               (__v8di) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_conflict_epi64 (__mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A,
+                 (__v8di) _mm512_setzero_si512 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_conflict_epi32 (__m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A,
+                 (__v16si) _mm512_setzero_si512 (),
+                 (__mmask16) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_conflict_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A,
+               (__v16si) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_conflict_epi32 (__mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A,
+                 (__v16si) _mm512_setzero_si512 (),
+                 (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_lzcnt_epi32 (__m512i __A)
+{
+  return (__m512i) __builtin_ia32_vplzcntd_512_mask ((__v16si) __A,
+             (__v16si) _mm512_setzero_si512 (),
+             (__mmask16) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_lzcnt_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vplzcntd_512_mask ((__v16si) __A,
+                 (__v16si) __W,
+                 (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_lzcnt_epi32 (__mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vplzcntd_512_mask ((__v16si) __A,
+             (__v16si) _mm512_setzero_si512 (),
+             (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_lzcnt_epi64 (__m512i __A)
+{
+  return (__m512i) __builtin_ia32_vplzcntq_512_mask ((__v8di) __A,
+             (__v8di) _mm512_setzero_si512 (),
+             (__mmask8) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_lzcnt_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vplzcntq_512_mask ((__v8di) __A,
+                 (__v8di) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_lzcnt_epi64 (__mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vplzcntq_512_mask ((__v8di) __A,
+             (__v8di) _mm512_setzero_si512 (),
+             (__mmask8) __U);
+}
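+
+/* conflict_epi32/epi64 report, for each element, a bit mask of the
+ * lower-indexed elements that hold the same value; lzcnt_epi32/epi64 count the
+ * leading zero bits of each element. */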
+#undef __DEFAULT_FN_ATTRS
+
+#endif
diff --git a/25.0.2/clang-include/avx512dqintrin.h b/25.0.2/clang-include/avx512dqintrin.h
new file mode 100644
index 0000000..afee490
--- /dev/null
+++ b/25.0.2/clang-include/avx512dqintrin.h
@@ -0,0 +1,778 @@
+/*===---- avx512dqintrin.h - AVX512DQ intrinsics ---------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512dqintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512DQINTRIN_H
+#define __AVX512DQINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512dq")))
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mullo_epi64 (__m512i __A, __m512i __B) {
+  return (__m512i) ((__v8di) __A * (__v8di) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_mullo_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) {
+  return (__m512i) __builtin_ia32_pmullq512_mask ((__v8di) __A,
+              (__v8di) __B,
+              (__v8di) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_mullo_epi64 (__mmask8 __U, __m512i __A, __m512i __B) {
+  return (__m512i) __builtin_ia32_pmullq512_mask ((__v8di) __A,
+              (__v8di) __B,
+              (__v8di)
+              _mm512_setzero_si512 (),
+              (__mmask8) __U);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_xor_pd (__m512d __A, __m512d __B) {
+  return (__m512d) ((__v8di) __A ^ (__v8di) __B);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_xor_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d) __builtin_ia32_xorpd512_mask ((__v8df) __A,
+             (__v8df) __B,
+             (__v8df) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_xor_pd (__mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d) __builtin_ia32_xorpd512_mask ((__v8df) __A,
+             (__v8df) __B,
+             (__v8df)
+             _mm512_setzero_pd (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_xor_ps (__m512 __A, __m512 __B) {
+  return (__m512) ((__v16si) __A ^ (__v16si) __B);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_xor_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512) __builtin_ia32_xorps512_mask ((__v16sf) __A,
+            (__v16sf) __B,
+            (__v16sf) __W,
+            (__mmask16) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_xor_ps (__mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512) __builtin_ia32_xorps512_mask ((__v16sf) __A,
+            (__v16sf) __B,
+            (__v16sf)
+            _mm512_setzero_ps (),
+            (__mmask16) __U);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_or_pd (__m512d __A, __m512d __B) {
+  return (__m512d) ((__v8di) __A | (__v8di) __B);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_or_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d) __builtin_ia32_orpd512_mask ((__v8df) __A,
+            (__v8df) __B,
+            (__v8df) __W,
+            (__mmask8) __U);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_or_pd (__mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d) __builtin_ia32_orpd512_mask ((__v8df) __A,
+            (__v8df) __B,
+            (__v8df)
+            _mm512_setzero_pd (),
+            (__mmask8) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_or_ps (__m512 __A, __m512 __B) {
+  return (__m512) ((__v16si) __A | (__v16si) __B);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_or_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512) __builtin_ia32_orps512_mask ((__v16sf) __A,
+                 (__v16sf) __B,
+                 (__v16sf) __W,
+                 (__mmask16) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_or_ps (__mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512) __builtin_ia32_orps512_mask ((__v16sf) __A,
+                 (__v16sf) __B,
+                 (__v16sf)
+                 _mm512_setzero_ps (),
+                 (__mmask16) __U);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_and_pd (__m512d __A, __m512d __B) {
+  return (__m512d) ((__v8di) __A & (__v8di) __B);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_and_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d) __builtin_ia32_andpd512_mask ((__v8df) __A,
+             (__v8df) __B,
+             (__v8df) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_and_pd (__mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d) __builtin_ia32_andpd512_mask ((__v8df) __A,
+             (__v8df) __B,
+             (__v8df)
+             _mm512_setzero_pd (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_and_ps (__m512 __A, __m512 __B) {
+  return (__m512) ((__v16si) __A & (__v16si) __B);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_and_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512) __builtin_ia32_andps512_mask ((__v16sf) __A,
+            (__v16sf) __B,
+            (__v16sf) __W,
+            (__mmask16) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_and_ps (__mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512) __builtin_ia32_andps512_mask ((__v16sf) __A,
+            (__v16sf) __B,
+            (__v16sf)
+            _mm512_setzero_ps (),
+            (__mmask16) __U);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_andnot_pd (__m512d __A, __m512d __B) {
+  return (__m512d) __builtin_ia32_andnpd512_mask ((__v8df) __A,
+              (__v8df) __B,
+              (__v8df)
+              _mm512_setzero_pd (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_andnot_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d) __builtin_ia32_andnpd512_mask ((__v8df) __A,
+              (__v8df) __B,
+              (__v8df) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_andnot_pd (__mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d) __builtin_ia32_andnpd512_mask ((__v8df) __A,
+              (__v8df) __B,
+              (__v8df)
+              _mm512_setzero_pd (),
+              (__mmask8) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_andnot_ps (__m512 __A, __m512 __B) {
+  return (__m512) __builtin_ia32_andnps512_mask ((__v16sf) __A,
+             (__v16sf) __B,
+             (__v16sf)
+             _mm512_setzero_ps (),
+             (__mmask16) -1);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_andnot_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512) __builtin_ia32_andnps512_mask ((__v16sf) __A,
+             (__v16sf) __B,
+             (__v16sf) __W,
+             (__mmask16) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_andnot_ps (__mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512) __builtin_ia32_andnps512_mask ((__v16sf) __A,
+             (__v16sf) __B,
+             (__v16sf)
+             _mm512_setzero_ps (),
+             (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvtpd_epi64 (__m512d __A) {
+  return (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A,
+                (__v8di) _mm512_setzero_si512(),
+                (__mmask8) -1,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtpd_epi64 (__m512i __W, __mmask8 __U, __m512d __A) {
+  return (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A,
+                (__v8di) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtpd_epi64 (__mmask8 __U, __m512d __A) {
+  return (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A,
+                (__v8di) _mm512_setzero_si512(),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundpd_epi64(__A, __R) __extension__ ({              \
+  (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A,               \
+                (__v8di) _mm512_setzero_si512(), (__mmask8) -1, __R);})
+
+#define _mm512_mask_cvt_roundpd_epi64(__W, __U, __A, __R) __extension__ ({ \
+  (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A,                 \
+                (__v8di) __W, (__mmask8) __U, __R);})
+
+#define _mm512_maskz_cvt_roundpd_epi64(__U, __A, __R) __extension__ ({   \
+  (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A,        \
+                (__v8di) _mm512_setzero_si512(), (__mmask8) __U, __R); })
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvtpd_epu64 (__m512d __A) {
+  return (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A,
+                 (__v8di) _mm512_setzero_si512(),
+                 (__mmask8) -1,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtpd_epu64 (__m512i __W, __mmask8 __U, __m512d __A) {
+  return (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A,
+                 (__v8di) __W,
+                 (__mmask8) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtpd_epu64 (__mmask8 __U, __m512d __A) {
+  return (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A,
+                 (__v8di) _mm512_setzero_si512(),
+                 (__mmask8) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundpd_epu64(__A, __R) __extension__ ({               \
+  (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A,               \
+                 (__v8di) _mm512_setzero_si512(), (__mmask8) -1, __R);})
+
+#define _mm512_mask_cvt_roundpd_epu64(__W, __U, __A, __R) __extension__ ({ \
+  (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A,                \
+                 (__v8di) __W, (__mmask8) __U, __R);})
+
+#define _mm512_maskz_cvt_roundpd_epu64(__U, __A, __R) __extension__ ({     \
+  (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A,                \
+                 (__v8di) _mm512_setzero_si512(), (__mmask8) __U, __R);})
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvtps_epi64 (__m256 __A) {
+  return (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A,
+                (__v8di) _mm512_setzero_si512(),
+                (__mmask8) -1,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtps_epi64 (__m512i __W, __mmask8 __U, __m256 __A) {
+  return (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A,
+                (__v8di) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtps_epi64 (__mmask8 __U, __m256 __A) {
+  return (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A,
+                (__v8di) _mm512_setzero_si512(),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundps_epi64(__A, __R) __extension__ ({             \
+  (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A,              \
+                (__v8di) _mm512_setzero_si512(), (__mmask8) -1, __R);})
+
+#define _mm512_mask_cvt_roundps_epi64(__W, __U, __A, __R) __extension__ ({ \
+  (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A,                 \
+                (__v8di) __W, (__mmask8) __U, __R);})
+
+#define _mm512_maskz_cvt_roundps_epi64(__U, __A, __R) __extension__ ({   \
+  (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A,               \
+                (__v8di) _mm512_setzero_si512(), (__mmask8) __U, __R);})
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvtps_epu64 (__m256 __A) {
+  return (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A,
+                 (__v8di) _mm512_setzero_si512(),
+                 (__mmask8) -1,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtps_epu64 (__m512i __W, __mmask8 __U, __m256 __A) {
+  return (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A,
+                 (__v8di) __W,
+                 (__mmask8) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtps_epu64 (__mmask8 __U, __m256 __A) {
+  return (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A,
+                 (__v8di) _mm512_setzero_si512(),
+                 (__mmask8) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundps_epu64(__A, __R) __extension__ ({              \
+  (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A,              \
+                 (__v8di) _mm512_setzero_si512(), (__mmask8) -1, __R);})
+
+#define _mm512_mask_cvt_roundps_epu64(__W, __U, __A, __R) __extension__ ({ \
+  (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A,                \
+                 (__v8di) __W, (__mmask8) __U, __R);})
+
+#define _mm512_maskz_cvt_roundps_epu64(__U, __A, __R) __extension__ ({   \
+  (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A,              \
+                 (__v8di) _mm512_setzero_si512(), (__mmask8) __U, __R);})
+
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_cvtepi64_pd (__m512i __A) {
+  return (__m512d) __builtin_ia32_cvtqq2pd512_mask ((__v8di) __A,
+                (__v8df) _mm512_setzero_pd(),
+                (__mmask8) -1,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi64_pd (__m512d __W, __mmask8 __U, __m512i __A) {
+  return (__m512d) __builtin_ia32_cvtqq2pd512_mask ((__v8di) __A,
+                (__v8df) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepi64_pd (__mmask8 __U, __m512i __A) {
+  return (__m512d) __builtin_ia32_cvtqq2pd512_mask ((__v8di) __A,
+                (__v8df) _mm512_setzero_pd(),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundepi64_pd(__A, __R) __extension__ ({          \
+  (__m512d) __builtin_ia32_cvtqq2pd512_mask ((__v8di) __A,           \
+                (__v8df) _mm512_setzero_pd(), (__mmask8) -1, __R);})
+
+#define _mm512_mask_cvt_roundepi64_pd(__W, __U, __A, __R) __extension__ ({ \
+  (__m512d) __builtin_ia32_cvtqq2pd512_mask ((__v8di) __A,                 \
+                (__v8df) __W, (__mmask8) __U, __R);})
+
+#define _mm512_maskz_cvt_roundepi64_pd(__U, __A, __R) __extension__ ({ \
+  (__m512d) __builtin_ia32_cvtqq2pd512_mask ((__v8di) __A,             \
+                (__v8df) _mm512_setzero_pd(), (__mmask8) __U, __R);})
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm512_cvtepi64_ps (__m512i __A) {
+  return (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A,
+               (__v8sf) _mm256_setzero_ps(),
+               (__mmask8) -1,
+               _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi64_ps (__m256 __W, __mmask8 __U, __m512i __A) {
+  return (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A,
+               (__v8sf) __W,
+               (__mmask8) __U,
+               _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepi64_ps (__mmask8 __U, __m512i __A) {
+  return (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A,
+               (__v8sf) _mm256_setzero_ps(),
+               (__mmask8) __U,
+               _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundepi64_ps(__A, __R) __extension__ ({        \
+  (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A,          \
+               (__v8sf) _mm256_setzero_ps(), (__mmask8) -1, __R);})
+
+#define _mm512_mask_cvt_roundepi64_ps(__W, __U, __A, __R) __extension__ ({ \
+  (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A,                  \
+               (__v8sf) __W, (__mmask8) __U, __R);})
+
+#define _mm512_maskz_cvt_roundepi64_ps(__U, __A, __R) __extension__ ({ \
+  (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A,              \
+               (__v8sf) _mm256_setzero_ps(), (__mmask8) __U, __R);})
+
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvttpd_epi64 (__m512d __A) {
+  return (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A,
+                 (__v8di) _mm512_setzero_si512(),
+                 (__mmask8) -1,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvttpd_epi64 (__m512i __W, __mmask8 __U, __m512d __A) {
+  return (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A,
+                 (__v8di) __W,
+                 (__mmask8) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvttpd_epi64 (__mmask8 __U, __m512d __A) {
+  return (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A,
+                 (__v8di) _mm512_setzero_si512(),
+                 (__mmask8) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvtt_roundpd_epi64(__A, __R) __extension__ ({             \
+  (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A,              \
+                 (__v8di) _mm512_setzero_si512(), (__mmask8) -1, __R);})
+
+#define _mm512_mask_cvtt_roundpd_epi64(__W, __U, __A, __R) __extension__ ({ \
+  (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A,                 \
+                 (__v8di) __W, (__mmask8) __U, __R);})
+
+#define _mm512_maskz_cvtt_roundpd_epi64(__U, __A, __R) __extension__ ({ \
+  (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A,             \
+                 (__v8di) _mm512_setzero_si512(), (__mmask8) __U, __R);})
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvttpd_epu64 (__m512d __A) {
+  return (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A,
+                  (__v8di) _mm512_setzero_si512(),
+                  (__mmask8) -1,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvttpd_epu64 (__m512i __W, __mmask8 __U, __m512d __A) {
+  return (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A,
+                  (__v8di) __W,
+                  (__mmask8) __U,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvttpd_epu64 (__mmask8 __U, __m512d __A) {
+  return (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A,
+                  (__v8di) _mm512_setzero_si512(),
+                  (__mmask8) __U,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvtt_roundpd_epu64(__A, __R) __extension__ ({              \
+  (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A,              \
+                  (__v8di) _mm512_setzero_si512(), (__mmask8) -1, __R);})
+
+#define _mm512_mask_cvtt_roundpd_epu64(__W, __U, __A, __R) __extension__ ({ \
+  (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A,                \
+                  (__v8di) __W, (__mmask8) __U, __R);})
+
+#define _mm512_maskz_cvtt_roundpd_epu64(__U, __A, __R) __extension__ ({   \
+  (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A,              \
+                  (__v8di) _mm512_setzero_si512(), (__mmask8) __U, __R);})
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvttps_epi64 (__m256 __A) {
+  return (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A,
+                 (__v8di) _mm512_setzero_si512(),
+                 (__mmask8) -1,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvttps_epi64 (__m512i __W, __mmask8 __U, __m256 __A) {
+  return (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A,
+                 (__v8di) __W,
+                 (__mmask8) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvttps_epi64 (__mmask8 __U, __m256 __A) {
+  return (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A,
+                 (__v8di) _mm512_setzero_si512(),
+                 (__mmask8) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvtt_roundps_epi64(__A, __R) __extension__ ({            \
+  (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A,             \
+                 (__v8di) _mm512_setzero_si512(), (__mmask8) -1, __R);})
+
+#define _mm512_mask_cvtt_roundps_epi64(__W, __U, __A, __R) __extension__ ({ \
+  (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A,                 \
+                 (__v8di) __W, (__mmask8) __U, __R);})
+
+#define _mm512_maskz_cvtt_roundps_epi64(__U, __A, __R) __extension__ ({  \
+  (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A,              \
+                 (__v8di) _mm512_setzero_si512(), (__mmask8) __U, __R);})
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvttps_epu64 (__m256 __A) {
+  return (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A,
+                  (__v8di) _mm512_setzero_si512(),
+                  (__mmask8) -1,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvttps_epu64 (__m512i __W, __mmask8 __U, __m256 __A) {
+  return (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A,
+                  (__v8di) __W,
+                  (__mmask8) __U,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvttps_epu64 (__mmask8 __U, __m256 __A) {
+  return (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A,
+                  (__v8di) _mm512_setzero_si512(),
+                  (__mmask8) __U,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvtt_roundps_epu64(__A, __R) __extension__ ({            \
+  (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A,            \
+                  (__v8di) _mm512_setzero_si512(), (__mmask8) -1, __R);})
+
+#define _mm512_mask_cvtt_roundps_epu64(__W, __U, __A, __R) __extension__ ({ \
+  (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A,                \
+                  (__v8di) __W, (__mmask8) __U, __R);})
+
+#define _mm512_maskz_cvtt_roundps_epu64(__U, __A, __R) __extension__ ({  \
+  (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A,             \
+                  (__v8di) _mm512_setzero_si512(), (__mmask8) __U, __R);})
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_cvtepu64_pd (__m512i __A) {
+  return (__m512d) __builtin_ia32_cvtuqq2pd512_mask ((__v8di) __A,
+                 (__v8df) _mm512_setzero_pd(),
+                 (__mmask8) -1,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepu64_pd (__m512d __W, __mmask8 __U, __m512i __A) {
+  return (__m512d) __builtin_ia32_cvtuqq2pd512_mask ((__v8di) __A,
+                 (__v8df) __W,
+                 (__mmask8) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepu64_pd (__mmask8 __U, __m512i __A) {
+  return (__m512d) __builtin_ia32_cvtuqq2pd512_mask ((__v8di) __A,
+                 (__v8df) _mm512_setzero_pd(),
+                 (__mmask8) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundepu64_pd(__A, __R) __extension__ ({          \
+  (__m512d) __builtin_ia32_cvtuqq2pd512_mask ((__v8di) __A,          \
+                 (__v8df) _mm512_setzero_pd(), (__mmask8) -1, __R);})
+
+#define _mm512_mask_cvt_roundepu64_pd(__W, __U, __A, __R) __extension__ ({ \
+  (__m512d) __builtin_ia32_cvtuqq2pd512_mask ((__v8di) __A,                \
+                 (__v8df) __W, (__mmask8) __U, __R);})
+
+
+#define _mm512_maskz_cvt_roundepu64_pd(__U, __A, __R) __extension__ ({ \
+  (__m512d) __builtin_ia32_cvtuqq2pd512_mask ((__v8di) __A,            \
+                 (__v8df) _mm512_setzero_pd(), (__mmask8) __U, __R);})
+
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm512_cvtepu64_ps (__m512i __A) {
+  return (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A,
+                (__v8sf) _mm256_setzero_ps(),
+                (__mmask8) -1,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepu64_ps (__m256 __W, __mmask8 __U, __m512i __A) {
+  return (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A,
+                (__v8sf) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepu64_ps (__mmask8 __U, __m512i __A) {
+  return (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A,
+                (__v8sf) _mm256_setzero_ps(),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundepu64_ps(__A, __R) __extension__ ({         \
+  (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A,          \
+                (__v8sf) _mm256_setzero_ps(), (__mmask8) -1, __R);})
+
+#define _mm512_mask_cvt_roundepu64_ps(__W, __U, __A, __R) __extension__ ({ \
+  (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A,                 \
+                (__v8sf) __W, (__mmask8) __U, __R);})
+
+#define _mm512_maskz_cvt_roundepu64_ps(__U, __A, __R) __extension__ ({ \
+  (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A,             \
+                (__v8sf) _mm256_setzero_ps(), (__mmask8) __U, __R);})
+
+#define _mm512_range_pd(__A, __B, __C) __extension__ ({                     \
+  (__m512d) __builtin_ia32_rangepd512_mask ((__v8df) __A, (__v8df) __B, __C,\
+               (__v8df) _mm512_setzero_pd(), (__mmask8) -1,                 \
+               _MM_FROUND_CUR_DIRECTION);})
+
+#define _mm512_mask_range_pd(__W, __U, __A, __B, __C) __extension__ ({      \
+  (__m512d) __builtin_ia32_rangepd512_mask ((__v8df) __A, (__v8df) __B, __C,\
+               (__v8df) __W, (__mmask8) __U, _MM_FROUND_CUR_DIRECTION);})
+
+#define _mm512_maskz_range_pd(__U, __A, __B, __C) __extension__ ({           \
+  (__m512d) __builtin_ia32_rangepd512_mask ((__v8df) __A, (__v8df) __B, __C, \
+               (__v8df) _mm512_setzero_pd(), (__mmask8) __U,                 \
+               _MM_FROUND_CUR_DIRECTION);})
+
+#define _mm512_range_round_pd(__A, __B, __C, __R) __extension__ ({           \
+  (__m512d) __builtin_ia32_rangepd512_mask ((__v8df) __A, (__v8df) __B, __C, \
+               (__v8df) _mm512_setzero_pd(), (__mmask8) -1, __R);})
+
+#define _mm512_mask_range_round_pd(__W, __U, __A, __B, __C, __R) __extension__ ({ \
+  (__m512d) __builtin_ia32_rangepd512_mask ((__v8df) __A, (__v8df) __B, __C,      \
+               (__v8df) __W, (__mmask8) __U, __R);})
+
+#define _mm512_maskz_range_round_pd(__U, __A, __B, __C, __R) __extension__ ({ \
+  (__m512d) __builtin_ia32_rangepd512_mask ((__v8df) __A, (__v8df) __B, __C,   \
+               (__v8df) _mm512_setzero_pd(), (__mmask8) __U, __R);})
+
+#define _mm512_range_ps(__A, __B, __C) __extension__ ({                       \
+  (__m512) __builtin_ia32_rangeps512_mask ((__v16sf) __A, (__v16sf) __B, __C, \
+               (__v16sf) _mm512_setzero_ps(), (__mmask16) -1,                 \
+               _MM_FROUND_CUR_DIRECTION);})
+
+#define _mm512_mask_range_ps(__W, __U, __A, __B, __C) __extension__ ({         \
+  (__m512) __builtin_ia32_rangeps512_mask ((__v16sf) __A, (__v16sf) __B,       \
+               __C, (__v16sf) __W, (__mmask16) __U, _MM_FROUND_CUR_DIRECTION);})
+
+#define _mm512_maskz_range_ps(__U, __A, __B, __C) __extension__ ({      \
+  (__m512) __builtin_ia32_rangeps512_mask ((__v16sf) __A,(__v16sf) __B, \
+              __C, (__v16sf) _mm512_setzero_ps(), (__mmask16) __U,      \
+              _MM_FROUND_CUR_DIRECTION);})
+
+#define _mm512_range_round_ps(__A, __B, __C, __R) __extension__ ({         \
+  (__m512) __builtin_ia32_rangeps512_mask ((__v16sf) __A, (__v16sf) __B,   \
+                __C, (__v16sf) _mm512_setzero_ps(), (__mmask16) -1, __R);})
+
+#define _mm512_mask_range_round_ps(__W, __U, __A, __B, __C, __R) __extension__ ({ \
+  (__m512) __builtin_ia32_rangeps512_mask ((__v16sf) __A, (__v16sf) __B,          \
+                __C, (__v16sf) __W, (__mmask16) __U, __R);})
+
+#define _mm512_maskz_range_round_ps(__U, __A, __B, __C, __R) __extension__ ({ \
+  (__m512) __builtin_ia32_rangeps512_mask ((__v16sf) __A, (__v16sf) __B,      \
+                __C, (__v16sf) _mm512_setzero_ps(), (__mmask16) __U, __R);})
+
+#define _mm512_reduce_pd(__A, __B) __extension__ ({             \
+  (__m512d) __builtin_ia32_reducepd512_mask ((__v8df) __A, __B, \
+                (__v8df) _mm512_setzero_pd(), (__mmask8) -1, _MM_FROUND_CUR_DIRECTION);})
+
+#define _mm512_mask_reduce_pd(__W, __U, __A, __B) __extension__ ({ \
+  (__m512d) __builtin_ia32_reducepd512_mask ((__v8df) __A, __B,    \
+                (__v8df) __W,(__mmask8) __U, _MM_FROUND_CUR_DIRECTION);})
+
+#define _mm512_maskz_reduce_pd(__U, __A, __B) __extension__ ({  \
+  (__m512d) __builtin_ia32_reducepd512_mask ((__v8df) __A, __B, \
+                (__v8df) _mm512_setzero_pd(), (__mmask8) __U, _MM_FROUND_CUR_DIRECTION);})
+
+#define _mm512_reduce_ps(__A, __B) __extension__ ({              \
+  (__m512) __builtin_ia32_reduceps512_mask ((__v16sf) __A, __B,  \
+               (__v16sf) _mm512_setzero_ps(), (__mmask16) -1, _MM_FROUND_CUR_DIRECTION);})
+
+#define _mm512_mask_reduce_ps(__W, __U, __A, __B) __extension__ ({   \
+  (__m512) __builtin_ia32_reduceps512_mask ((__v16sf) __A, __B,      \
+               (__v16sf) __W, (__mmask16) __U, _MM_FROUND_CUR_DIRECTION);})
+
+#define _mm512_maskz_reduce_ps(__U, __A, __B) __extension__ ({       \
+  (__m512) __builtin_ia32_reduceps512_mask ((__v16sf) __A, __B,      \
+               (__v16sf) _mm512_setzero_ps(), (__mmask16) __U, _MM_FROUND_CUR_DIRECTION);})
+
+#define _mm512_reduce_round_pd(__A, __B, __R) __extension__ ({\
+  (__m512d) __builtin_ia32_reducepd512_mask ((__v8df) __A, __B, \
+                (__v8df) _mm512_setzero_pd(), (__mmask8) -1, __R);})
+
+#define _mm512_mask_reduce_round_pd(__W, __U, __A, __B, __R) __extension__ ({\
+  (__m512d) __builtin_ia32_reducepd512_mask ((__v8df) __A, __B,    \
+                (__v8df) __W,(__mmask8) __U, __R);})
+
+#define _mm512_maskz_reduce_round_pd(__U, __A, __B, __R) __extension__ ({\
+  (__m512d) __builtin_ia32_reducepd512_mask ((__v8df) __A, __B, \
+                (__v8df) _mm512_setzero_pd(), (__mmask8) __U, __R);})
+
+#define _mm512_reduce_round_ps(__A, __B, __R) __extension__ ({\
+  (__m512) __builtin_ia32_reduceps512_mask ((__v16sf) __A, __B,  \
+               (__v16sf) _mm512_setzero_ps(), (__mmask16) -1, __R);})
+
+#define _mm512_mask_reduce_round_ps(__W, __U, __A, __B, __R) __extension__ ({\
+  (__m512) __builtin_ia32_reduceps512_mask ((__v16sf) __A, __B,      \
+               (__v16sf) __W, (__mmask16) __U, __R);})
+
+#define _mm512_maskz_reduce_round_ps(__U, __A, __B, __R) __extension__ ({\
+  (__m512) __builtin_ia32_reduceps512_mask ((__v16sf) __A, __B,      \
+               (__v16sf) _mm512_setzero_ps(), (__mmask16) __U, __R);})
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
diff --git a/25.0.2/clang-include/avx512erintrin.h b/25.0.2/clang-include/avx512erintrin.h
new file mode 100644
index 0000000..40a9121
--- /dev/null
+++ b/25.0.2/clang-include/avx512erintrin.h
@@ -0,0 +1,285 @@
+/*===---- avx512erintrin.h - AVX512ER intrinsics ---------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512erintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512ERINTRIN_H
+#define __AVX512ERINTRIN_H
+
+// exp2a23
+#define _mm512_exp2a23_round_pd(A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
+                                      (__v8df)_mm512_setzero_pd(), \
+                                      (__mmask8)-1, (R)); })
+
+#define _mm512_mask_exp2a23_round_pd(S, M, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
+                                      (__v8df)(__m512d)(S), \
+                                      (__mmask8)(M), (R)); })
+
+#define _mm512_maskz_exp2a23_round_pd(M, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
+                                      (__v8df)_mm512_setzero_pd(), \
+                                      (__mmask8)(M), (R)); })
+
+#define _mm512_exp2a23_pd(A) \
+   _mm512_exp2a23_round_pd((A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_exp2a23_pd(S, M, A) \
+   _mm512_mask_exp2a23_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_exp2a23_pd(M, A) \
+   _mm512_maskz_exp2a23_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_exp2a23_round_ps(A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
+                                     (__v16sf)_mm512_setzero_ps(), \
+                                     (__mmask16)-1, (R)); })
+
+#define _mm512_mask_exp2a23_round_ps(S, M, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
+                                     (__v16sf)(__m512)(S), \
+                                     (__mmask16)(M), (R)); })
+
+#define _mm512_maskz_exp2a23_round_ps(M, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
+                                     (__v16sf)_mm512_setzero_ps(), \
+                                     (__mmask16)(M), (R)); })
+
+#define _mm512_exp2a23_ps(A) \
+   _mm512_exp2a23_round_ps((A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_exp2a23_ps(S, M, A) \
+   _mm512_mask_exp2a23_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_exp2a23_ps(M, A) \
+   _mm512_maskz_exp2a23_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
+
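+/* Illustrative usage sketch, assuming an AVX512ER-enabled target reached
+ * through <immintrin.h>: exp2a23 approximates 2^x with roughly 23 bits of
+ * relative accuracy, e.g.
+ *
+ *   __m512 x = _mm512_set1_ps(3.0f);
+ *   __m512 y = _mm512_exp2a23_ps(x);                // ~8.0f in every lane
+ *   __m512 z = _mm512_maskz_exp2a23_ps(0x00FF, x);  // upper 8 lanes zeroed
+ */
+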
+// rsqrt28
+#define _mm512_rsqrt28_round_pd(A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
+                                         (__v8df)_mm512_setzero_pd(), \
+                                         (__mmask8)-1, (R)); })
+
+#define _mm512_mask_rsqrt28_round_pd(S, M, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
+                                         (__v8df)(__m512d)(S), \
+                                         (__mmask8)(M), (R)); })
+
+#define _mm512_maskz_rsqrt28_round_pd(M, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
+                                         (__v8df)_mm512_setzero_pd(), \
+                                         (__mmask8)(M), (R)); })
+
+#define _mm512_rsqrt28_pd(A) \
+  _mm512_rsqrt28_round_pd((A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_rsqrt28_pd(S, M, A) \
+  _mm512_mask_rsqrt28_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_rsqrt28_pd(M, A) \
+  _mm512_maskz_rsqrt28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_rsqrt28_round_ps(A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
+                                        (__v16sf)_mm512_setzero_ps(), \
+                                        (__mmask16)-1, (R)); })
+
+#define _mm512_mask_rsqrt28_round_ps(S, M, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
+                                        (__v16sf)(__m512)(S), \
+                                        (__mmask16)(M), (R)); })
+
+#define _mm512_maskz_rsqrt28_round_ps(M, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
+                                        (__v16sf)_mm512_setzero_ps(), \
+                                        (__mmask16)(M), (R)); })
+
+#define _mm512_rsqrt28_ps(A) \
+  _mm512_rsqrt28_round_ps((A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_rsqrt28_ps(S, M, A) \
+  _mm512_mask_rsqrt28_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_rsqrt28_ps(M, A) \
+  _mm512_maskz_rsqrt28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_rsqrt28_round_ss(A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_rsqrt28ss_round((__v4sf)(__m128)(A), \
+                                        (__v4sf)(__m128)(B), \
+                                        (__v4sf)_mm_setzero_ps(), \
+                                        (__mmask8)-1, (R)); })
+
+#define _mm_mask_rsqrt28_round_ss(S, M, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_rsqrt28ss_round((__v4sf)(__m128)(A), \
+                                        (__v4sf)(__m128)(B), \
+                                        (__v4sf)(__m128)(S), \
+                                        (__mmask8)(M), (R)); })
+
+#define _mm_maskz_rsqrt28_round_ss(M, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_rsqrt28ss_round((__v4sf)(__m128)(A), \
+                                        (__v4sf)(__m128)(B), \
+                                        (__v4sf)_mm_setzero_ps(), \
+                                        (__mmask8)(M), (R)); })
+
+#define _mm_rsqrt28_ss(A, B) \
+  _mm_rsqrt28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_mask_rsqrt28_ss(S, M, A, B) \
+  _mm_mask_rsqrt28_round_ss((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_rsqrt28_ss(M, A, B) \
+  _mm_maskz_rsqrt28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_rsqrt28_round_sd(A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_rsqrt28sd_round((__v2df)(__m128d)(A), \
+                                         (__v2df)(__m128d)(B), \
+                                         (__v2df)_mm_setzero_pd(), \
+                                         (__mmask8)-1, (R)); })
+
+#define _mm_mask_rsqrt28_round_sd(S, M, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_rsqrt28sd_round((__v2df)(__m128d)(A), \
+                                         (__v2df)(__m128d)(B), \
+                                         (__v2df)(__m128d)(S), \
+                                         (__mmask8)(M), (R)); })
+
+#define _mm_maskz_rsqrt28_round_sd(M, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_rsqrt28sd_round((__v2df)(__m128d)(A), \
+                                         (__v2df)(__m128d)(B), \
+                                         (__v2df)_mm_setzero_pd(), \
+                                         (__mmask8)(M), (R)); })
+
+#define _mm_rsqrt28_sd(A, B) \
+  _mm_rsqrt28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_mask_rsqrt28_sd(S, M, A, B) \
+  _mm_mask_rsqrt28_round_sd((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_rsqrt28_sd(M, A, B) \
+  _mm_maskz_rsqrt28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+// rcp28
+#define _mm512_rcp28_round_pd(A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
+                                       (__v8df)_mm512_setzero_pd(), \
+                                       (__mmask8)-1, (R)); })
+
+#define _mm512_mask_rcp28_round_pd(S, M, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
+                                       (__v8df)(__m512d)(S), \
+                                       (__mmask8)(M), (R)); })
+
+#define _mm512_maskz_rcp28_round_pd(M, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
+                                       (__v8df)_mm512_setzero_pd(), \
+                                       (__mmask8)(M), (R)); })
+
+#define _mm512_rcp28_pd(A) \
+  _mm512_rcp28_round_pd((A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_rcp28_pd(S, M, A) \
+  _mm512_mask_rcp28_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_rcp28_pd(M, A) \
+  _mm512_maskz_rcp28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_rcp28_round_ps(A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
+                                      (__v16sf)_mm512_setzero_ps(), \
+                                      (__mmask16)-1, (R)); })
+
+#define _mm512_mask_rcp28_round_ps(S, M, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
+                                      (__v16sf)(__m512)(S), \
+                                      (__mmask16)(M), (R)); })
+
+#define _mm512_maskz_rcp28_round_ps(M, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
+                                      (__v16sf)_mm512_setzero_ps(), \
+                                      (__mmask16)(M), (R)); })
+
+#define _mm512_rcp28_ps(A) \
+  _mm512_rcp28_round_ps((A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_rcp28_ps(S, M, A) \
+  _mm512_mask_rcp28_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_rcp28_ps(M, A) \
+  _mm512_maskz_rcp28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_rcp28_round_ss(A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_rcp28ss_round((__v4sf)(__m128)(A), \
+                                      (__v4sf)(__m128)(B), \
+                                      (__v4sf)_mm_setzero_ps(), \
+                                      (__mmask8)-1, (R)); })
+
+#define _mm_mask_rcp28_round_ss(S, M, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_rcp28ss_round((__v4sf)(__m128)(A), \
+                                      (__v4sf)(__m128)(B), \
+                                      (__v4sf)(__m128)(S), \
+                                      (__mmask8)(M), (R)); })
+
+#define _mm_maskz_rcp28_round_ss(M, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_rcp28ss_round((__v4sf)(__m128)(A), \
+                                      (__v4sf)(__m128)(B), \
+                                      (__v4sf)_mm_setzero_ps(), \
+                                      (__mmask8)(M), (R)); })
+
+#define _mm_rcp28_ss(A, B) \
+  _mm_rcp28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_mask_rcp28_ss(S, M, A, B) \
+  _mm_mask_rcp28_round_ss((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_rcp28_ss(M, A, B) \
+  _mm_maskz_rcp28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_rcp28_round_sd(A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_rcp28sd_round((__v2df)(__m128d)(A), \
+                                       (__v2df)(__m128d)(B), \
+                                       (__v2df)_mm_setzero_pd(), \
+                                       (__mmask8)-1, (R)); })
+
+#define _mm_mask_rcp28_round_sd(S, M, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_rcp28sd_round((__v2df)(__m128d)(A), \
+                                       (__v2df)(__m128d)(B), \
+                                       (__v2df)(__m128d)(S), \
+                                       (__mmask8)(M), (R)); })
+
+#define _mm_maskz_rcp28_round_sd(M, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_rcp28sd_round((__v2df)(__m128d)(A), \
+                                       (__v2df)(__m128d)(B), \
+                                       (__v2df)_mm_setzero_pd(), \
+                                       (__mmask8)(M), (R)); })
+
+#define _mm_rcp28_sd(A, B) \
+  _mm_rcp28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_mask_rcp28_sd(S, M, A, B) \
+  _mm_mask_rcp28_round_sd((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_rcp28_sd(M, A, B) \
+  _mm_maskz_rcp28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#endif // __AVX512ERINTRIN_H
diff --git a/25.0.2/clang-include/avx512fintrin.h b/25.0.2/clang-include/avx512fintrin.h
new file mode 100644
index 0000000..8dcdc71
--- /dev/null
+++ b/25.0.2/clang-include/avx512fintrin.h
@@ -0,0 +1,3074 @@
+/*===---- avx512fintrin.h - AVX512F intrinsics -----------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512fintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512FINTRIN_H
+#define __AVX512FINTRIN_H
+
+typedef double __v8df __attribute__((__vector_size__(64)));
+typedef float __v16sf __attribute__((__vector_size__(64)));
+typedef long long __v8di __attribute__((__vector_size__(64)));
+typedef int __v16si __attribute__((__vector_size__(64)));
+
+typedef float __m512 __attribute__((__vector_size__(64)));
+typedef double __m512d __attribute__((__vector_size__(64)));
+typedef long long __m512i __attribute__((__vector_size__(64)));
+
+typedef unsigned char __mmask8;
+typedef unsigned short __mmask16;
+
+/* Rounding mode macros.  */
+#define _MM_FROUND_TO_NEAREST_INT   0x00
+#define _MM_FROUND_TO_NEG_INF       0x01
+#define _MM_FROUND_TO_POS_INF       0x02
+#define _MM_FROUND_TO_ZERO          0x03
+#define _MM_FROUND_CUR_DIRECTION    0x04
+
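+/* Illustrative note: the *_round_* intrinsics and macros in this file take one
+ * of the macros above as their final argument; _MM_FROUND_CUR_DIRECTION means
+ * "use the rounding mode currently set in MXCSR".  A possible call, assuming
+ * an AVX512F-enabled target:
+ *
+ *   __m128 m = _mm_max_round_ss(a, b, _MM_FROUND_CUR_DIRECTION);
+ */
+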
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512f")))
+
+/* Create vectors with repeated elements */
+
+static  __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_setzero_si512(void)
+{
+  return (__m512i)(__v8di){ 0, 0, 0, 0, 0, 0, 0, 0 };
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_undefined_pd()
+{
+  return (__m512d)__builtin_ia32_undef512();
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_undefined()
+{
+  return (__m512)__builtin_ia32_undef512();
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_undefined_ps()
+{
+  return (__m512)__builtin_ia32_undef512();
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_undefined_epi32()
+{
+  return (__m512i)__builtin_ia32_undef512();
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_set1_epi32(__mmask16 __M, int __A)
+{
+  return (__m512i) __builtin_ia32_pbroadcastd512_gpr_mask (__A,
+                 (__v16si)
+                 _mm512_setzero_si512 (),
+                 __M);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_set1_epi64(__mmask8 __M, long long __A)
+{
+#ifdef __x86_64__
+  return (__m512i) __builtin_ia32_pbroadcastq512_gpr_mask (__A,
+                 (__v8di)
+                 _mm512_setzero_si512 (),
+                 __M);
+#else
+  return (__m512i) __builtin_ia32_pbroadcastq512_mem_mask (__A,
+                 (__v8di)
+                 _mm512_setzero_si512 (),
+                 __M);
+#endif
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_setzero_ps(void)
+{
+  return (__m512){ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+                   0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
+}
+static  __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_setzero_pd(void)
+{
+  return (__m512d){ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_set1_ps(float __w)
+{
+  return (__m512){ __w, __w, __w, __w, __w, __w, __w, __w,
+                   __w, __w, __w, __w, __w, __w, __w, __w  };
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_set1_pd(double __w)
+{
+  return (__m512d){ __w, __w, __w, __w, __w, __w, __w, __w };
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_set1_epi32(int __s)
+{
+  return (__m512i)(__v16si){ __s, __s, __s, __s, __s, __s, __s, __s,
+                             __s, __s, __s, __s, __s, __s, __s, __s };
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_set1_epi64(long long __d)
+{
+  return (__m512i)(__v8di){ __d, __d, __d, __d, __d, __d, __d, __d };
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_broadcastss_ps(__m128 __X)
+{
+  float __f = __X[0];
+  return (__v16sf){ __f, __f, __f, __f,
+                    __f, __f, __f, __f,
+                    __f, __f, __f, __f,
+                    __f, __f, __f, __f };
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_broadcastsd_pd(__m128d __X)
+{
+  double __d = __X[0];
+  return (__v8df){ __d, __d, __d, __d,
+                   __d, __d, __d, __d };
+}
+
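+/* Illustrative usage sketch, assuming an AVX512F-enabled target reached
+ * through <immintrin.h> (_mm_set_ss comes from <xmmintrin.h>):
+ *
+ *   __m512  ones  = _mm512_set1_ps(1.0f);
+ *   __m512d zeros = _mm512_setzero_pd();
+ *   __m512  twos  = _mm512_broadcastss_ps(_mm_set_ss(2.0f));  // 2.0f in all 16 lanes
+ */
+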
+/* Cast between vector types */
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_castpd256_pd512(__m256d __a)
+{
+  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, -1, -1, -1, -1);
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_castps256_ps512(__m256 __a)
+{
+  return __builtin_shufflevector(__a, __a, 0,  1,  2,  3,  4,  5,  6,  7,
+                                          -1, -1, -1, -1, -1, -1, -1, -1);
+}
+
+static __inline __m128d __DEFAULT_FN_ATTRS
+_mm512_castpd512_pd128(__m512d __a)
+{
+  return __builtin_shufflevector(__a, __a, 0, 1);
+}
+
+static __inline __m128 __DEFAULT_FN_ATTRS
+_mm512_castps512_ps128(__m512 __a)
+{
+  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3);
+}
+
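+/* Illustrative note: the widening casts above use -1 shuffle indices, so the
+ * upper elements of the 512-bit result are undefined; only the low part is
+ * value-preserving.  For example, with any __m256d value v256:
+ *
+ *   __m512d wide = _mm512_castpd256_pd512(v256);  // low 256 bits defined
+ *   __m128d low  = _mm512_castpd512_pd128(wide);  // low 128 bits of wide
+ */
+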
+/* Bitwise operators */
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_and_epi32(__m512i __a, __m512i __b)
+{
+  return __a & __b;
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_and_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b)
+{
+  return (__m512i) __builtin_ia32_pandd512_mask((__v16si) __a,
+              (__v16si) __b,
+              (__v16si) __src,
+              (__mmask16) __k);
+}
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_and_epi32(__mmask16 __k, __m512i __a, __m512i __b)
+{
+  return (__m512i) __builtin_ia32_pandd512_mask((__v16si) __a,
+              (__v16si) __b,
+              (__v16si)
+              _mm512_setzero_si512 (),
+              (__mmask16) __k);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_and_epi64(__m512i __a, __m512i __b)
+{
+  return __a & __b;
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_and_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b)
+{
+  return (__m512i) __builtin_ia32_pandq512_mask ((__v8di) __a,
+              (__v8di) __b,
+              (__v8di) __src,
+              (__mmask8) __k);
+}
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_and_epi64(__mmask8 __k, __m512i __a, __m512i __b)
+{
+  return (__m512i) __builtin_ia32_pandq512_mask ((__v8di) __a,
+              (__v8di) __b,
+              (__v8di)
+              _mm512_setzero_si512 (),
+              (__mmask8) __k);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_andnot_epi32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pandnd512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v16si)
+              _mm512_setzero_si512 (),
+              (__mmask16) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_andnot_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pandnd512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v16si) __W,
+              (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_andnot_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pandnd512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v16si)
+              _mm512_setzero_si512 (),
+              (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_andnot_epi64 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pandnq512_mask ((__v8di) __A,
+              (__v8di) __B,
+              (__v8di)
+              _mm512_setzero_si512 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_andnot_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pandnq512_mask ((__v8di) __A,
+              (__v8di) __B,
+              (__v8di) __W, __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_andnot_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pandnq512_mask ((__v8di) __A,
+              (__v8di) __B,
+              (__v8di)
+              _mm512_setzero_si512 (),
+              __U);
+}
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_or_epi32(__m512i __a, __m512i __b)
+{
+  return __a | __b;
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_or_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b)
+{
+  return (__m512i) __builtin_ia32_pord512_mask((__v16si) __a,
+              (__v16si) __b,
+              (__v16si) __src,
+              (__mmask16) __k);
+}
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_or_epi32(__mmask16 __k, __m512i __a, __m512i __b)
+{
+  return (__m512i) __builtin_ia32_pord512_mask((__v16si) __a,
+              (__v16si) __b,
+              (__v16si)
+              _mm512_setzero_si512 (),
+              (__mmask16) __k);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_or_epi64(__m512i __a, __m512i __b)
+{
+  return __a | __b;
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_or_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b)
+{
+  return (__m512i) __builtin_ia32_porq512_mask ((__v8di) __a,
+              (__v8di) __b,
+              (__v8di) __src,
+              (__mmask8) __k);
+}
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_or_epi64(__mmask8 __k, __m512i __a, __m512i __b)
+{
+  return (__m512i) __builtin_ia32_porq512_mask ((__v8di) __a,
+              (__v8di) __b,
+              (__v8di)
+              _mm512_setzero_si512 (),
+              (__mmask8) __k);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_xor_epi32(__m512i __a, __m512i __b)
+{
+  return __a ^ __b;
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_xor_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b)
+{
+  return (__m512i) __builtin_ia32_pxord512_mask((__v16si) __a,
+              (__v16si) __b,
+              (__v16si) __src,
+              (__mmask16) __k);
+}
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_xor_epi32(__mmask16 __k, __m512i __a, __m512i __b)
+{
+  return (__m512i) __builtin_ia32_pxord512_mask((__v16si) __a,
+              (__v16si) __b,
+              (__v16si)
+              _mm512_setzero_si512 (),
+              (__mmask16) __k);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_xor_epi64(__m512i __a, __m512i __b)
+{
+  return __a ^ __b;
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_xor_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b)
+{
+  return (__m512i) __builtin_ia32_pxorq512_mask ((__v8di) __a,
+              (__v8di) __b,
+              (__v8di) __src,
+              (__mmask8) __k);
+}
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_xor_epi64(__mmask8 __k, __m512i __a, __m512i __b)
+{
+  return (__m512i) __builtin_ia32_pxorq512_mask ((__v8di) __a,
+              (__v8di) __b,
+              (__v8di)
+              _mm512_setzero_si512 (),
+              (__mmask8) __k);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_and_si512(__m512i __a, __m512i __b)
+{
+  return __a & __b;
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_or_si512(__m512i __a, __m512i __b)
+{
+  return __a | __b;
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_xor_si512(__m512i __a, __m512i __b)
+{
+  return __a ^ __b;
+}
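+
+/* Illustrative usage sketch for the masked forms above (mask_ merges with the
+ * source operand, maskz_ zeroes inactive lanes), with a, b, src of type
+ * __m512i on an AVX512F-enabled target:
+ *
+ *   __m512i r0 = _mm512_and_epi32(a, b);
+ *   __m512i r1 = _mm512_mask_and_epi32(src, 0x00FF, a, b);  // lanes 8..15 taken from src
+ *   __m512i r2 = _mm512_maskz_and_epi32(0x00FF, a, b);      // lanes 8..15 zeroed
+ */
+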
+/* Arithmetic */
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_add_pd(__m512d __a, __m512d __b)
+{
+  return __a + __b;
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_add_ps(__m512 __a, __m512 __b)
+{
+  return __a + __b;
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_mul_pd(__m512d __a, __m512d __b)
+{
+  return __a * __b;
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_mul_ps(__m512 __a, __m512 __b)
+{
+  return __a * __b;
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_sub_pd(__m512d __a, __m512d __b)
+{
+  return __a - __b;
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_sub_ps(__m512 __a, __m512 __b)
+{
+  return __a - __b;
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_add_epi64 (__m512i __A, __m512i __B)
+{
+  return (__m512i) ((__v8di) __A + (__v8di) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_add_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddq512_mask ((__v8di) __A,
+             (__v8di) __B,
+             (__v8di) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_add_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddq512_mask ((__v8di) __A,
+             (__v8di) __B,
+             (__v8di)
+             _mm512_setzero_si512 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_sub_epi64 (__m512i __A, __m512i __B)
+{
+  return (__m512i) ((__v8di) __A - (__v8di) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_sub_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubq512_mask ((__v8di) __A,
+             (__v8di) __B,
+             (__v8di) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_sub_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubq512_mask ((__v8di) __A,
+             (__v8di) __B,
+             (__v8di)
+             _mm512_setzero_si512 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_add_epi32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) ((__v16si) __A + (__v16si) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_add_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddd512_mask ((__v16si) __A,
+             (__v16si) __B,
+             (__v16si) __W,
+             (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_add_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddd512_mask ((__v16si) __A,
+             (__v16si) __B,
+             (__v16si)
+             _mm512_setzero_si512 (),
+             (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_sub_epi32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) ((__v16si) __A - (__v16si) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_sub_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubd512_mask ((__v16si) __A,
+             (__v16si) __B,
+             (__v16si) __W,
+             (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_sub_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubd512_mask ((__v16si) __A,
+             (__v16si) __B,
+             (__v16si)
+             _mm512_setzero_si512 (),
+             (__mmask16) __U);
+}
+
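+/* Illustrative usage sketch for the masked integer add/sub forms above, with
+ * a and b of type __m512i on an AVX512F-enabled target:
+ *
+ *   __m512i sum = _mm512_add_epi32(a, b);
+ *   __m512i one = _mm512_maskz_add_epi32(0x0001, a, b);  // only lane 0 computed
+ */
+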
+static  __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_max_pd(__m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A,
+             (__v8df) __B,
+             (__v8df)
+             _mm512_setzero_pd (),
+             (__mmask8) -1,
+             _MM_FROUND_CUR_DIRECTION);
+}
+
+static  __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_max_ps(__m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A,
+            (__v16sf) __B,
+            (__v16sf)
+            _mm512_setzero_ps (),
+            (__mmask16) -1,
+            _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_max_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_maxss_round ((__v4sf) __A,
+                (__v4sf) __B,
+                (__v4sf) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_max_ss(__mmask8 __U,__m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_maxss_round ((__v4sf) __A,
+                (__v4sf) __B,
+                (__v4sf)  _mm_setzero_ps (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_max_round_ss(__A, __B, __R) __extension__ ({ \
+  (__m128) __builtin_ia32_maxss_round ((__v4sf) __A, (__v4sf) __B, \
+                (__v4sf) _mm_setzero_ps(), (__mmask8) -1, __R); })
+
+#define _mm_mask_max_round_ss(__W, __U, __A, __B, __R) __extension__ ({ \
+  (__m128) __builtin_ia32_maxss_round ((__v4sf) __A, (__v4sf) __B, \
+                (__v4sf)  __W, (__mmask8) __U,__R); })
+
+#define _mm_maskz_max_round_ss(__U, __A, __B, __R) __extension__ ({ \
+  (__m128) __builtin_ia32_maxss_round ((__v4sf) __A, (__v4sf) __B, \
+                (__v4sf)  _mm_setzero_ps(), (__mmask8) __U,__R); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_max_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_maxsd_round ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_max_sd(__mmask8 __U,__m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_maxsd_round ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df)  _mm_setzero_pd (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_max_round_sd(__A, __B, __R) __extension__ ({ \
+  (__m128d) __builtin_ia32_maxsd_round ((__v2df) __A, (__v2df) __B, \
+                (__v2df) _mm_setzero_pd(), (__mmask8) -1, __R); })
+
+#define _mm_mask_max_round_sd(__W, __U, __A, __B, __R) __extension__ ({ \
+  (__m128d) __builtin_ia32_maxsd_round ((__v2df) __A, (__v2df) __B, \
+                (__v2df)  __W, (__mmask8) __U,__R); })
+
+#define _mm_maskz_max_round_sd(__U, __A, __B, __R) __extension__ ({ \
+  (__m128d) __builtin_ia32_maxsd_round ((__v2df) __A, (__v2df) __B, \
+                (__v2df)  _mm_setzero_pd(), (__mmask8) __U,__R); })
+
+static __inline __m512i
+__DEFAULT_FN_ATTRS
+_mm512_max_epi32(__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsd512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v16si)
+              _mm512_setzero_si512 (),
+              (__mmask16) -1);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_max_epu32(__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxud512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v16si)
+              _mm512_setzero_si512 (),
+              (__mmask16) -1);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_max_epi64(__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsq512_mask ((__v8di) __A,
+              (__v8di) __B,
+              (__v8di)
+              _mm512_setzero_si512 (),
+              (__mmask8) -1);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_max_epu64(__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxuq512_mask ((__v8di) __A,
+              (__v8di) __B,
+              (__v8di)
+              _mm512_setzero_si512 (),
+              (__mmask8) -1);
+}
+
+static  __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_min_pd(__m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A,
+             (__v8df) __B,
+             (__v8df)
+             _mm512_setzero_pd (),
+             (__mmask8) -1,
+             _MM_FROUND_CUR_DIRECTION);
+}
+
+static  __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_min_ps(__m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A,
+            (__v16sf) __B,
+            (__v16sf)
+            _mm512_setzero_ps (),
+            (__mmask16) -1,
+            _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_min_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_minss_round ((__v4sf) __A,
+                (__v4sf) __B,
+                (__v4sf) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_min_ss(__mmask8 __U,__m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_minss_round ((__v4sf) __A,
+                (__v4sf) __B,
+                (__v4sf)  _mm_setzero_ps (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_min_round_ss(__A, __B, __R) __extension__ ({ \
+  (__m128) __builtin_ia32_minss_round ((__v4sf) __A, (__v4sf) __B, \
+                (__v4sf) _mm_setzero_ps(), (__mmask8) -1, __R); })
+
+#define _mm_mask_min_round_ss(__W, __U, __A, __B, __R) __extension__ ({ \
+  (__m128) __builtin_ia32_minss_round ((__v4sf) __A, (__v4sf) __B, \
+                (__v4sf)  __W, (__mmask8) __U,__R); })
+
+#define _mm_maskz_min_round_ss(__U, __A, __B, __R) __extension__ ({ \
+  (__m128) __builtin_ia32_minss_round ((__v4sf) __A, (__v4sf) __B, \
+                (__v4sf)  _mm_setzero_ps(), (__mmask8) __U,__R); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_min_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_minsd_round ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_min_sd(__mmask8 __U,__m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_minsd_round ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df)  _mm_setzero_pd (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_min_round_sd(__A, __B, __R) __extension__ ({ \
+  (__m128d) __builtin_ia32_minsd_round ((__v2df) __A, (__v2df) __B, \
+                (__v2df) _mm_setzero_pd(), (__mmask8) -1, __R); })
+
+#define _mm_mask_min_round_sd(__W, __U, __A, __B, __R) __extension__ ({ \
+  (__m128d) __builtin_ia32_minsd_round ((__v2df) __A, (__v2df) __B, \
+                (__v2df)  __W, (__mmask8) __U,__R); })
+
+#define _mm_maskz_min_round_sd(__U, __A, __B, __R) __extension__ ({ \
+  (__m128d) __builtin_ia32_minsd_round ((__v2df) __A, (__v2df) __B, \
+                (__v2df)  _mm_setzero_pd(), (__mmask8) __U,__R); })
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_min_epi32(__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsd512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v16si)
+              _mm512_setzero_si512 (),
+              (__mmask16) -1);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_min_epu32(__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminud512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v16si)
+              _mm512_setzero_si512 (),
+              (__mmask16) -1);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_min_epi64(__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsq512_mask ((__v8di) __A,
+              (__v8di) __B,
+              (__v8di)
+              _mm512_setzero_si512 (),
+              (__mmask8) -1);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_min_epu64(__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminuq512_mask ((__v8di) __A,
+              (__v8di) __B,
+              (__v8di)
+              _mm512_setzero_si512 (),
+              (__mmask8) -1);
+}
+
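+/*
+ * _mm512_mul_epi32/_mm512_mul_epu32 multiply the low 32-bit halves of each
+ * 64-bit element (sign- or zero-extended), producing eight 64-bit products;
+ * hence the (__v8di) result vectors and the 8-bit masks.  _mm512_mullo_epi32
+ * keeps only the low 32 bits of each 32-bit product.
+ */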
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_mul_epi32(__m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_pmuldq512_mask ((__v16si) __X,
+              (__v16si) __Y,
+              (__v8di)
+              _mm512_setzero_si512 (),
+              (__mmask8) -1);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_mul_epi32 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_pmuldq512_mask ((__v16si) __X,
+              (__v16si) __Y,
+              (__v8di) __W, __M);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_mul_epi32 (__mmask8 __M, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_pmuldq512_mask ((__v16si) __X,
+              (__v16si) __Y,
+              (__v8di)
+              _mm512_setzero_si512 (),
+              __M);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_mul_epu32(__m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_pmuludq512_mask ((__v16si) __X,
+               (__v16si) __Y,
+               (__v8di)
+               _mm512_setzero_si512 (),
+               (__mmask8) -1);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_mul_epu32 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_pmuludq512_mask ((__v16si) __X,
+               (__v16si) __Y,
+               (__v8di) __W, __M);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_mul_epu32 (__mmask8 __M, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_pmuludq512_mask ((__v16si) __X,
+               (__v16si) __Y,
+               (__v8di)
+               _mm512_setzero_si512 (),
+               __M);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_mullo_epi32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) ((__v16si) __A * (__v16si) __B);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_mullo_epi32 (__mmask16 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmulld512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v16si)
+              _mm512_setzero_si512 (),
+              __M);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_mullo_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmulld512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v16si) __W, __M);
+}
+
+static  __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_sqrt_pd(__m512d __a)
+{
+  return (__m512d)__builtin_ia32_sqrtpd512_mask((__v8df)__a,
+                                                (__v8df) _mm512_setzero_pd (),
+                                                (__mmask8) -1,
+                                                _MM_FROUND_CUR_DIRECTION);
+}
+
+static  __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_sqrt_ps(__m512 __a)
+{
+  return (__m512)__builtin_ia32_sqrtps512_mask((__v16sf)__a,
+                                               (__v16sf) _mm512_setzero_ps (),
+                                               (__mmask16) -1,
+                                               _MM_FROUND_CUR_DIRECTION);
+}
+
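+/*
+ * rsqrt14/rcp14 return approximations of 1/sqrt(x) and 1/x with a maximum
+ * relative error of 2^-14, as opposed to the fully rounded sqrt/div results
+ * above.
+ */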
+static  __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_rsqrt14_pd(__m512d __A)
+{
+  return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A,
+                 (__v8df)
+                 _mm512_setzero_pd (),
+                 (__mmask8) -1);
+}
+
+static  __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_rsqrt14_ps(__m512 __A)
+{
+  return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A,
+                (__v16sf)
+                _mm512_setzero_ps (),
+                (__mmask16) -1);
+}
+
+static  __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_rsqrt14_ss(__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_rsqrt14ss ((__v4sf) __A,
+             (__v4sf) __B,
+             (__v4sf)
+             _mm_setzero_ps (),
+             (__mmask8) -1);
+}
+
+static  __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_rsqrt14_sd(__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_rsqrt14sd ((__v2df) __A,
+              (__v2df) __B,
+              (__v2df)
+              _mm_setzero_pd (),
+              (__mmask8) -1);
+}
+
+static  __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_rcp14_pd(__m512d __A)
+{
+  return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A,
+               (__v8df)
+               _mm512_setzero_pd (),
+               (__mmask8) -1);
+}
+
+static  __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_rcp14_ps(__m512 __A)
+{
+  return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A,
+              (__v16sf)
+              _mm512_setzero_ps (),
+              (__mmask16) -1);
+}
+
+static  __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_rcp14_ss(__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_rcp14ss ((__v4sf) __A,
+                 (__v4sf) __B,
+                 (__v4sf)
+                 _mm_setzero_ps (),
+                 (__mmask8) -1);
+}
+
+static  __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_rcp14_sd(__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_rcp14sd ((__v2df) __A,
+            (__v2df) __B,
+            (__v2df)
+            _mm_setzero_pd (),
+            (__mmask8) -1);
+}
+
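+/*
+ * floor/ceil are implemented on top of the rndscale builtins, rounding each
+ * element to an integral value with _MM_FROUND_FLOOR/_MM_FROUND_CEIL while
+ * keeping the floating-point type.
+ */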
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_floor_ps(__m512 __A)
+{
+  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
+                                                  _MM_FROUND_FLOOR,
+                                                  (__v16sf) __A, -1,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_floor_pd(__m512d __A)
+{
+  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
+                                                   _MM_FROUND_FLOOR,
+                                                   (__v8df) __A, -1,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_ceil_ps(__m512 __A)
+{
+  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
+                                                  _MM_FROUND_CEIL,
+                                                  (__v16sf) __A, -1,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_ceil_pd(__m512d __A)
+{
+  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
+                                                   _MM_FROUND_CEIL,
+                                                   (__v8df) __A, -1,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_abs_epi64(__m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsq512_mask ((__v8di) __A,
+             (__v8di)
+             _mm512_setzero_si512 (),
+             (__mmask8) -1);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_abs_epi32(__m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsd512_mask ((__v16si) __A,
+             (__v16si)
+             _mm512_setzero_si512 (),
+             (__mmask16) -1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_add_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_addss_round ((__v4sf) __A,
+                (__v4sf) __B,
+                (__v4sf) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_add_ss(__mmask8 __U,__m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_addss_round ((__v4sf) __A,
+                (__v4sf) __B,
+                (__v4sf)  _mm_setzero_ps (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_add_round_ss(__A, __B, __R) __extension__ ({ \
+  (__m128) __builtin_ia32_addss_round ((__v4sf) __A, (__v4sf) __B, \
+                (__v4sf) _mm_setzero_ps(), (__mmask8) -1, __R); })
+
+#define _mm_mask_add_round_ss(__W, __U, __A, __B, __R) __extension__ ({ \
+  (__m128) __builtin_ia32_addss_round ((__v4sf) __A, (__v4sf) __B, \
+                (__v4sf)  __W, (__mmask8) __U,__R); })
+
+#define _mm_maskz_add_round_ss(__U, __A, __B, __R) __extension__ ({ \
+  (__m128) __builtin_ia32_addss_round ((__v4sf) __A, (__v4sf) __B, \
+                (__v4sf)  _mm_setzero_ps(), (__mmask8) __U,__R); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_add_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_addsd_round ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_add_sd(__mmask8 __U,__m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_addsd_round ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df)  _mm_setzero_pd (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+#define _mm_add_round_sd(__A, __B, __R) __extension__ ({ \
+  (__m128d) __builtin_ia32_addsd_round ((__v2df) __A, (__v2df) __B, \
+                (__v2df) _mm_setzero_pd(), (__mmask8) -1, __R); })
+
+#define _mm_mask_add_round_sd(__W, __U, __A, __B, __R) __extension__ ({ \
+  (__m128d) __builtin_ia32_addsd_round ((__v2df) __A, (__v2df) __B, \
+                (__v2df)  __W, (__mmask8) __U,__R); })
+
+#define _mm_maskz_add_round_sd(__U, __A, __B, __R) __extension__ ({ \
+  (__m128d) __builtin_ia32_addsd_round ((__v2df) __A, (__v2df) __B, \
+                (__v2df)  _mm_setzero_pd(), (__mmask8) __U,__R); })
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_add_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A,
+             (__v8df) __B,
+             (__v8df) __W,
+             (__mmask8) __U,
+             _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_add_pd(__mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A,
+             (__v8df) __B,
+             (__v8df) _mm512_setzero_pd (),
+             (__mmask8) __U,
+             _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_add_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A,
+            (__v16sf) __B,
+            (__v16sf) __W,
+            (__mmask16) __U,
+            _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_add_ps(__mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A,
+            (__v16sf) __B,
+            (__v16sf) _mm512_setzero_ps (),
+            (__mmask16) __U,
+            _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_add_round_pd(__A, __B, __R) __extension__ ({ \
+  (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A, (__v8df) __B, \
+               (__v8df) _mm512_setzero_pd(), (__mmask8) -1, __R); })
+
+#define _mm512_mask_add_round_pd(__W, __U, __A, __B, __R) __extension__ ({ \
+  (__m512d) __builtin_ia32_addpd512_mask((__v8df) __A, (__v8df) __B, \
+                (__v8df) __W, (__mmask8) __U, __R); })
+
+#define _mm512_maskz_add_round_pd(__U, __A, __B, __R) __extension__ ({ \
+  (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A, (__v8df) __B, \
+                (__v8df) _mm512_setzero_pd(), (__mmask8) __U, __R); })
+
+#define _mm512_add_round_ps(__A, __B, __R) __extension__ ({ \
+  (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A, (__v16sf) __B, \
+                (__v16sf) _mm512_setzero_ps(), (__mmask16) -1, __R); })
+
+#define _mm512_mask_add_round_ps(__W, __U, __A, __B, __R) __extension__ ({ \
+  (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A, (__v16sf) __B, \
+                (__v16sf) __W, (__mmask16)__U, __R); })
+
+#define _mm512_maskz_add_round_ps(__U, __A, __B, __R) __extension__ ({ \
+  (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A, (__v16sf) __B, \
+                (__v16sf) _mm512_setzero_ps(), (__mmask16)__U, __R); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_sub_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_subss_round ((__v4sf) __A,
+                (__v4sf) __B,
+                (__v4sf) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_sub_ss(__mmask8 __U,__m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_subss_round ((__v4sf) __A,
+                (__v4sf) __B,
+                (__v4sf)  _mm_setzero_ps (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+#define _mm_sub_round_ss(__A, __B, __R) __extension__ ({ \
+  (__m128) __builtin_ia32_subss_round ((__v4sf) __A, (__v4sf) __B, \
+                (__v4sf) _mm_setzero_ps(), (__mmask8) -1, __R); })
+
+#define _mm_mask_sub_round_ss(__W, __U, __A, __B, __R) __extension__ ({ \
+  (__m128) __builtin_ia32_subss_round ((__v4sf) __A, (__v4sf) __B, \
+                (__v4sf)  __W, (__mmask8) __U,__R); })
+
+#define _mm_maskz_sub_round_ss(__U, __A, __B, __R) __extension__ ({ \
+  (__m128) __builtin_ia32_subss_round ((__v4sf) __A, (__v4sf) __B, \
+                (__v4sf)  _mm_setzero_ps(), (__mmask8) __U,__R); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_sub_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_subsd_round ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_sub_sd(__mmask8 __U,__m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_subsd_round ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df)  _mm_setzero_pd (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_sub_round_sd(__A, __B, __R) __extension__ ({ \
+  (__m128d) __builtin_ia32_subsd_round ((__v2df) __A, (__v2df) __B, \
+                (__v2df) _mm_setzero_pd(), (__mmask8) -1, __R); })
+
+#define _mm_mask_sub_round_sd(__W, __U, __A, __B, __R) __extension__ ({ \
+  (__m128d) __builtin_ia32_subsd_round ((__v2df) __A, (__v2df) __B, \
+                (__v2df)  __W, (__mmask8) __U,__R); })
+
+#define _mm_maskz_sub_round_sd(__U, __A, __B, __R) __extension__ ({ \
+  (__m128d) __builtin_ia32_subsd_round ((__v2df) __A, (__v2df) __B, \
+                (__v2df)  _mm_setzero_pd(), (__mmask8) __U,__R); })
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_sub_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A,
+             (__v8df) __B,
+             (__v8df) __W,
+             (__mmask8) __U,
+             _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_sub_pd(__mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A,
+             (__v8df) __B,
+             (__v8df)
+             _mm512_setzero_pd (),
+             (__mmask8) __U,
+             _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_sub_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A,
+            (__v16sf) __B,
+            (__v16sf) __W,
+            (__mmask16) __U,
+            _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_sub_ps(__mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A,
+            (__v16sf) __B,
+            (__v16sf)
+            _mm512_setzero_ps (),
+            (__mmask16) __U,
+            _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_sub_round_pd(__A, __B, __R) __extension__ ({ \
+  (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A, (__v8df) __B,\
+             (__v8df) _mm512_setzero_pd(), (__mmask8) -1, __R); })
+
+#define _mm512_mask_sub_round_pd(__W, __U, __A, __B, __R) __extension__ ({ \
+  (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A, (__v8df) __B, \
+             (__v8df) __W, (__mmask8) __U, __R); })
+
+#define _mm512_maskz_sub_round_pd(__U, __A, __B, __R) __extension__ ({ \
+   (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A, (__v8df) __B, \
+             (__v8df) _mm512_setzero_pd(), (__mmask8) __U, __R);})
+
+#define _mm512_sub_round_ps(__A, __B, __R) __extension__ ({ \
+  (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A, (__v16sf) __B, \
+            (__v16sf) _mm512_setzero_ps (), (__mmask16) -1, __R);})
+
+#define _mm512_mask_sub_round_ps(__W, __U, __A, __B, __R)  __extension__ ({ \
+  (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A, (__v16sf) __B, \
+            (__v16sf) __W, (__mmask16) __U, __R); })
+
+#define _mm512_maskz_sub_round_ps(__U, __A, __B, __R)  __extension__ ({ \
+  (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A, (__v16sf) __B, \
+            (__v16sf) _mm512_setzero_ps (), (__mmask16) __U, __R);})
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_mul_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_mulss_round ((__v4sf) __A,
+                (__v4sf) __B,
+                (__v4sf) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_mul_ss(__mmask8 __U,__m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_mulss_round ((__v4sf) __A,
+                (__v4sf) __B,
+                (__v4sf)  _mm_setzero_ps (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+#define _mm_mul_round_ss(__A, __B, __R) __extension__ ({ \
+  (__m128) __builtin_ia32_mulss_round ((__v4sf) __A, (__v4sf) __B, \
+                (__v4sf) _mm_setzero_ps(), (__mmask8) -1, __R); })
+
+#define _mm_mask_mul_round_ss(__W, __U, __A, __B, __R) __extension__ ({ \
+  (__m128) __builtin_ia32_mulss_round ((__v4sf) __A, (__v4sf) __B, \
+                (__v4sf)  __W, (__mmask8) __U,__R); })
+
+#define _mm_maskz_mul_round_ss(__U, __A, __B, __R) __extension__ ({ \
+  (__m128) __builtin_ia32_mulss_round ((__v4sf) __A, (__v4sf) __B, \
+                (__v4sf)  _mm_setzero_ps(), (__mmask8) __U,__R); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_mul_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_mulsd_round ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_mul_sd(__mmask8 __U,__m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_mulsd_round ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df)  _mm_setzero_pd (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mul_round_sd(__A, __B, __R) __extension__ ({ \
+  (__m128d) __builtin_ia32_mulsd_round ((__v2df) __A, (__v2df) __B, \
+                (__v2df) _mm_setzero_pd(), (__mmask8) -1, __R); })
+
+#define _mm_mask_mul_round_sd(__W, __U, __A, __B, __R) __extension__ ({ \
+  (__m128d) __builtin_ia32_mulsd_round ((__v2df) __A, (__v2df) __B, \
+                (__v2df)  __W, (__mmask8) __U,__R); })
+
+#define _mm_maskz_mul_round_sd(__U, __A, __B, __R) __extension__ ({ \
+  (__m128d) __builtin_ia32_mulsd_round ((__v2df) __A, (__v2df) __B, \
+                (__v2df)  _mm_setzero_pd(), (__mmask8) __U,__R); })
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_mul_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A,
+             (__v8df) __B,
+             (__v8df) __W,
+             (__mmask8) __U,
+             _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_mul_pd(__mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A,
+             (__v8df) __B,
+             (__v8df)
+             _mm512_setzero_pd (),
+             (__mmask8) __U,
+             _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_mul_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A,
+            (__v16sf) __B,
+            (__v16sf) __W,
+            (__mmask16) __U,
+            _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_mul_ps(__mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A,
+            (__v16sf) __B,
+            (__v16sf)
+            _mm512_setzero_ps (),
+            (__mmask16) __U,
+            _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mul_round_pd(__A, __B, __R) __extension__ ({ \
+  (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A, (__v8df) __B,\
+             (__v8df) _mm512_setzero_pd(), (__mmask8) -1, __R); })
+
+#define _mm512_mask_mul_round_pd(__W, __U, __A, __B, __R) __extension__ ({ \
+  (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A, (__v8df) __B, \
+             (__v8df) __W, (__mmask8) __U, __R); })
+
+#define _mm512_maskz_mul_round_pd(__U, __A, __B, __R) __extension__ ({ \
+   (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A, (__v8df) __B, \
+             (__v8df) _mm512_setzero_pd(), (__mmask8) __U, __R);})
+
+#define _mm512_mul_round_ps(__A, __B, __R) __extension__ ({ \
+  (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A, (__v16sf) __B, \
+            (__v16sf) _mm512_setzero_ps (), (__mmask16) -1, __R);})
+
+#define _mm512_mask_mul_round_ps(__W, __U, __A, __B, __R)  __extension__ ({ \
+  (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A, (__v16sf) __B, \
+            (__v16sf) __W, (__mmask16) __U, __R); })
+
+#define _mm512_maskz_mul_round_ps(__U, __A, __B, __R)  __extension__ ({ \
+  (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A, (__v16sf) __B, \
+            (__v16sf) _mm512_setzero_ps (), (__mmask16) __U, __R);})
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_div_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_divss_round ((__v4sf) __A,
+                (__v4sf) __B,
+                (__v4sf) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_div_ss(__mmask8 __U,__m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_divss_round ((__v4sf) __A,
+                (__v4sf) __B,
+                (__v4sf)  _mm_setzero_ps (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_div_round_ss(__A, __B, __R) __extension__ ({ \
+  (__m128) __builtin_ia32_divss_round ((__v4sf) __A, (__v4sf) __B, \
+                (__v4sf) _mm_setzero_ps(), (__mmask8) -1, __R); })
+
+#define _mm_mask_div_round_ss(__W, __U, __A, __B, __R) __extension__ ({ \
+  (__m128) __builtin_ia32_divss_round ((__v4sf) __A, (__v4sf) __B, \
+                (__v4sf)  __W, (__mmask8) __U,__R); })
+
+#define _mm_maskz_div_round_ss(__U, __A, __B, __R) __extension__ ({ \
+  (__m128) __builtin_ia32_divss_round ((__v4sf) __A, (__v4sf) __B, \
+                (__v4sf)  _mm_setzero_ps(), (__mmask8) __U,__R); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_div_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_divsd_round ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_div_sd(__mmask8 __U,__m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_divsd_round ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df)  _mm_setzero_pd (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_div_round_sd(__A, __B, __R) __extension__ ({ \
+  (__m128d) __builtin_ia32_divsd_round ((__v2df) __A, (__v2df) __B, \
+                (__v2df) _mm_setzero_pd(), (__mmask8) -1, __R); })
+
+#define _mm_mask_div_round_sd(__W, __U, __A, __B, __R) __extension__ ({ \
+  (__m128d) __builtin_ia32_divsd_round ((__v2df) __A, (__v2df) __B, \
+                (__v2df)  __W, (__mmask8) __U,__R); })
+
+#define _mm_maskz_div_round_sd(__U, __A, __B, __R) __extension__ ({ \
+  (__m128d) __builtin_ia32_divsd_round ((__v2df) __A, (__v2df) __B, \
+                (__v2df)  _mm_setzero_pd(), (__mmask8) __U,__R); })
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_div_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __A,
+             (__v8df) __B,
+             (__v8df) __W,
+             (__mmask8) __U,
+             _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_div_pd(__mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __A,
+             (__v8df) __B,
+             (__v8df)
+             _mm512_setzero_pd (),
+             (__mmask8) __U,
+             _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_div_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A,
+            (__v16sf) __B,
+            (__v16sf) __W,
+            (__mmask16) __U,
+            _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_div_ps(__mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A,
+            (__v16sf) __B,
+            (__v16sf)
+            _mm512_setzero_ps (),
+            (__mmask16) __U,
+            _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_div_round_pd(__A, __B, __R) __extension__ ({ \
+  (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __A, (__v8df) __B,\
+             (__v8df) _mm512_setzero_pd(), (__mmask8) -1, __R); })
+
+#define _mm512_mask_div_round_pd(__W, __U, __A, __B, __R) __extension__ ({ \
+  (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __A, (__v8df) __B, \
+             (__v8df) __W, (__mmask8) __U, __R); })
+
+#define _mm512_maskz_div_round_pd(__U, __A, __B, __R) __extension__ ({ \
+   (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __A, (__v8df) __B, \
+             (__v8df) _mm512_setzero_pd(), (__mmask8) __U, __R);})
+
+#define _mm512_div_round_ps(__A, __B, __R) __extension__ ({ \
+  (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A, (__v16sf) __B, \
+            (__v16sf) _mm512_setzero_ps (), (__mmask16) -1, __R);})
+
+#define _mm512_mask_div_round_ps(__W, __U, __A, __B, __R)  __extension__ ({ \
+  (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A, (__v16sf) __B, \
+            (__v16sf) __W, (__mmask16) __U, __R); })
+
+#define _mm512_maskz_div_round_ps(__U, __A, __B, __R)  __extension__ ({ \
+  (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A, (__v16sf) __B, \
+            (__v16sf) _mm512_setzero_ps (), (__mmask16) __U, __R);})
+
+#define _mm512_roundscale_ps(A, B) __extension__ ({ \
+  (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(A), (B), (__v16sf)(A), \
+                                         -1, _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_roundscale_pd(A, B) __extension__ ({ \
+  (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(A), (B), (__v8df)(A), \
+                                          -1, _MM_FROUND_CUR_DIRECTION); })
+
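+/*
+ * Fused multiply-add family.  The fmsub/fnmadd/fnmsub forms are expressed by
+ * negating operands of the same vfmadd builtin.  The masked variants differ
+ * in what they write to masked-off lanes: _mask keeps the first source (A),
+ * _mask3 keeps the addend (C), and _maskz writes zero.
+ */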
+#define _mm512_fmadd_round_pd(A, B, C, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) (A), \
+                                             (__v8df) (B), (__v8df) (C), \
+                                             (__mmask8) -1, (R)); })
+
+
+#define _mm512_mask_fmadd_round_pd(A, U, B, C, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) (A), \
+                                             (__v8df) (B), (__v8df) (C), \
+                                             (__mmask8) (U), (R)); })
+
+
+#define _mm512_mask3_fmadd_round_pd(A, B, C, U, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfmaddpd512_mask3 ((__v8df) (A), \
+                                              (__v8df) (B), (__v8df) (C), \
+                                              (__mmask8) (U), (R)); })
+
+
+#define _mm512_maskz_fmadd_round_pd(U, A, B, C, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) (A), \
+                                              (__v8df) (B), (__v8df) (C), \
+                                              (__mmask8) (U), (R)); })
+
+
+#define _mm512_fmsub_round_pd(A, B, C, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) (A), \
+                                             (__v8df) (B), -(__v8df) (C), \
+                                             (__mmask8) -1, (R)); })
+
+
+#define _mm512_mask_fmsub_round_pd(A, U, B, C, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) (A), \
+                                             (__v8df) (B), -(__v8df) (C), \
+                                             (__mmask8) (U), (R)); })
+
+
+#define _mm512_maskz_fmsub_round_pd(U, A, B, C, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) (A), \
+                                              (__v8df) (B), -(__v8df) (C), \
+                                              (__mmask8) (U), (R)); })
+
+
+#define _mm512_fnmadd_round_pd(A, B, C, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfmaddpd512_mask (-(__v8df) (A), \
+                                             (__v8df) (B), (__v8df) (C), \
+                                             (__mmask8) -1, (R)); })
+
+
+#define _mm512_mask3_fnmadd_round_pd(A, B, C, U, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfmaddpd512_mask3 (-(__v8df) (A), \
+                                              (__v8df) (B), (__v8df) (C), \
+                                              (__mmask8) (U), (R)); })
+
+
+#define _mm512_maskz_fnmadd_round_pd(U, A, B, C, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) (A), \
+                                              (__v8df) (B), (__v8df) (C), \
+                                              (__mmask8) (U), (R)); })
+
+
+#define _mm512_fnmsub_round_pd(A, B, C, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfmaddpd512_mask (-(__v8df) (A), \
+                                             (__v8df) (B), -(__v8df) (C), \
+                                             (__mmask8) -1, (R)); })
+
+
+#define _mm512_maskz_fnmsub_round_pd(U, A, B, C, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) (A), \
+                                              (__v8df) (B), -(__v8df) (C), \
+                                              (__mmask8) (U), (R)); })
+
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_fmadd_pd(__m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
+                                                    (__v8df) __B,
+                                                    (__v8df) __C,
+                                                    (__mmask8) -1,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_fmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
+                                                    (__v8df) __B,
+                                                    (__v8df) __C,
+                                                    (__mmask8) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask3_fmadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask3 ((__v8df) __A,
+                                                     (__v8df) __B,
+                                                     (__v8df) __C,
+                                                     (__mmask8) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_fmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A,
+                                                     (__v8df) __B,
+                                                     (__v8df) __C,
+                                                     (__mmask8) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_fmsub_pd(__m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
+                                                    (__v8df) __B,
+                                                    -(__v8df) __C,
+                                                    (__mmask8) -1,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_fmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
+                                                    (__v8df) __B,
+                                                    -(__v8df) __C,
+                                                    (__mmask8) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_fmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A,
+                                                     (__v8df) __B,
+                                                     -(__v8df) __C,
+                                                     (__mmask8) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask (-(__v8df) __A,
+                                                    (__v8df) __B,
+                                                    (__v8df) __C,
+                                                    (__mmask8) -1,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask3_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask3 (-(__v8df) __A,
+                                                     (__v8df) __B,
+                                                     (__v8df) __C,
+                                                     (__mmask8) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_fnmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A,
+                                                     (__v8df) __B,
+                                                     (__v8df) __C,
+                                                     (__mmask8) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask (-(__v8df) __A,
+                                                    (__v8df) __B,
+                                                    -(__v8df) __C,
+                                                    (__mmask8) -1,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_fnmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A,
+                                                     (__v8df) __B,
+                                                     -(__v8df) __C,
+                                                     (__mmask8) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_fmadd_round_ps(A, B, C, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) (A), \
+                                            (__v16sf) (B), (__v16sf) (C), \
+                                            (__mmask16) -1, (R)); })
+
+
+#define _mm512_mask_fmadd_round_ps(A, U, B, C, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) (A), \
+                                            (__v16sf) (B), (__v16sf) (C), \
+                                            (__mmask16) (U), (R)); })
+
+
+#define _mm512_mask3_fmadd_round_ps(A, B, C, U, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfmaddps512_mask3 ((__v16sf) (A), \
+                                             (__v16sf) (B), (__v16sf) (C), \
+                                             (__mmask16) (U), (R)); })
+
+
+#define _mm512_maskz_fmadd_round_ps(U, A, B, C, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) (A), \
+                                             (__v16sf) (B), (__v16sf) (C), \
+                                             (__mmask16) (U), (R)); })
+
+
+#define _mm512_fmsub_round_ps(A, B, C, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) (A), \
+                                            (__v16sf) (B), -(__v16sf) (C), \
+                                            (__mmask16) -1, (R)); })
+
+
+#define _mm512_mask_fmsub_round_ps(A, U, B, C, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) (A), \
+                                            (__v16sf) (B), -(__v16sf) (C), \
+                                            (__mmask16) (U), (R)); })
+
+
+#define _mm512_maskz_fmsub_round_ps(U, A, B, C, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) (A), \
+                                             (__v16sf) (B), -(__v16sf) (C), \
+                                             (__mmask16) (U), (R)); })
+
+
+#define _mm512_fnmadd_round_ps(A, B, C, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfmaddps512_mask (-(__v16sf) (A), \
+                                            (__v16sf) (B), (__v16sf) (C), \
+                                            (__mmask16) -1, (R)); })
+
+
+#define _mm512_mask3_fnmadd_round_ps(A, B, C, U, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfmaddps512_mask3 (-(__v16sf) (A), \
+                                             (__v16sf) (B), (__v16sf) (C), \
+                                             (__mmask16) (U), (R)); })
+
+
+#define _mm512_maskz_fnmadd_round_ps(U, A, B, C, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) (A), \
+                                             (__v16sf) (B), (__v16sf) (C), \
+                                             (__mmask16) (U), (R)); })
+
+
+#define _mm512_fnmsub_round_ps(A, B, C, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfmaddps512_mask (-(__v16sf) (A), \
+                                            (__v16sf) (B), -(__v16sf) (C), \
+                                            (__mmask16) -1, (R)); })
+
+
+#define _mm512_maskz_fnmsub_round_ps(U, A, B, C, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) (A), \
+                                             (__v16sf) (B), -(__v16sf) (C), \
+                                             (__mmask16) (U), (R)); })
+
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_fmadd_ps(__m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
+                                                   (__v16sf) __B,
+                                                   (__v16sf) __C,
+                                                   (__mmask16) -1,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_fmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
+                                                   (__v16sf) __B,
+                                                   (__v16sf) __C,
+                                                   (__mmask16) __U,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask3_fmadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_mask3 ((__v16sf) __A,
+                                                    (__v16sf) __B,
+                                                    (__v16sf) __C,
+                                                    (__mmask16) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_fmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A,
+                                                    (__v16sf) __B,
+                                                    (__v16sf) __C,
+                                                    (__mmask16) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_fmsub_ps(__m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
+                                                   (__v16sf) __B,
+                                                   -(__v16sf) __C,
+                                                   (__mmask16) -1,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_fmsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
+                                                   (__v16sf) __B,
+                                                   -(__v16sf) __C,
+                                                   (__mmask16) __U,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_fmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A,
+                                                    (__v16sf) __B,
+                                                    -(__v16sf) __C,
+                                                    (__mmask16) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_mask (-(__v16sf) __A,
+                                                   (__v16sf) __B,
+                                                   (__v16sf) __C,
+                                                   (__mmask16) -1,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask3_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_mask3 (-(__v16sf) __A,
+                                                    (__v16sf) __B,
+                                                    (__v16sf) __C,
+                                                    (__mmask16) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_fnmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A,
+                                                    (__v16sf) __B,
+                                                    (__v16sf) __C,
+                                                    (__mmask16) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_mask (-(__v16sf) __A,
+                                                   (__v16sf) __B,
+                                                   -(__v16sf) __C,
+                                                   (__mmask16) -1,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_fnmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A,
+                                                    (__v16sf) __B,
+                                                    -(__v16sf) __C,
+                                                    (__mmask16) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
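+/*
+ * fmaddsub alternately adds and subtracts C across even/odd element
+ * positions after the multiply; fmsubadd uses the opposite add/subtract
+ * pattern.  Masking follows the same _mask/_mask3/_maskz conventions as the
+ * fmadd family above.
+ */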
+#define _mm512_fmaddsub_round_pd(A, B, C, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) (A), \
+                                                (__v8df) (B), (__v8df) (C), \
+                                                (__mmask8) -1, (R)); })
+
+
+#define _mm512_mask_fmaddsub_round_pd(A, U, B, C, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) (A), \
+                                                (__v8df) (B), (__v8df) (C), \
+                                                (__mmask8) (U), (R)); })
+
+
+#define _mm512_mask3_fmaddsub_round_pd(A, B, C, U, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfmaddsubpd512_mask3 ((__v8df) (A), \
+                                                 (__v8df) (B), (__v8df) (C), \
+                                                 (__mmask8) (U), (R)); })
+
+
+#define _mm512_maskz_fmaddsub_round_pd(U, A, B, C, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) (A), \
+                                                 (__v8df) (B), (__v8df) (C), \
+                                                 (__mmask8) (U), (R)); })
+
+
+#define _mm512_fmsubadd_round_pd(A, B, C, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) (A), \
+                                                (__v8df) (B), -(__v8df) (C), \
+                                                (__mmask8) -1, (R)); })
+
+
+#define _mm512_mask_fmsubadd_round_pd(A, U, B, C, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) (A), \
+                                                (__v8df) (B), -(__v8df) (C), \
+                                                (__mmask8) (U), (R)); })
+
+
+#define _mm512_maskz_fmsubadd_round_pd(U, A, B, C, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) (A), \
+                                                 (__v8df) (B), -(__v8df) (C), \
+                                                 (__mmask8) (U), (R)); })
+
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
+                                                       (__v8df) __B,
+                                                       (__v8df) __C,
+                                                       (__mmask8) -1,
+                                                       _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_fmaddsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
+                                                       (__v8df) __B,
+                                                       (__v8df) __C,
+                                                       (__mmask8) __U,
+                                                       _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask3_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
+{
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask3 ((__v8df) __A,
+                                                        (__v8df) __B,
+                                                        (__v8df) __C,
+                                                        (__mmask8) __U,
+                                                        _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_fmaddsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A,
+                                                        (__v8df) __B,
+                                                        (__v8df) __C,
+                                                        (__mmask8) __U,
+                                                        _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
+                                                       (__v8df) __B,
+                                                       -(__v8df) __C,
+                                                       (__mmask8) -1,
+                                                       _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_fmsubadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
+                                                       (__v8df) __B,
+                                                       -(__v8df) __C,
+                                                       (__mmask8) __U,
+                                                       _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_fmsubadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A,
+                                                        (__v8df) __B,
+                                                        -(__v8df) __C,
+                                                        (__mmask8) __U,
+                                                        _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_fmaddsub_round_ps(A, B, C, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) (A), \
+                                               (__v16sf) (B), (__v16sf) (C), \
+                                               (__mmask16) -1, (R)); })
+
+
+#define _mm512_mask_fmaddsub_round_ps(A, U, B, C, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) (A), \
+                                               (__v16sf) (B), (__v16sf) (C), \
+                                               (__mmask16) (U), (R)); })
+
+
+#define _mm512_mask3_fmaddsub_round_ps(A, B, C, U, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfmaddsubps512_mask3 ((__v16sf) (A), \
+                                                (__v16sf) (B), (__v16sf) (C), \
+                                                (__mmask16) (U), (R)); })
+
+
+#define _mm512_maskz_fmaddsub_round_ps(U, A, B, C, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) (A), \
+                                                (__v16sf) (B), (__v16sf) (C), \
+                                                (__mmask16) (U), (R)); })
+
+
+#define _mm512_fmsubadd_round_ps(A, B, C, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) (A), \
+                                               (__v16sf) (B), -(__v16sf) (C), \
+                                               (__mmask16) -1, (R)); })
+
+
+#define _mm512_mask_fmsubadd_round_ps(A, U, B, C, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) (A), \
+                                               (__v16sf) (B), -(__v16sf) (C), \
+                                               (__mmask16) (U), (R)); })
+
+
+#define _mm512_maskz_fmsubadd_round_ps(U, A, B, C, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) (A), \
+                                                (__v16sf) (B), -(__v16sf) (C), \
+                                                (__mmask16) (U), (R)); })
+
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
+                                                      (__v16sf) __B,
+                                                      (__v16sf) __C,
+                                                      (__mmask16) -1,
+                                                      _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_fmaddsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
+                                                      (__v16sf) __B,
+                                                      (__v16sf) __C,
+                                                      (__mmask16) __U,
+                                                      _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask3_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
+{
+  return (__m512) __builtin_ia32_vfmaddsubps512_mask3 ((__v16sf) __A,
+                                                       (__v16sf) __B,
+                                                       (__v16sf) __C,
+                                                       (__mmask16) __U,
+                                                       _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_fmaddsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A,
+                                                       (__v16sf) __B,
+                                                       (__v16sf) __C,
+                                                       (__mmask16) __U,
+                                                       _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
+                                                      (__v16sf) __B,
+                                                      -(__v16sf) __C,
+                                                      (__mmask16) -1,
+                                                      _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_fmsubadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
+                                                      (__v16sf) __B,
+                                                      -(__v16sf) __C,
+                                                      (__mmask16) __U,
+                                                      _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_fmsubadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A,
+                                                       (__v16sf) __B,
+                                                       -(__v16sf) __C,
+                                                       (__mmask16) __U,
+                                                       _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask3_fmsub_round_pd(A, B, C, U, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfmsubpd512_mask3 ((__v8df) (A), \
+                                              (__v8df) (B), (__v8df) (C), \
+                                              (__mmask8) (U), (R)); })
+
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask3_fmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
+{
+  return (__m512d) __builtin_ia32_vfmsubpd512_mask3 ((__v8df) __A,
+                                                     (__v8df) __B,
+                                                     (__v8df) __C,
+                                                     (__mmask8) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask3_fmsub_round_ps(A, B, C, U, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfmsubps512_mask3 ((__v16sf) (A), \
+                                             (__v16sf) (B), (__v16sf) (C), \
+                                             (__mmask16) (U), (R)); })
+
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask3_fmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
+{
+  return (__m512) __builtin_ia32_vfmsubps512_mask3 ((__v16sf) __A,
+                                                    (__v16sf) __B,
+                                                    (__v16sf) __C,
+                                                    (__mmask16) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask3_fmsubadd_round_pd(A, B, C, U, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfmsubaddpd512_mask3 ((__v8df) (A), \
+                                                 (__v8df) (B), (__v8df) (C), \
+                                                 (__mmask8) (U), (R)); })
+
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask3_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
+{
+  return (__m512d) __builtin_ia32_vfmsubaddpd512_mask3 ((__v8df) __A,
+                                                        (__v8df) __B,
+                                                        (__v8df) __C,
+                                                        (__mmask8) __U,
+                                                        _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask3_fmsubadd_round_ps(A, B, C, U, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfmsubaddps512_mask3 ((__v16sf) (A), \
+                                                (__v16sf) (B), (__v16sf) (C), \
+                                                (__mmask16) (U), (R)); })
+
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask3_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
+{
+  return (__m512) __builtin_ia32_vfmsubaddps512_mask3 ((__v16sf) __A,
+                                                       (__v16sf) __B,
+                                                       (__v16sf) __C,
+                                                       (__mmask16) __U,
+                                                       _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask_fnmadd_round_pd(A, U, B, C, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfnmaddpd512_mask ((__v8df) (A), \
+                                              (__v8df) (B), (__v8df) (C), \
+                                              (__mmask8) (U), (R)); })
+
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_fnmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfnmaddpd512_mask ((__v8df) __A,
+                                                     (__v8df) __B,
+                                                     (__v8df) __C,
+                                                     (__mmask8) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask_fnmadd_round_ps(A, U, B, C, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfnmaddps512_mask ((__v16sf) (A), \
+                                             (__v16sf) (B), (__v16sf) (C), \
+                                             (__mmask16) (U), (R)); })
+
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_fnmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfnmaddps512_mask ((__v16sf) __A,
+                                                    (__v16sf) __B,
+                                                    (__v16sf) __C,
+                                                    (__mmask16) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask_fnmsub_round_pd(A, U, B, C, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfnmsubpd512_mask ((__v8df) (A), \
+                                              (__v8df) (B), (__v8df) (C), \
+                                              (__mmask8) (U), (R)); })
+
+
+#define _mm512_mask3_fnmsub_round_pd(A, B, C, U, R) __extension__ ({ \
+  (__m512d) __builtin_ia32_vfnmsubpd512_mask3 ((__v8df) (A), \
+                                               (__v8df) (B), (__v8df) (C), \
+                                               (__mmask8) (U), (R)); })
+
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_fnmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfnmsubpd512_mask ((__v8df) __A,
+                                                     (__v8df) __B,
+                                                     (__v8df) __C,
+                                                     (__mmask8) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask3_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
+{
+  return (__m512d) __builtin_ia32_vfnmsubpd512_mask3 ((__v8df) __A,
+                                                      (__v8df) __B,
+                                                      (__v8df) __C,
+                                                      (__mmask8) __U,
+                                                      _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask_fnmsub_round_ps(A, U, B, C, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfnmsubps512_mask ((__v16sf) (A), \
+                                             (__v16sf) (B), (__v16sf) (C), \
+                                             (__mmask16) (U), (R)); })
+
+
+#define _mm512_mask3_fnmsub_round_ps(A, B, C, U, R) __extension__ ({ \
+  (__m512) __builtin_ia32_vfnmsubps512_mask3 ((__v16sf) (A), \
+                                              (__v16sf) (B), (__v16sf) (C), \
+                                              (__mmask16) (U), (R)); })
+
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_fnmsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfnmsubps512_mask ((__v16sf) __A,
+                                                    (__v16sf) __B,
+                                                    (__v16sf) __C,
+                                                    (__mmask16) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask3_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
+{
+  return (__m512) __builtin_ia32_vfnmsubps512_mask3 ((__v16sf) __A,
+                                                     (__v16sf) __B,
+                                                     (__v16sf) __C,
+                                                     (__mmask16) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
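+/* Usage sketch (editorial illustration, not part of the upstream header):
+ * fmaddsub alternates the sign of the addend across lanes: even lanes
+ * compute a*b - c, odd lanes a*b + c, and the maskz form zeroes every lane
+ * whose mask bit is clear.  The helper name below is hypothetical. */
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+__example_fmaddsub_low_half_ps(__m512 __a, __m512 __b, __m512 __c)
+{
+  /* Compute fmaddsub only in lanes 0..7; lanes 8..15 become 0.0f. */
+  return _mm512_maskz_fmaddsub_ps((__mmask16)0x00FF, __a, __b, __c);
+}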
+
+
+/* Vector permutations */
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_permutex2var_epi32(__m512i __A, __m512i __I, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermt2vard512_mask ((__v16si) __I
+                                                       /* idx */ ,
+                                                       (__v16si) __A,
+                                                       (__v16si) __B,
+                                                       (__mmask16) -1);
+}
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_permutex2var_epi64(__m512i __A, __m512i __I, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermt2varq512_mask ((__v8di) __I
+                                                       /* idx */ ,
+                                                       (__v8di) __A,
+                                                       (__v8di) __B,
+                                                       (__mmask8) -1);
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_permutex2var_pd(__m512d __A, __m512i __I, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_vpermt2varpd512_mask ((__v8di) __I
+                                                        /* idx */ ,
+                                                        (__v8df) __A,
+                                                        (__v8df) __B,
+                                                        (__mmask8) -1);
+}
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_permutex2var_ps(__m512 __A, __m512i __I, __m512 __B)
+{
+  return (__m512) __builtin_ia32_vpermt2varps512_mask ((__v16si) __I
+                                                       /* idx */ ,
+                                                       (__v16sf) __A,
+                                                       (__v16sf) __B,
+                                                       (__mmask16) -1);
+}
+
+#define _mm512_alignr_epi64(A, B, I) __extension__ ({ \
+  (__m512i)__builtin_ia32_alignq512_mask((__v8di)(__m512i)(A), \
+                                         (__v8di)(__m512i)(B), \
+                                         (I), (__v8di)_mm512_setzero_si512(), \
+                                         (__mmask8)-1); })
+
+#define _mm512_alignr_epi32(A, B, I) __extension__ ({ \
+  (__m512i)__builtin_ia32_alignd512_mask((__v16si)(__m512i)(A), \
+                                         (__v16si)(__m512i)(B), \
+                                         (I), (__v16si)_mm512_setzero_si512(), \
+                                         (__mmask16)-1); })
+
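+/* Usage sketch (editorial illustration, not part of the upstream header):
+ * in permutex2var the index vector selects per lane; values 0..15 pick
+ * from the first source and 16..31 from the second, so an all-zero index
+ * broadcasts lane 0 of the first source.  Helper name is hypothetical. */
+static __inline __m512 __DEFAULT_FN_ATTRS
+__example_broadcast_lane0_ps(__m512 __a, __m512 __b)
+{
+  return _mm512_permutex2var_ps(__a, _mm512_setzero_si512(), __b);
+}
+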
+/* Vector Extract */
+
+#define _mm512_extractf64x4_pd(A, I) __extension__ ({                    \
+      (__m256d)                                                          \
+        __builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A),           \
+                                         (I),                            \
+                                         (__v4df)_mm256_setzero_si256(), \
+                                         (__mmask8) -1); })
+
+#define _mm512_extractf32x4_ps(A, I) __extension__ ({                    \
+      (__m128)                                                           \
+        __builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A),           \
+                                         (I),                            \
+                                         (__v4sf)_mm_setzero_ps(),       \
+                                         (__mmask8) -1); })
+
+/* Vector Blend */
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_blend_pd(__mmask8 __U, __m512d __A, __m512d __W)
+{
+  return (__m512d) __builtin_ia32_blendmpd_512_mask ((__v8df) __A,
+                 (__v8df) __W,
+                 (__mmask8) __U);
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_blend_ps(__mmask16 __U, __m512 __A, __m512 __W)
+{
+  return (__m512) __builtin_ia32_blendmps_512_mask ((__v16sf) __A,
+                (__v16sf) __W,
+                (__mmask16) __U);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_blend_epi64(__mmask8 __U, __m512i __A, __m512i __W)
+{
+  return (__m512i) __builtin_ia32_blendmq_512_mask ((__v8di) __A,
+                (__v8di) __W,
+                (__mmask8) __U);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_blend_epi32(__mmask16 __U, __m512i __A, __m512i __W)
+{
+  return (__m512i) __builtin_ia32_blendmd_512_mask ((__v16si) __A,
+                (__v16si) __W,
+                (__mmask16) __U);
+}
+
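+/* Usage sketch (editorial illustration, not part of the upstream header):
+ * in the blend intrinsics a set mask bit selects the lane from the second
+ * vector operand (__W) and a clear bit keeps the lane from the first
+ * (__A).  Helper name is hypothetical. */
+static __inline __m512 __DEFAULT_FN_ATTRS
+__example_take_odd_lanes_ps(__m512 __a, __m512 __w)
+{
+  /* Lanes 1, 3, 5, ... come from __w; the even lanes stay from __a. */
+  return _mm512_mask_blend_ps((__mmask16)0xAAAA, __a, __w);
+}
+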
+/* Compare */
+
+#define _mm512_cmp_round_ps_mask(A, B, P, R) __extension__ ({ \
+  (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)(__m512)(A), \
+                                          (__v16sf)(__m512)(B), \
+                                          (P), (__mmask16)-1, (R)); })
+
+#define _mm512_mask_cmp_round_ps_mask(U, A, B, P, R) __extension__ ({ \
+  (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)(__m512)(A), \
+                                          (__v16sf)(__m512)(B), \
+                                          (P), (__mmask16)(U), (R)); })
+
+#define _mm512_cmp_ps_mask(A, B, P) \
+  _mm512_cmp_round_ps_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_cmp_ps_mask(U, A, B, P) \
+  _mm512_mask_cmp_round_ps_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_cmp_round_pd_mask(A, B, P, R) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)(__m512d)(A), \
+                                         (__v8df)(__m512d)(B), \
+                                         (P), (__mmask8)-1, (R)); })
+
+#define _mm512_mask_cmp_round_pd_mask(U, A, B, P, R) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)(__m512d)(A), \
+                                         (__v8df)(__m512d)(B), \
+                                         (P), (__mmask8)(U), (R)); })
+
+#define _mm512_cmp_pd_mask(A, B, P) \
+  _mm512_cmp_round_pd_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_cmp_pd_mask(U, A, B, P) \
+  _mm512_mask_cmp_round_pd_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)
+
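+/* Usage sketch (editorial illustration, not part of the upstream header):
+ * combine a float compare with a blend to keep only the lanes where
+ * __a < __b.  Predicate value 1 is the ordered less-than encoding
+ * (_CMP_LT_OS) shared with the older _mm_cmp_ps intrinsics. */
+static __inline __m512 __DEFAULT_FN_ATTRS
+__example_keep_lt_ps(__m512 __a, __m512 __b)
+{
+  __mmask16 __lt = _mm512_cmp_ps_mask(__a, __b, 1);
+  return _mm512_mask_blend_ps(__lt, _mm512_setzero_ps(), __a);
+}
+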
+/* Conversion */
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_cvttps_epu32(__m512 __A)
+{
+  return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
+                  (__v16si)
+                  _mm512_setzero_si512 (),
+                  (__mmask16) -1,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundepi32_ps(A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(A), \
+                                          (__v16sf)_mm512_setzero_ps(), \
+                                          (__mmask16)-1, (R)); })
+
+#define _mm512_cvt_roundepu32_ps(A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(A), \
+                                           (__v16sf)_mm512_setzero_ps(), \
+                                           (__mmask16)-1, (R)); })
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_cvtepi32_pd(__m256i __A)
+{
+  return (__m512d) __builtin_ia32_cvtdq2pd512_mask ((__v8si) __A,
+                (__v8df)
+                _mm512_setzero_pd (),
+                (__mmask8) -1);
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_cvtepu32_pd(__m256i __A)
+{
+  return (__m512d) __builtin_ia32_cvtudq2pd512_mask ((__v8si) __A,
+                (__v8df)
+                _mm512_setzero_pd (),
+                (__mmask8) -1);
+}
+
+#define _mm512_cvt_roundpd_ps(A, R) __extension__ ({ \
+  (__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(A), \
+                                          (__v8sf)_mm256_setzero_ps(), \
+                                          (__mmask8)-1, (R)); })
+
+#define _mm512_cvtps_ph(A, I) __extension__ ({ \
+  (__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(A), (I), \
+                                            (__v16hi)_mm256_setzero_si256(), \
+                                            -1); })
+
+static  __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_cvtph_ps(__m256i __A)
+{
+  return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
+                (__v16sf)
+                _mm512_setzero_ps (),
+                (__mmask16) -1,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_cvttps_epi32(__m512 __a)
+{
+  return (__m512i)
+    __builtin_ia32_cvttps2dq512_mask((__v16sf) __a,
+                                     (__v16si) _mm512_setzero_si512 (),
+                                     (__mmask16) -1, _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm512_cvttpd_epi32(__m512d __a)
+{
+  return (__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df) __a,
+                                                   (__v8si)_mm256_setzero_si256(),
+                                                   (__mmask8) -1,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvtt_roundpd_epi32(A, R) __extension__ ({ \
+  (__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(A), \
+                                            (__v8si)_mm256_setzero_si256(), \
+                                            (__mmask8)-1, (R)); })
+
+#define _mm512_cvtt_roundps_epi32(A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(A), \
+                                            (__v16si)_mm512_setzero_si512(), \
+                                            (__mmask16)-1, (R)); })
+
+#define _mm512_cvt_roundps_epi32(A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(A), \
+                                           (__v16si)_mm512_setzero_si512(), \
+                                           (__mmask16)-1, (R)); })
+
+#define _mm512_cvt_roundpd_epi32(A, R) __extension__ ({ \
+  (__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(A), \
+                                           (__v8si)_mm256_setzero_si256(), \
+                                           (__mmask8)-1, (R)); })
+
+#define _mm512_cvt_roundps_epu32(A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(A), \
+                                            (__v16si)_mm512_setzero_si512(), \
+                                            (__mmask16)-1, (R)); })
+
+#define _mm512_cvt_roundpd_epu32(A, R) __extension__ ({ \
+  (__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(A), \
+                                            (__v8si)_mm256_setzero_si256(), \
+                                            (__mmask8) -1, (R)); })
+
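+/* Usage sketch (editorial illustration, not part of the upstream header):
+ * the *_round_* conversion macros only add an explicit rounding/SAE
+ * argument; passing _MM_FROUND_CUR_DIRECTION gives the same result as the
+ * plain form.  Helper name is hypothetical. */
+static __inline __m256i __DEFAULT_FN_ATTRS
+__example_truncate_pd_to_epi32(__m512d __v)
+{
+  /* Same as _mm512_cvttpd_epi32(__v). */
+  return _mm512_cvtt_roundpd_epi32(__v, _MM_FROUND_CUR_DIRECTION);
+}
+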
+/* Unpack and Interleave */
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_unpackhi_pd(__m512d __a, __m512d __b)
+{
+  return __builtin_shufflevector(__a, __b, 1, 9, 1+2, 9+2, 1+4, 9+4, 1+6, 9+6);
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_unpacklo_pd(__m512d __a, __m512d __b)
+{
+  return __builtin_shufflevector(__a, __b, 0, 8, 0+2, 8+2, 0+4, 8+4, 0+6, 8+6);
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_unpackhi_ps(__m512 __a, __m512 __b)
+{
+  return __builtin_shufflevector(__a, __b,
+                                 2,    18,    3,    19,
+                                 2+4,  18+4,  3+4,  19+4,
+                                 2+8,  18+8,  3+8,  19+8,
+                                 2+12, 18+12, 3+12, 19+12);
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_unpacklo_ps(__m512 __a, __m512 __b)
+{
+  return __builtin_shufflevector(__a, __b,
+                                 0,    16,    1,    17,
+                                 0+4,  16+4,  1+4,  17+4,
+                                 0+8,  16+8,  1+8,  17+8,
+                                 0+12, 16+12, 1+12, 17+12);
+}
+
+/* Bit Test */
+
+static __inline __mmask16 __DEFAULT_FN_ATTRS
+_mm512_test_epi32_mask(__m512i __A, __m512i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestmd512 ((__v16si) __A,
+            (__v16si) __B,
+            (__mmask16) -1);
+}
+
+static __inline __mmask8 __DEFAULT_FN_ATTRS
+_mm512_test_epi64_mask(__m512i __A, __m512i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestmq512 ((__v8di) __A,
+                 (__v8di) __B,
+                 (__mmask8) -1);
+}
+
+/* SIMD load ops */
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_loadu_epi32(__mmask16 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_loaddqusi512_mask ((const __v16si *)__P,
+                                                     (__v16si)
+                                                     _mm512_setzero_si512 (),
+                                                     (__mmask16) __U);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_loadu_epi64(__mmask8 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_loaddqudi512_mask ((const __v8di *)__P,
+                                                     (__v8di)
+                                                     _mm512_setzero_si512 (),
+                                                     (__mmask8) __U);
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_loadu_ps(__mmask16 __U, void const *__P)
+{
+  return (__m512) __builtin_ia32_loadups512_mask ((const __v16sf *)__P,
+                                                  (__v16sf)
+                                                  _mm512_setzero_ps (),
+                                                  (__mmask16) __U);
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_loadu_pd(__mmask8 __U, void const *__P)
+{
+  return (__m512d) __builtin_ia32_loadupd512_mask ((const __v8df *)__P,
+                                                   (__v8df)
+                                                   _mm512_setzero_pd (),
+                                                   (__mmask8) __U);
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_load_ps(__mmask16 __U, void const *__P)
+{
+  return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *)__P,
+                                                  (__v16sf)
+                                                  _mm512_setzero_ps (),
+                                                  (__mmask16) __U);
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_load_pd(__mmask8 __U, void const *__P)
+{
+  return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *)__P,
+                                                   (__v8df)
+                                                   _mm512_setzero_pd (),
+                                                   (__mmask8) __U);
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_loadu_pd(double const *__p)
+{
+  struct __loadu_pd {
+    __m512d __v;
+  } __attribute__((__packed__, __may_alias__));
+  return ((struct __loadu_pd*)__p)->__v;
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_loadu_ps(float const *__p)
+{
+  struct __loadu_ps {
+    __m512 __v;
+  } __attribute__((__packed__, __may_alias__));
+  return ((struct __loadu_ps*)__p)->__v;
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_load_ps(float const *__p)
+{
+  return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *)__p,
+                                                  (__v16sf)
+                                                  _mm512_setzero_ps (),
+                                                  (__mmask16) -1);
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_load_pd(double const *__p)
+{
+  return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *)__p,
+                                                   (__v8df)
+                                                   _mm512_setzero_pd (),
+                                                   (__mmask8) -1);
+}
+
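+/* Usage sketch (editorial illustration, not part of the upstream header):
+ * a zero-masked unaligned load reads only the selected lanes from memory
+ * and zeroes the rest, so a buffer shorter than 64 bytes can be loaded
+ * safely.  Helper name is hypothetical. */
+static __inline __m512 __DEFAULT_FN_ATTRS
+__example_load_low8_ps(float const *__p)
+{
+  /* Only __p[0..7] are read; lanes 8..15 of the result are 0.0f. */
+  return _mm512_maskz_loadu_ps((__mmask16)0x00FF, __p);
+}
+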
+/* SIMD store ops */
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm512_mask_storeu_epi64(void *__P, __mmask8 __U, __m512i __A)
+{
+  __builtin_ia32_storedqudi512_mask ((__v8di *)__P, (__v8di) __A,
+                                     (__mmask8) __U);
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm512_mask_storeu_epi32(void *__P, __mmask16 __U, __m512i __A)
+{
+  __builtin_ia32_storedqusi512_mask ((__v16si *)__P, (__v16si) __A,
+                                     (__mmask16) __U);
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm512_mask_storeu_pd(void *__P, __mmask8 __U, __m512d __A)
+{
+  __builtin_ia32_storeupd512_mask ((__v8df *)__P, (__v8df) __A, (__mmask8) __U);
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm512_storeu_pd(void *__P, __m512d __A)
+{
+  __builtin_ia32_storeupd512_mask((__v8df *)__P, (__v8df)__A, (__mmask8)-1);
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm512_mask_storeu_ps(void *__P, __mmask16 __U, __m512 __A)
+{
+  __builtin_ia32_storeups512_mask ((__v16sf *)__P, (__v16sf) __A,
+                                   (__mmask16) __U);
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm512_storeu_ps(void *__P, __m512 __A)
+{
+  __builtin_ia32_storeups512_mask((__v16sf *)__P, (__v16sf)__A, (__mmask16)-1);
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm512_mask_store_pd(void *__P, __mmask8 __U, __m512d __A)
+{
+  __builtin_ia32_storeapd512_mask ((__v8df *)__P, (__v8df) __A, (__mmask8) __U);
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm512_store_pd(void *__P, __m512d __A)
+{
+  *(__m512d*)__P = __A;
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm512_mask_store_ps(void *__P, __mmask16 __U, __m512 __A)
+{
+  __builtin_ia32_storeaps512_mask ((__v16sf *)__P, (__v16sf) __A,
+                                   (__mmask16) __U);
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm512_store_ps(void *__P, __m512 __A)
+{
+  *(__m512*)__P = __A;
+}
+
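+/* Usage sketch (editorial illustration, not part of the upstream header):
+ * a masked store writes only the selected lanes and leaves the other
+ * elements in memory untouched.  Helper name is hypothetical. */
+static __inline void __DEFAULT_FN_ATTRS
+__example_store_low8_ps(float *__p, __m512 __v)
+{
+  /* Writes __p[0..7] only; __p[8..15] keep their previous values. */
+  _mm512_mask_storeu_ps(__p, (__mmask16)0x00FF, __v);
+}
+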
+/* Mask ops */
+
+static __inline __mmask16 __DEFAULT_FN_ATTRS
+_mm512_knot(__mmask16 __M)
+{
+  return __builtin_ia32_knothi(__M);
+}
+
+/* Integer compare */
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_cmpeq_epi32_mask(__m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_pcmpeqd512_mask((__v16si)__a, (__v16si)__b,
+                                                   (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpeq_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_pcmpeqd512_mask((__v16si)__a, (__v16si)__b,
+                                                   __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_cmpeq_epu32_mask(__m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 0,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpeq_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 0,
+                                                 __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpeq_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_pcmpeqq512_mask((__v8di)__a, (__v8di)__b,
+                                                  __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_cmpeq_epi64_mask(__m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_pcmpeqq512_mask((__v8di)__a, (__v8di)__b,
+                                                  (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_cmpeq_epu64_mask(__m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 0,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpeq_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 0,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_cmpge_epi32_mask(__m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 5,
+                                                (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpge_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 5,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_cmpge_epu32_mask(__m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 5,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpge_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 5,
+                                                 __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_cmpge_epi64_mask(__m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 5,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpge_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 5,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_cmpge_epu64_mask(__m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 5,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpge_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 5,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_cmpgt_epi32_mask(__m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_pcmpgtd512_mask((__v16si)__a, (__v16si)__b,
+                                                   (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpgt_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_pcmpgtd512_mask((__v16si)__a, (__v16si)__b,
+                                                   __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_cmpgt_epu32_mask(__m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 6,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpgt_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 6,
+                                                 __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpgt_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_pcmpgtq512_mask((__v8di)__a, (__v8di)__b,
+                                                  __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_cmpgt_epi64_mask(__m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_pcmpgtq512_mask((__v8di)__a, (__v8di)__b,
+                                                  (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_cmpgt_epu64_mask(__m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 6,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpgt_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 6,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_cmple_epi32_mask(__m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 2,
+                                                (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_mask_cmple_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 2,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_cmple_epu32_mask(__m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 2,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_mask_cmple_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 2,
+                                                 __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_cmple_epi64_mask(__m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 2,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_mask_cmple_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 2,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_cmple_epu64_mask(__m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 2,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_mask_cmple_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 2,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_cmplt_epi32_mask(__m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 1,
+                                                (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_mask_cmplt_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 1,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_cmplt_epu32_mask(__m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 1,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_mask_cmplt_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 1,
+                                                 __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_cmplt_epi64_mask(__m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 1,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_mask_cmplt_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 1,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_cmplt_epu64_mask(__m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 1,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_mask_cmplt_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 1,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_cmpneq_epi32_mask(__m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 4,
+                                                (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpneq_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 4,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_cmpneq_epu32_mask(__m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 4,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpneq_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 4,
+                                                 __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_cmpneq_epi64_mask(__m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 4,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpneq_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 4,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_cmpneq_epu64_mask(__m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 4,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpneq_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 4,
+                                                __u);
+}
+
+#define _mm512_cmp_epi32_mask(a, b, p) __extension__ ({ \
+  (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)(__m512i)(a), \
+                                         (__v16si)(__m512i)(b), (p), \
+                                         (__mmask16)-1); })
+
+#define _mm512_cmp_epu32_mask(a, b, p) __extension__ ({ \
+  (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)(__m512i)(a), \
+                                          (__v16si)(__m512i)(b), (p), \
+                                          (__mmask16)-1); })
+
+#define _mm512_cmp_epi64_mask(a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)(__m512i)(a), \
+                                        (__v8di)(__m512i)(b), (p), \
+                                        (__mmask8)-1); })
+
+#define _mm512_cmp_epu64_mask(a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)(__m512i)(a), \
+                                         (__v8di)(__m512i)(b), (p), \
+                                         (__mmask8)-1); })
+
+#define _mm512_mask_cmp_epi32_mask(m, a, b, p) __extension__ ({ \
+  (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)(__m512i)(a), \
+                                         (__v16si)(__m512i)(b), (p), \
+                                         (__mmask16)(m)); })
+
+#define _mm512_mask_cmp_epu32_mask(m, a, b, p) __extension__ ({ \
+  (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)(__m512i)(a), \
+                                          (__v16si)(__m512i)(b), (p), \
+                                          (__mmask16)(m)); })
+
+#define _mm512_mask_cmp_epi64_mask(m, a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)(__m512i)(a), \
+                                        (__v8di)(__m512i)(b), (p), \
+                                        (__mmask8)(m)); })
+
+#define _mm512_mask_cmp_epu64_mask(m, a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)(__m512i)(a), \
+                                         (__v8di)(__m512i)(b), (p), \
+                                         (__mmask8)(m)); })
+
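+/* Usage sketch (editorial illustration, not part of the upstream header):
+ * the predicate values accepted by the generic _mm512_cmp_ep*_mask macros
+ * are the same constants used by the named wrappers above: 0 = eq, 1 = lt,
+ * 2 = le, 4 = neq, 5 = ge, 6 = gt. */
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+__example_cmpgt_epu32(__m512i __a, __m512i __b)
+{
+  /* Same result as _mm512_cmpgt_epu32_mask(__a, __b). */
+  return _mm512_cmp_epu32_mask(__a, __b, 6);
+}
+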
+#undef __DEFAULT_FN_ATTRS
+
+#endif // __AVX512FINTRIN_H
diff --git a/25.0.2/clang-include/avx512vlbwintrin.h b/25.0.2/clang-include/avx512vlbwintrin.h
new file mode 100644
index 0000000..b4542d6
--- /dev/null
+++ b/25.0.2/clang-include/avx512vlbwintrin.h
@@ -0,0 +1,2336 @@
+/*===---- avx512vlbwintrin.h - AVX512VL and AVX512BW intrinsics ------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512vlbwintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512VLBWINTRIN_H
+#define __AVX512VLBWINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512bw")))
+
+/* Integer compare */
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_cmpeq_epi8_mask(__m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_pcmpeqb128_mask((__v16qi)__a, (__v16qi)__b,
+                                                   (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_mask_cmpeq_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_pcmpeqb128_mask((__v16qi)__a, (__v16qi)__b,
+                                                   __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_cmpeq_epu8_mask(__m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 0,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_mask_cmpeq_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 0,
+                                                 __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_cmpeq_epi8_mask(__m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_pcmpeqb256_mask((__v32qi)__a, (__v32qi)__b,
+                                                   (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpeq_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_pcmpeqb256_mask((__v32qi)__a, (__v32qi)__b,
+                                                   __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_cmpeq_epu8_mask(__m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 0,
+                                                 (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpeq_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 0,
+                                                 __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpeq_epi16_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_pcmpeqw128_mask((__v8hi)__a, (__v8hi)__b,
+                                                  (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpeq_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_pcmpeqw128_mask((__v8hi)__a, (__v8hi)__b,
+                                                  __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpeq_epu16_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 0,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpeq_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 0,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_cmpeq_epi16_mask(__m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_pcmpeqw256_mask((__v16hi)__a, (__v16hi)__b,
+                                                   (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpeq_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_pcmpeqw256_mask((__v16hi)__a, (__v16hi)__b,
+                                                   __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_cmpeq_epu16_mask(__m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 0,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpeq_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 0,
+                                                 __u);
+}
+
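+/* Usage sketch (editorial illustration, not part of the upstream header):
+ * a 128-bit byte compare yields one mask bit per byte lane, so all sixteen
+ * bits set means the vectors are equal.  Helper name is hypothetical. */
+static __inline__ int __DEFAULT_FN_ATTRS
+__example_all_bytes_equal(__m128i __a, __m128i __b)
+{
+  return _mm_cmpeq_epi8_mask(__a, __b) == (__mmask16)0xFFFF;
+}
+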
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_cmpge_epi8_mask(__m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 5,
+                                                (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_mask_cmpge_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 5,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_cmpge_epu8_mask(__m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 5,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_mask_cmpge_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 5,
+                                                 __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_cmpge_epi8_mask(__m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 5,
+                                                (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpge_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 5,
+                                                __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_cmpge_epu8_mask(__m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 5,
+                                                 (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpge_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 5,
+                                                 __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpge_epi16_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 5,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpge_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 5,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpge_epu16_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 5,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpge_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 5,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_cmpge_epi16_mask(__m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 5,
+                                                (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpge_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 5,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_cmpge_epu16_mask(__m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 5,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpge_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 5,
+                                                 __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_cmpgt_epi8_mask(__m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_pcmpgtb128_mask((__v16qi)__a, (__v16qi)__b,
+                                                   (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_mask_cmpgt_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_pcmpgtb128_mask((__v16qi)__a, (__v16qi)__b,
+                                                   __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_cmpgt_epu8_mask(__m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 6,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_mask_cmpgt_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 6,
+                                                 __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_cmpgt_epi8_mask(__m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_pcmpgtb256_mask((__v32qi)__a, (__v32qi)__b,
+                                                   (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpgt_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_pcmpgtb256_mask((__v32qi)__a, (__v32qi)__b,
+                                                   __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_cmpgt_epu8_mask(__m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 6,
+                                                 (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpgt_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 6,
+                                                 __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpgt_epi16_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_pcmpgtw128_mask((__v8hi)__a, (__v8hi)__b,
+                                                  (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpgt_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_pcmpgtw128_mask((__v8hi)__a, (__v8hi)__b,
+                                                  __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpgt_epu16_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 6,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpgt_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 6,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_cmpgt_epi16_mask(__m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_pcmpgtw256_mask((__v16hi)__a, (__v16hi)__b,
+                                                   (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpgt_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_pcmpgtw256_mask((__v16hi)__a, (__v16hi)__b,
+                                                   __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_cmpgt_epu16_mask(__m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 6,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpgt_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 6,
+                                                 __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_cmple_epi8_mask(__m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 2,
+                                                (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_mask_cmple_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 2,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_cmple_epu8_mask(__m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 2,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_mask_cmple_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 2,
+                                                 __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_cmple_epi8_mask(__m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 2,
+                                                (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_mask_cmple_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 2,
+                                                __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_cmple_epu8_mask(__m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 2,
+                                                 (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_mask_cmple_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 2,
+                                                 __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmple_epi16_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 2,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmple_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 2,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmple_epu16_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 2,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmple_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 2,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_cmple_epi16_mask(__m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 2,
+                                                (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_mask_cmple_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 2,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_cmple_epu16_mask(__m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 2,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_mask_cmple_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 2,
+                                                 __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_cmplt_epi8_mask(__m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 1,
+                                                (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_mask_cmplt_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 1,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_cmplt_epu8_mask(__m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 1,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_mask_cmplt_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 1,
+                                                 __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_cmplt_epi8_mask(__m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 1,
+                                                (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_mask_cmplt_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 1,
+                                                __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_cmplt_epu8_mask(__m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 1,
+                                                 (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_mask_cmplt_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 1,
+                                                 __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmplt_epi16_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 1,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmplt_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 1,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmplt_epu16_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 1,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmplt_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 1,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_cmplt_epi16_mask(__m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 1,
+                                                (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_mask_cmplt_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 1,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_cmplt_epu16_mask(__m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 1,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_mask_cmplt_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 1,
+                                                 __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_cmpneq_epi8_mask(__m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 4,
+                                                (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_mask_cmpneq_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 4,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_cmpneq_epu8_mask(__m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 4,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_mask_cmpneq_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 4,
+                                                 __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_cmpneq_epi8_mask(__m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 4,
+                                                (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpneq_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 4,
+                                                __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_cmpneq_epu8_mask(__m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 4,
+                                                 (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpneq_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 4,
+                                                 __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpneq_epi16_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 4,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpneq_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 4,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpneq_epu16_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 4,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpneq_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 4,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_cmpneq_epi16_mask(__m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 4,
+                                                (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpneq_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 4,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_cmpneq_epu16_mask(__m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 4,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpneq_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 4,
+                                                 __u);
+}
+
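+/* Editorial note, not part of the upstream header: the third argument to the
+ * __builtin_ia32_[u]cmp{b,w}{128,256}_mask builtins above is the AVX-512
+ * comparison predicate (_MM_CMPINT_*): 0 = EQ, 1 = LT, 2 = LE, 4 = NE,
+ * 5 = NLT (>=), 6 = NLE (>).  The cmp* and ucmp* forms differ only in whether
+ * the elements are compared as signed or unsigned values.
+ *
+ * For the intrinsics that follow, the _mm*_mask_* variants merge: result
+ * elements whose mask bit is 0 are taken from the extra __W source operand,
+ * while the _mm*_maskz_* variants zero those elements instead.
+ */
+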
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_add_epi8 (__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_paddb256_mask ((__v32qi) __A,
+             (__v32qi) __B,
+             (__v32qi) __W,
+             (__mmask32) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_add_epi8 (__mmask32 __U, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_paddb256_mask ((__v32qi) __A,
+             (__v32qi) __B,
+             (__v32qi)
+             _mm256_setzero_si256 (),
+             (__mmask32) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_add_epi16 (__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_paddw256_mask ((__v16hi) __A,
+             (__v16hi) __B,
+             (__v16hi) __W,
+             (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_add_epi16 (__mmask16 __U, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_paddw256_mask ((__v16hi) __A,
+             (__v16hi) __B,
+             (__v16hi)
+             _mm256_setzero_si256 (),
+             (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_sub_epi8 (__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_psubb256_mask ((__v32qi) __A,
+             (__v32qi) __B,
+             (__v32qi) __W,
+             (__mmask32) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_sub_epi8 (__mmask32 __U, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_psubb256_mask ((__v32qi) __A,
+             (__v32qi) __B,
+             (__v32qi)
+             _mm256_setzero_si256 (),
+             (__mmask32) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_sub_epi16 (__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_psubw256_mask ((__v16hi) __A,
+             (__v16hi) __B,
+             (__v16hi) __W,
+             (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_sub_epi16 (__mmask16 __U, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_psubw256_mask ((__v16hi) __A,
+             (__v16hi) __B,
+             (__v16hi)
+             _mm256_setzero_si256 (),
+             (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_add_epi8 (__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_paddb128_mask ((__v16qi) __A,
+             (__v16qi) __B,
+             (__v16qi) __W,
+             (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_add_epi8 (__mmask16 __U, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_paddb128_mask ((__v16qi) __A,
+             (__v16qi) __B,
+             (__v16qi)
+             _mm_setzero_si128 (),
+             (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_add_epi16 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_paddw128_mask ((__v8hi) __A,
+             (__v8hi) __B,
+             (__v8hi) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_add_epi16 (__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_paddw128_mask ((__v8hi) __A,
+             (__v8hi) __B,
+             (__v8hi)
+             _mm_setzero_si128 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_sub_epi8 (__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_psubb128_mask ((__v16qi) __A,
+             (__v16qi) __B,
+             (__v16qi) __W,
+             (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_sub_epi8 (__mmask16 __U, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_psubb128_mask ((__v16qi) __A,
+             (__v16qi) __B,
+             (__v16qi)
+             _mm_setzero_si128 (),
+             (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_sub_epi16 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_psubw128_mask ((__v8hi) __A,
+             (__v8hi) __B,
+             (__v8hi) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_sub_epi16 (__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_psubw128_mask ((__v8hi) __A,
+             (__v8hi) __B,
+             (__v8hi)
+             _mm_setzero_si128 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_mullo_epi16 (__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pmullw256_mask ((__v16hi) __A,
+              (__v16hi) __B,
+              (__v16hi) __W,
+              (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_mullo_epi16 (__mmask16 __U, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pmullw256_mask ((__v16hi) __A,
+              (__v16hi) __B,
+              (__v16hi)
+              _mm256_setzero_si256 (),
+              (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_mullo_epi16 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pmullw128_mask ((__v8hi) __A,
+              (__v8hi) __B,
+              (__v8hi) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_mullo_epi16 (__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pmullw128_mask ((__v8hi) __A,
+              (__v8hi) __B,
+              (__v8hi)
+              _mm_setzero_si128 (),
+              (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_blend_epi8 (__mmask16 __U, __m128i __A, __m128i __W)
+{
+  return (__m128i) __builtin_ia32_blendmb_128_mask ((__v16qi) __A,
+               (__v16qi) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_blend_epi8 (__mmask32 __U, __m256i __A, __m256i __W)
+{
+  return (__m256i) __builtin_ia32_blendmb_256_mask ((__v32qi) __A,
+               (__v32qi) __W,
+               (__mmask32) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_blend_epi16 (__mmask8 __U, __m128i __A, __m128i __W)
+{
+  return (__m128i) __builtin_ia32_blendmw_128_mask ((__v8hi) __A,
+               (__v8hi) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_blend_epi16 (__mmask16 __U, __m256i __A, __m256i __W)
+{
+  return (__m256i) __builtin_ia32_blendmw_256_mask ((__v16hi) __A,
+               (__v16hi) __W,
+               (__mmask16) __U);
+}
+
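+/* Editorial example, not part of the upstream header: a hypothetical helper
+ * showing how a compare mask combines with a masked blend.  It assumes
+ * _mm_cmpgt_epi8_mask() is declared earlier in this header, as it is
+ * upstream, and reuses the header's __DEFAULT_FN_ATTRS macro.  Per byte it
+ * picks __a where __a > __b (signed), otherwise __b, i.e. a signed per-byte
+ * maximum. */
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+__editorial_example_max_epi8 (__m128i __a, __m128i __b)
+{
+  /* Mask bit i is set when byte i of __a is greater than byte i of __b. */
+  __mmask16 __gt = _mm_cmpgt_epi8_mask (__a, __b);
+  /* Blend: take __a where the mask bit is set, otherwise __b. */
+  return _mm_mask_blend_epi8 (__gt, __b, __a);
+}
+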
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_abs_epi8 (__m128i __W, __mmask16 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pabsb128_mask ((__v16qi) __A,
+               (__v16qi) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_abs_epi8 (__mmask16 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pabsb128_mask ((__v16qi) __A,
+               (__v16qi) _mm_setzero_si128 (),
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_abs_epi8 (__m256i __W, __mmask32 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_pabsb256_mask ((__v32qi) __A,
+               (__v32qi) __W,
+               (__mmask32) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_abs_epi8 (__mmask32 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_pabsb256_mask ((__v32qi) __A,
+               (__v32qi) _mm256_setzero_si256 (),
+               (__mmask32) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_abs_epi16 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pabsw128_mask ((__v8hi) __A,
+               (__v8hi) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_abs_epi16 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pabsw128_mask ((__v8hi) __A,
+               (__v8hi) _mm_setzero_si128 (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_abs_epi16 (__m256i __W, __mmask16 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_pabsw256_mask ((__v16hi) __A,
+               (__v16hi) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_abs_epi16 (__mmask16 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_pabsw256_mask ((__v16hi) __A,
+               (__v16hi) _mm256_setzero_si256 (),
+               (__mmask16) __U);
+}
+
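+/* Editorial note, not part of the upstream header: the packs_* forms below
+ * narrow with signed saturation and the packus_* forms with unsigned
+ * saturation (epi32 -> 16-bit, epi16 -> 8-bit).  Within each 128-bit lane the
+ * packed elements from __A precede those from __B, and the write mask applies
+ * per narrowed result element. */
+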
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_packs_epi32 (__mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_packssdw128_mask ((__v4si) __A,
+               (__v4si) __B,
+               (__v8hi) _mm_setzero_si128 (), __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_packs_epi32 (__m128i __W, __mmask8 __M, __m128i __A,
+          __m128i __B)
+{
+  return (__m128i) __builtin_ia32_packssdw128_mask ((__v4si) __A,
+               (__v4si) __B,
+               (__v8hi) __W, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_packs_epi32 (__mmask16 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_packssdw256_mask ((__v8si) __A,
+               (__v8si) __B,
+               (__v16hi) _mm256_setzero_si256 (),
+               __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_packs_epi32 (__m256i __W, __mmask16 __M, __m256i __A,
+       __m256i __B)
+{
+  return (__m256i) __builtin_ia32_packssdw256_mask ((__v8si) __A,
+               (__v8si) __B,
+               (__v16hi) __W, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_packs_epi16 (__mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_packsswb128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v16qi) _mm_setzero_si128 (),
+               __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_packs_epi16 (__m128i __W, __mmask16 __M, __m128i __A,
+          __m128i __B)
+{
+  return (__m128i) __builtin_ia32_packsswb128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v16qi) __W,
+               __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_packs_epi16 (__mmask32 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_packsswb256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v32qi) _mm256_setzero_si256 (),
+               __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_packs_epi16 (__m256i __W, __mmask32 __M, __m256i __A,
+       __m256i __B)
+{
+  return (__m256i) __builtin_ia32_packsswb256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v32qi) __W,
+               __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_packus_epi32 (__mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_packusdw128_mask ((__v4si) __A,
+               (__v4si) __B,
+               (__v8hi) _mm_setzero_si128 (),
+               __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_packus_epi32 (__m128i __W, __mmask8 __M, __m128i __A,
+           __m128i __B)
+{
+  return (__m128i) __builtin_ia32_packusdw128_mask ((__v4si) __A,
+               (__v4si) __B,
+               (__v8hi) __W, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_packus_epi32 (__mmask16 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_packusdw256_mask ((__v8si) __A,
+               (__v8si) __B,
+               (__v16hi) _mm256_setzero_si256 (),
+               __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_packus_epi32 (__m256i __W, __mmask16 __M, __m256i __A,
+        __m256i __B)
+{
+  return (__m256i) __builtin_ia32_packusdw256_mask ((__v8si) __A,
+               (__v8si) __B,
+               (__v16hi) __W,
+               __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_packus_epi16 (__mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_packuswb128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v16qi) _mm_setzero_si128 (),
+               __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_packus_epi16 (__m128i __W, __mmask16 __M, __m128i __A,
+           __m128i __B)
+{
+  return (__m128i) __builtin_ia32_packuswb128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v16qi) __W,
+               __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_packus_epi16 (__mmask32 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_packuswb256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v32qi) _mm256_setzero_si256 (),
+               __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_packus_epi16 (__m256i __W, __mmask32 __M, __m256i __A,
+        __m256i __B)
+{
+  return (__m256i) __builtin_ia32_packuswb256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v32qi) __W,
+               __M);
+}
+
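+/* Editorial note, not part of the upstream header: the adds_* and subs_*
+ * forms below use saturating arithmetic (signed for epi8/epi16, unsigned for
+ * epu8/epu16), and avg_epu* computes the rounded unsigned average
+ * (a + b + 1) >> 1 per element. */
+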
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_adds_epi8 (__m128i __W, __mmask16 __U, __m128i __A,
+        __m128i __B)
+{
+  return (__m128i) __builtin_ia32_paddsb128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_adds_epi8 (__mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_paddsb128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) _mm_setzero_si128 (),
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_adds_epi8 (__m256i __W, __mmask32 __U, __m256i __A,
+           __m256i __B)
+{
+  return (__m256i) __builtin_ia32_paddsb256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) __W,
+               (__mmask32) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_adds_epi8 (__mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_paddsb256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) _mm256_setzero_si256 (),
+               (__mmask32) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_adds_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
+         __m128i __B)
+{
+  return (__m128i) __builtin_ia32_paddsw128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_adds_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_paddsw128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) _mm_setzero_si128 (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_adds_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
+      __m256i __B)
+{
+  return (__m256i) __builtin_ia32_paddsw256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_adds_epi16 (__mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_paddsw256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) _mm256_setzero_si256 (),
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_adds_epu8 (__m128i __W, __mmask16 __U, __m128i __A,
+        __m128i __B)
+{
+  return (__m128i) __builtin_ia32_paddusb128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_adds_epu8 (__mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_paddusb128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) _mm_setzero_si128 (),
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_adds_epu8 (__m256i __W, __mmask32 __U, __m256i __A,
+           __m256i __B)
+{
+  return (__m256i) __builtin_ia32_paddusb256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) __W,
+               (__mmask32) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_adds_epu8 (__mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_paddusb256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) _mm256_setzero_si256 (),
+               (__mmask32) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_adds_epu16 (__m128i __W, __mmask8 __U, __m128i __A,
+         __m128i __B)
+{
+  return (__m128i) __builtin_ia32_paddusw128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_adds_epu16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_paddusw128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) _mm_setzero_si128 (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_adds_epu16 (__m256i __W, __mmask16 __U, __m256i __A,
+      __m256i __B)
+{
+  return (__m256i) __builtin_ia32_paddusw256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_adds_epu16 (__mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_paddusw256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) _mm256_setzero_si256 (),
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_avg_epu8 (__m128i __W, __mmask16 __U, __m128i __A,
+       __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pavgb128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_avg_epu8 (__mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pavgb128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) _mm_setzero_si128 (),
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_avg_epu8 (__m256i __W, __mmask32 __U, __m256i __A,
+          __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pavgb256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) __W,
+               (__mmask32) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_avg_epu8 (__mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pavgb256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) _mm256_setzero_si256 (),
+               (__mmask32) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_avg_epu16 (__m128i __W, __mmask8 __U, __m128i __A,
+        __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pavgw128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_avg_epu16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pavgw128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) _mm_setzero_si128 (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_avg_epu16 (__m256i __W, __mmask16 __U, __m256i __A,
+           __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pavgw256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_avg_epu16 (__mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pavgw256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) _mm256_setzero_si256 (),
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_max_epi8 (__mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmaxsb128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) _mm_setzero_si128 (),
+               (__mmask16) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_max_epi8 (__m128i __W, __mmask16 __M, __m128i __A,
+       __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmaxsb128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) __W,
+               (__mmask16) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_max_epi8 (__mmask32 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmaxsb256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) _mm256_setzero_si256 (),
+               (__mmask32) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_max_epi8 (__m256i __W, __mmask32 __M, __m256i __A,
+          __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmaxsb256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) __W,
+               (__mmask32) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_max_epi16 (__mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmaxsw128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) _mm_setzero_si128 (),
+               (__mmask8) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_max_epi16 (__m128i __W, __mmask8 __M, __m128i __A,
+        __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmaxsw128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) __W,
+               (__mmask8) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_max_epi16 (__mmask16 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmaxsw256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) _mm256_setzero_si256 (),
+               (__mmask16) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_max_epi16 (__m256i __W, __mmask16 __M, __m256i __A,
+           __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmaxsw256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) __W,
+               (__mmask16) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_max_epu8 (__mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmaxub128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) _mm_setzero_si128 (),
+               (__mmask16) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_max_epu8 (__m128i __W, __mmask16 __M, __m128i __A,
+       __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmaxub128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) __W,
+               (__mmask16) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_max_epu8 (__mmask32 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmaxub256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) _mm256_setzero_si256 (),
+               (__mmask32) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_max_epu8 (__m256i __W, __mmask32 __M, __m256i __A,
+          __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmaxub256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) __W,
+               (__mmask32) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_max_epu16 (__mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmaxuw128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) _mm_setzero_si128 (),
+               (__mmask8) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_max_epu16 (__m128i __W, __mmask8 __M, __m128i __A,
+        __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmaxuw128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) __W,
+               (__mmask8) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_max_epu16 (__mmask16 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmaxuw256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) _mm256_setzero_si256 (),
+               (__mmask16) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_max_epu16 (__m256i __W, __mmask16 __M, __m256i __A,
+           __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmaxuw256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) __W,
+               (__mmask16) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_min_epi8 (__mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pminsb128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) _mm_setzero_si128 (),
+               (__mmask16) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_min_epi8 (__m128i __W, __mmask16 __M, __m128i __A,
+       __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pminsb128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) __W,
+               (__mmask16) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_min_epi8 (__mmask32 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pminsb256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) _mm256_setzero_si256 (),
+               (__mmask32) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_min_epi8 (__m256i __W, __mmask32 __M, __m256i __A,
+          __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pminsb256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) __W,
+               (__mmask32) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_min_epi16 (__mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pminsw128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) _mm_setzero_si128 (),
+               (__mmask8) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_min_epi16 (__m128i __W, __mmask8 __M, __m128i __A,
+        __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pminsw128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) __W,
+               (__mmask8) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_min_epi16 (__mmask16 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pminsw256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) _mm256_setzero_si256 (),
+               (__mmask16) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_min_epi16 (__m256i __W, __mmask16 __M, __m256i __A,
+           __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pminsw256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) __W,
+               (__mmask16) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_min_epu8 (__mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pminub128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) _mm_setzero_si128 (),
+               (__mmask16) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_min_epu8 (__m128i __W, __mmask16 __M, __m128i __A,
+       __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pminub128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) __W,
+               (__mmask16) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_min_epu8 (__mmask32 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pminub256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) _mm256_setzero_si256 (),
+               (__mmask32) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_min_epu8 (__m256i __W, __mmask32 __M, __m256i __A,
+          __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pminub256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) __W,
+               (__mmask32) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_min_epu16 (__mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pminuw128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) _mm_setzero_si128 (),
+               (__mmask8) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_min_epu16 (__m128i __W, __mmask8 __M, __m128i __A,
+        __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pminuw128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) __W,
+               (__mmask8) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_min_epu16 (__mmask16 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pminuw256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) _mm256_setzero_si256 (),
+               (__mmask16) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_min_epu16 (__m256i __W, __mmask16 __M, __m256i __A,
+           __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pminuw256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) __W,
+               (__mmask16) __M);
+}
+
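+/* Editorial note, not part of the upstream header: shuffle_epi8 selects bytes
+ * within each 128-bit lane; if bit 7 of a control byte in __B is set the
+ * result byte is zero, otherwise its low four bits index a source byte of the
+ * same lane of __A.  The write mask is then applied per result byte. */
+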
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_shuffle_epi8 (__m128i __W, __mmask16 __U, __m128i __A,
+           __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pshufb128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_shuffle_epi8 (__mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pshufb128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) _mm_setzero_si128 (),
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_shuffle_epi8 (__m256i __W, __mmask32 __U, __m256i __A,
+        __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pshufb256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) __W,
+               (__mmask32) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_shuffle_epi8 (__mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pshufb256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) _mm256_setzero_si256 (),
+               (__mmask32) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_subs_epi8 (__m128i __W, __mmask16 __U, __m128i __A,
+        __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psubsb128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_subs_epi8 (__mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psubsb128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) _mm_setzero_si128 (),
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_subs_epi8 (__m256i __W, __mmask32 __U, __m256i __A,
+           __m256i __B)
+{
+  return (__m256i) __builtin_ia32_psubsb256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) __W,
+               (__mmask32) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_subs_epi8 (__mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_psubsb256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) _mm256_setzero_si256 (),
+               (__mmask32) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_subs_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
+         __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psubsw128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_subs_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psubsw128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) _mm_setzero_si128 (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_subs_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
+      __m256i __B)
+{
+  return (__m256i) __builtin_ia32_psubsw256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_subs_epi16 (__mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_psubsw256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) _mm256_setzero_si256 (),
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_subs_epu8 (__m128i __W, __mmask16 __U, __m128i __A,
+        __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psubusb128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_subs_epu8 (__mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psubusb128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) _mm_setzero_si128 (),
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_subs_epu8 (__m256i __W, __mmask32 __U, __m256i __A,
+           __m256i __B)
+{
+  return (__m256i) __builtin_ia32_psubusb256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) __W,
+               (__mmask32) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_subs_epu8 (__mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_psubusb256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) _mm256_setzero_si256 (),
+               (__mmask32) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_subs_epu16 (__m128i __W, __mmask8 __U, __m128i __A,
+         __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psubusw128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_subs_epu16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psubusw128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) _mm_setzero_si128 (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_subs_epu16 (__m256i __W, __mmask16 __U, __m256i __A,
+      __m256i __B)
+{
+  return (__m256i) __builtin_ia32_psubusw256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_subs_epu16 (__mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_psubusw256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) _mm256_setzero_si256 (),
+               (__mmask16) __U);
+}
+
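+/* Editorial note, not part of the upstream header: permutex2var_epi16 selects
+ * each result element from the concatenation of __A and __B using the index
+ * vector __I.  The _mask_ variant merges masked-off elements from __A, the
+ * _mask2_ variant merges them from the index operand __I, and the _maskz_
+ * variant zeroes them. */
+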
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask2_permutex2var_epi16 (__m128i __A, __m128i __I, __mmask8 __U,
+            __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpermi2varhi128_mask ((__v8hi) __A,
+               (__v8hi) __I /* idx */ ,
+               (__v8hi) __B,
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask2_permutex2var_epi16 (__m256i __A, __m256i __I,
+         __mmask16 __U, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpermi2varhi256_mask ((__v16hi) __A,
+               (__v16hi) __I /* idx */ ,
+               (__v16hi) __B,
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_permutex2var_epi16 (__m128i __A, __m128i __I, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpermt2varhi128_mask ((__v8hi) __I/* idx */,
+               (__v8hi) __A,
+               (__v8hi) __B,
+               (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_permutex2var_epi16 (__m128i __A, __mmask8 __U, __m128i __I,
+           __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpermt2varhi128_mask ((__v8hi) __I/* idx */,
+               (__v8hi) __A,
+               (__v8hi) __B,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_permutex2var_epi16 (__mmask8 __U, __m128i __A, __m128i __I,
+            __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpermt2varhi128_maskz ((__v8hi) __I/* idx */,
+               (__v8hi) __A,
+               (__v8hi) __B,
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_permutex2var_epi16 (__m256i __A, __m256i __I, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpermt2varhi256_mask ((__v16hi) __I/* idx */,
+               (__v16hi) __A,
+               (__v16hi) __B,
+               (__mmask16) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_permutex2var_epi16 (__m256i __A, __mmask16 __U,
+        __m256i __I, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpermt2varhi256_mask ((__v16hi) __I/* idx */,
+               (__v16hi) __A,
+               (__v16hi) __B,
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_permutex2var_epi16 (__mmask16 __U, __m256i __A,
+         __m256i __I, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpermt2varhi256_maskz ((__v16hi) __I/* idx */,
+               (__v16hi) __A,
+               (__v16hi) __B,
+               (__mmask16) __U);
+}
+
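+/* Editorial note, not part of the upstream header: maddubs_epi16 multiplies
+ * the unsigned bytes of __X by the corresponding signed bytes of __Y and adds
+ * adjacent pairs with signed saturation into 16-bit results; madd_epi16
+ * multiplies signed 16-bit elements and adds adjacent pairs into 32-bit
+ * results. */
+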
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_maddubs_epi16 (__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) {
+  return (__m128i) __builtin_ia32_pmaddubsw128_mask ((__v16qi) __X,
+               (__v16qi) __Y,
+               (__v8hi) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_maddubs_epi16 (__mmask8 __U, __m128i __X, __m128i __Y) {
+  return (__m128i) __builtin_ia32_pmaddubsw128_mask ((__v16qi) __X,
+               (__v16qi) __Y,
+              (__v8hi) _mm_setzero_si128(),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_maddubs_epi16 (__m256i __W, __mmask16 __U, __m256i __X,
+         __m256i __Y) {
+  return (__m256i) __builtin_ia32_pmaddubsw256_mask ((__v32qi) __X,
+               (__v32qi) __Y,
+               (__v16hi) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_maddubs_epi16 (__mmask16 __U, __m256i __X, __m256i __Y) {
+  return (__m256i) __builtin_ia32_pmaddubsw256_mask ((__v32qi) __X,
+               (__v32qi) __Y,
+               (__v16hi) _mm256_setzero_si256(),
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_madd_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
+         __m128i __B) {
+  return (__m128i) __builtin_ia32_pmaddwd128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v4si) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_madd_epi16 (__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pmaddwd128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v4si) _mm_setzero_si128(),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_madd_epi16 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pmaddwd256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v8si) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_madd_epi16 (__mmask8 __U, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pmaddwd256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v8si) _mm256_setzero_si256(),
+               (__mmask8) __U);
+}
+
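+/* Editorial note, not part of the upstream header: cvtsepi16_epi8,
+ * cvtusepi16_epi8 and cvtepi16_epi8 narrow 16-bit elements to 8 bits using
+ * signed saturation, unsigned saturation and plain truncation respectively.
+ * The __m128i-source forms produce 8 result bytes in the low half of the
+ * returned vector. */
+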
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtsepi16_epi8 (__m128i __A) {
+  return (__m128i) __builtin_ia32_pmovswb128_mask ((__v8hi) __A,
+               (__v16qi) _mm_setzero_si128(),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtsepi16_epi8 (__m128i __O, __mmask8 __M, __m128i __A) {
+  return (__m128i) __builtin_ia32_pmovswb128_mask ((__v8hi) __A,
+               (__v16qi) __O,
+                __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtsepi16_epi8 (__mmask8 __M, __m128i __A) {
+  return (__m128i) __builtin_ia32_pmovswb128_mask ((__v8hi) __A,
+               (__v16qi) _mm_setzero_si128(),
+               __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtsepi16_epi8 (__m256i __A) {
+  return (__m128i) __builtin_ia32_pmovswb256_mask ((__v16hi) __A,
+               (__v16qi) _mm_setzero_si128(),
+               (__mmask16) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtsepi16_epi8 (__m128i __O, __mmask16 __M, __m256i __A) {
+  return (__m128i) __builtin_ia32_pmovswb256_mask ((__v16hi) __A,
+               (__v16qi) __O,
+                __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtsepi16_epi8 (__mmask16 __M, __m256i __A) {
+  return (__m128i) __builtin_ia32_pmovswb256_mask ((__v16hi) __A,
+               (__v16qi) _mm_setzero_si128(),
+               __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtusepi16_epi8 (__m128i __A) {
+  return (__m128i) __builtin_ia32_pmovuswb128_mask ((__v8hi) __A,
+                (__v16qi) _mm_setzero_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtusepi16_epi8 (__m128i __O, __mmask8 __M, __m128i __A) {
+  return (__m128i) __builtin_ia32_pmovuswb128_mask ((__v8hi) __A,
+                (__v16qi) __O,
+                __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtusepi16_epi8 (__mmask8 __M, __m128i __A) {
+  return (__m128i) __builtin_ia32_pmovuswb128_mask ((__v8hi) __A,
+                (__v16qi) _mm_setzero_si128(),
+                __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtusepi16_epi8 (__m256i __A) {
+  return (__m128i) __builtin_ia32_pmovuswb256_mask ((__v16hi) __A,
+                (__v16qi) _mm_setzero_si128(),
+                (__mmask16) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtusepi16_epi8 (__m128i __O, __mmask16 __M, __m256i __A) {
+  return (__m128i) __builtin_ia32_pmovuswb256_mask ((__v16hi) __A,
+                (__v16qi) __O,
+                __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtusepi16_epi8 (__mmask16 __M, __m256i __A) {
+  return (__m128i) __builtin_ia32_pmovuswb256_mask ((__v16hi) __A,
+                (__v16qi) _mm_setzero_si128(),
+                __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepi16_epi8 (__m128i __A) {
+  return (__m128i) __builtin_ia32_pmovwb128_mask ((__v8hi) __A,
+               (__v16qi) _mm_setzero_si128(),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi16_epi8 (__m128i __O, __mmask8 __M, __m128i __A) {
+  return (__m128i) __builtin_ia32_pmovwb128_mask ((__v8hi) __A,
+               (__v16qi) __O,
+               __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepi16_epi8 (__mmask8 __M, __m128i __A) {
+  return (__m128i) __builtin_ia32_pmovwb128_mask ((__v8hi) __A,
+               (__v16qi) _mm_setzero_si128(),
+               __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtepi16_epi8 (__m256i __A) {
+  return (__m128i) __builtin_ia32_pmovwb256_mask ((__v16hi) __A,
+               (__v16qi) _mm_setzero_si128(),
+               (__mmask16) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi16_epi8 (__m128i __O, __mmask16 __M, __m256i __A) {
+  return (__m128i) __builtin_ia32_pmovwb256_mask ((__v16hi) __A,
+               (__v16qi) __O,
+               __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepi16_epi8 (__mmask16 __M, __m256i __A) {
+  return (__m128i) __builtin_ia32_pmovwb256_mask ((__v16hi) __A,
+               (__v16qi) _mm_setzero_si128(),
+               __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_mulhrs_epi16 (__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) {
+  return (__m128i) __builtin_ia32_pmulhrsw128_mask ((__v8hi) __X,
+               (__v8hi) __Y,
+               (__v8hi) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_mulhrs_epi16 (__mmask8 __U, __m128i __X, __m128i __Y) {
+  return (__m128i) __builtin_ia32_pmulhrsw128_mask ((__v8hi) __X,
+               (__v8hi) __Y,
+              (__v8hi) _mm_setzero_si128(),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_mulhrs_epi16 (__m256i __W, __mmask16 __U, __m256i __X, __m256i __Y) {
+  return (__m256i) __builtin_ia32_pmulhrsw256_mask ((__v16hi) __X,
+               (__v16hi) __Y,
+               (__v16hi) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_mulhrs_epi16 (__mmask16 __U, __m256i __X, __m256i __Y) {
+  return (__m256i) __builtin_ia32_pmulhrsw256_mask ((__v16hi) __X,
+               (__v16hi) __Y,
+               (__v16hi) _mm256_setzero_si256(),
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_mulhi_epu16 (__m128i __W, __mmask8 __U, __m128i __A,
+          __m128i __B) {
+  return (__m128i) __builtin_ia32_pmulhuw128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_mulhi_epu16 (__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pmulhuw128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+              (__v8hi) _mm_setzero_si128(),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_mulhi_epu16 (__m256i __W, __mmask16 __U, __m256i __A,
+       __m256i __B) {
+  return (__m256i) __builtin_ia32_pmulhuw256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_mulhi_epu16 (__mmask16 __U, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pmulhuw256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) _mm256_setzero_si256(),
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_mulhi_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
+          __m128i __B) {
+  return (__m128i) __builtin_ia32_pmulhw128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_mulhi_epi16 (__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pmulhw128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+              (__v8hi) _mm_setzero_si128(),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_mulhi_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
+       __m256i __B) {
+  return (__m256i) __builtin_ia32_pmulhw256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_mulhi_epi16 (__mmask16 __U, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pmulhw256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) _mm256_setzero_si256(),
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_unpackhi_epi8 (__m128i __W, __mmask16 __U, __m128i __A,
+      __m128i __B) {
+  return (__m128i) __builtin_ia32_punpckhbw128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_unpackhi_epi8 (__mmask16 __U, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_punpckhbw128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) _mm_setzero_si128(),
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_unpackhi_epi8 (__m256i __W, __mmask32 __U, __m256i __A,
+         __m256i __B) {
+  return (__m256i) __builtin_ia32_punpckhbw256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) __W,
+               (__mmask32) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_unpackhi_epi8 (__mmask32 __U, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_punpckhbw256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) _mm256_setzero_si256(),
+               (__mmask32) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_unpackhi_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
+       __m128i __B) {
+  return (__m128i) __builtin_ia32_punpckhwd128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_unpackhi_epi16 (__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_punpckhwd128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) _mm_setzero_si128(),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_unpackhi_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
+          __m256i __B) {
+  return (__m256i) __builtin_ia32_punpckhwd256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_unpackhi_epi16 (__mmask16 __U, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_punpckhwd256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) _mm256_setzero_si256(),
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_unpacklo_epi8 (__m128i __W, __mmask16 __U, __m128i __A,
+      __m128i __B) {
+  return (__m128i) __builtin_ia32_punpcklbw128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_unpacklo_epi8 (__mmask16 __U, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_punpcklbw128_mask ((__v16qi) __A,
+               (__v16qi) __B,
+               (__v16qi) _mm_setzero_si128(),
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_unpacklo_epi8 (__m256i __W, __mmask32 __U, __m256i __A,
+         __m256i __B) {
+  return (__m256i) __builtin_ia32_punpcklbw256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) __W,
+               (__mmask32) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_unpacklo_epi8 (__mmask32 __U, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_punpcklbw256_mask ((__v32qi) __A,
+               (__v32qi) __B,
+               (__v32qi) _mm256_setzero_si256(),
+               (__mmask32) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_unpacklo_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
+       __m128i __B) {
+  return (__m128i) __builtin_ia32_punpcklwd128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_unpacklo_epi16 (__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_punpcklwd128_mask ((__v8hi) __A,
+               (__v8hi) __B,
+               (__v8hi) _mm_setzero_si128(),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_unpacklo_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
+          __m256i __B) {
+  return (__m256i) __builtin_ia32_punpcklwd256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_unpacklo_epi16 (__mmask16 __U, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_punpcklwd256_mask ((__v16hi) __A,
+               (__v16hi) __B,
+               (__v16hi) _mm256_setzero_si256(),
+               (__mmask16) __U);
+}
+
+#define _mm_cmp_epi8_mask(a, b, p) __extension__ ({ \
+  (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)(__m128i)(a), \
+                                         (__v16qi)(__m128i)(b), \
+                                         (p), (__mmask16)-1); })
+
+#define _mm_mask_cmp_epi8_mask(m, a, b, p) __extension__ ({ \
+  (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)(__m128i)(a), \
+                                         (__v16qi)(__m128i)(b), \
+                                         (p), (__mmask16)(m)); })
+
+#define _mm_cmp_epu8_mask(a, b, p) __extension__ ({ \
+  (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)(__m128i)(a), \
+                                          (__v16qi)(__m128i)(b), \
+                                          (p), (__mmask16)-1); })
+
+#define _mm_mask_cmp_epu8_mask(m, a, b, p) __extension__ ({ \
+  (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)(__m128i)(a), \
+                                          (__v16qi)(__m128i)(b), \
+                                          (p), (__mmask16)(m)); })
+
+#define _mm256_cmp_epi8_mask(a, b, p) __extension__ ({ \
+  (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)(__m256i)(a), \
+                                         (__v32qi)(__m256i)(b), \
+                                         (p), (__mmask32)-1); })
+
+#define _mm256_mask_cmp_epi8_mask(m, a, b, p) __extension__ ({ \
+  (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)(__m256i)(a), \
+                                         (__v32qi)(__m256i)(b), \
+                                         (p), (__mmask32)(m)); })
+
+#define _mm256_cmp_epu8_mask(a, b, p) __extension__ ({ \
+  (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)(__m256i)(a), \
+                                          (__v32qi)(__m256i)(b), \
+                                          (p), (__mmask32)-1); })
+
+#define _mm256_mask_cmp_epu8_mask(m, a, b, p) __extension__ ({ \
+  (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)(__m256i)(a), \
+                                          (__v32qi)(__m256i)(b), \
+                                          (p), (__mmask32)(m)); })
+
+#define _mm_cmp_epi16_mask(a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)(__m128i)(a), \
+                                        (__v8hi)(__m128i)(b), \
+                                        (p), (__mmask8)-1); })
+
+#define _mm_mask_cmp_epi16_mask(m, a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)(__m128i)(a), \
+                                        (__v8hi)(__m128i)(b), \
+                                        (p), (__mmask8)(m)); })
+
+#define _mm_cmp_epu16_mask(a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)(__m128i)(a), \
+                                         (__v8hi)(__m128i)(b), \
+                                         (p), (__mmask8)-1); })
+
+#define _mm_mask_cmp_epu16_mask(m, a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)(__m128i)(a), \
+                                         (__v8hi)(__m128i)(b), \
+                                         (p), (__mmask8)(m)); })
+
+#define _mm256_cmp_epi16_mask(a, b, p) __extension__ ({ \
+  (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)(__m256i)(a), \
+                                         (__v16hi)(__m256i)(b), \
+                                         (p), (__mmask16)-1); })
+
+#define _mm256_mask_cmp_epi16_mask(m, a, b, p) __extension__ ({ \
+  (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)(__m256i)(a), \
+                                         (__v16hi)(__m256i)(b), \
+                                         (p), (__mmask16)(m)); })
+
+#define _mm256_cmp_epu16_mask(a, b, p) __extension__ ({ \
+  (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)(__m256i)(a), \
+                                          (__v16hi)(__m256i)(b), \
+                                          (p), (__mmask16)-1); })
+
+#define _mm256_mask_cmp_epu16_mask(m, a, b, p) __extension__ ({ \
+  (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)(__m256i)(a), \
+                                          (__v16hi)(__m256i)(b), \
+                                          (p), (__mmask16)(m)); })
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __AVX512VLBWINTRIN_H */
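For reference, a minimal sketch of how the masked word-to-byte conversion and compare intrinsics declared above could be combined; the helper name and the compile flags (-mavx512vl -mavx512bw) are assumptions for illustration only, not part of the prebuilt header.

#include <immintrin.h>

/* Hypothetical helper: narrow 16-bit lanes that exceed a threshold to
 * unsigned-saturated bytes; bytes for unselected words are zeroed. */
static inline __m128i saturate_large_words(__m128i words, __m128i threshold) {
  /* Predicate 6 is "greater-than" (not less-or-equal), the same encoding
   * the _mm*_cmpgt_* intrinsics in these headers pass to the builtins. */
  __mmask8 big = _mm_cmp_epi16_mask(words, threshold, 6);
  /* The maskz variant writes zero into the lanes not selected by the mask. */
  return _mm_maskz_cvtusepi16_epi8(big, words);
}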
diff --git a/25.0.2/clang-include/avx512vldqintrin.h b/25.0.2/clang-include/avx512vldqintrin.h
new file mode 100644
index 0000000..dfd858e
--- /dev/null
+++ b/25.0.2/clang-include/avx512vldqintrin.h
@@ -0,0 +1,953 @@
+/*===---- avx512vldqintrin.h - AVX512VL and AVX512DQ intrinsics ------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512vldqintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512VLDQINTRIN_H
+#define __AVX512VLDQINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512dq")))
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mullo_epi64 (__m256i __A, __m256i __B) {
+  return (__m256i) ((__v4di) __A * (__v4di) __B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_mullo_epi64 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pmullq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_mullo_epi64 (__mmask8 __U, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pmullq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di)
+              _mm256_setzero_si256 (),
+              (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mullo_epi64 (__m128i __A, __m128i __B) {
+  return (__m128i) ((__v2di) __A * (__v2di) __B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_mullo_epi64 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pmullq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_mullo_epi64 (__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pmullq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di)
+              _mm_setzero_si128 (),
+              (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_andnot_pd (__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d) __builtin_ia32_andnpd256_mask ((__v4df) __A,
+              (__v4df) __B,
+              (__v4df) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_andnot_pd (__mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d) __builtin_ia32_andnpd256_mask ((__v4df) __A,
+              (__v4df) __B,
+              (__v4df)
+              _mm256_setzero_pd (),
+              (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_andnot_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_andnpd128_mask ((__v2df) __A,
+              (__v2df) __B,
+              (__v2df) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_andnot_pd (__mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_andnpd128_mask ((__v2df) __A,
+              (__v2df) __B,
+              (__v2df)
+              _mm_setzero_pd (),
+              (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_andnot_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256) __builtin_ia32_andnps256_mask ((__v8sf) __A,
+             (__v8sf) __B,
+             (__v8sf) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_andnot_ps (__mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256) __builtin_ia32_andnps256_mask ((__v8sf) __A,
+             (__v8sf) __B,
+             (__v8sf)
+             _mm256_setzero_ps (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_andnot_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_andnps128_mask ((__v4sf) __A,
+             (__v4sf) __B,
+             (__v4sf) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_andnot_ps (__mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_andnps128_mask ((__v4sf) __A,
+             (__v4sf) __B,
+             (__v4sf)
+             _mm_setzero_ps (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_and_pd (__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d) __builtin_ia32_andpd256_mask ((__v4df) __A,
+             (__v4df) __B,
+             (__v4df) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_and_pd (__mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d) __builtin_ia32_andpd256_mask ((__v4df) __A,
+             (__v4df) __B,
+             (__v4df)
+             _mm256_setzero_pd (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_and_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_andpd128_mask ((__v2df) __A,
+             (__v2df) __B,
+             (__v2df) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_and_pd (__mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_andpd128_mask ((__v2df) __A,
+             (__v2df) __B,
+             (__v2df)
+             _mm_setzero_pd (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_and_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256) __builtin_ia32_andps256_mask ((__v8sf) __A,
+            (__v8sf) __B,
+            (__v8sf) __W,
+            (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_and_ps (__mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256) __builtin_ia32_andps256_mask ((__v8sf) __A,
+            (__v8sf) __B,
+            (__v8sf)
+            _mm256_setzero_ps (),
+            (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_and_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_andps128_mask ((__v4sf) __A,
+            (__v4sf) __B,
+            (__v4sf) __W,
+            (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_and_ps (__mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_andps128_mask ((__v4sf) __A,
+            (__v4sf) __B,
+            (__v4sf)
+            _mm_setzero_ps (),
+            (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_xor_pd (__m256d __W, __mmask8 __U, __m256d __A,
+        __m256d __B) {
+  return (__m256d) __builtin_ia32_xorpd256_mask ((__v4df) __A,
+             (__v4df) __B,
+             (__v4df) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_xor_pd (__mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d) __builtin_ia32_xorpd256_mask ((__v4df) __A,
+             (__v4df) __B,
+             (__v4df)
+             _mm256_setzero_pd (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_xor_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_xorpd128_mask ((__v2df) __A,
+             (__v2df) __B,
+             (__v2df) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_xor_pd (__mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_xorpd128_mask ((__v2df) __A,
+             (__v2df) __B,
+             (__v2df)
+             _mm_setzero_pd (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_xor_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256) __builtin_ia32_xorps256_mask ((__v8sf) __A,
+            (__v8sf) __B,
+            (__v8sf) __W,
+            (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_xor_ps (__mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256) __builtin_ia32_xorps256_mask ((__v8sf) __A,
+            (__v8sf) __B,
+            (__v8sf)
+            _mm256_setzero_ps (),
+            (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_xor_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_xorps128_mask ((__v4sf) __A,
+            (__v4sf) __B,
+            (__v4sf) __W,
+            (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_xor_ps (__mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_xorps128_mask ((__v4sf) __A,
+            (__v4sf) __B,
+            (__v4sf)
+            _mm_setzero_ps (),
+            (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_or_pd (__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d) __builtin_ia32_orpd256_mask ((__v4df) __A,
+            (__v4df) __B,
+            (__v4df) __W,
+            (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_or_pd (__mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d) __builtin_ia32_orpd256_mask ((__v4df) __A,
+            (__v4df) __B,
+            (__v4df)
+            _mm256_setzero_pd (),
+            (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_or_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_orpd128_mask ((__v2df) __A,
+            (__v2df) __B,
+            (__v2df) __W,
+            (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_or_pd (__mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_orpd128_mask ((__v2df) __A,
+            (__v2df) __B,
+            (__v2df)
+            _mm_setzero_pd (),
+            (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_or_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256) __builtin_ia32_orps256_mask ((__v8sf) __A,
+                 (__v8sf) __B,
+                 (__v8sf) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_or_ps (__mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256) __builtin_ia32_orps256_mask ((__v8sf) __A,
+                 (__v8sf) __B,
+                 (__v8sf)
+                 _mm256_setzero_ps (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_or_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_orps128_mask ((__v4sf) __A,
+                 (__v4sf) __B,
+                 (__v4sf) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_or_ps (__mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_orps128_mask ((__v4sf) __A,
+                 (__v4sf) __B,
+                 (__v4sf)
+                 _mm_setzero_ps (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtpd_epi64 (__m128d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2qq128_mask ((__v2df) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtpd_epi64 (__m128i __W, __mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2qq128_mask ((__v2df) __A,
+                (__v2di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtpd_epi64 (__mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2qq128_mask ((__v2df) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtpd_epi64 (__m256d __A) {
+  return (__m256i) __builtin_ia32_cvtpd2qq256_mask ((__v4df) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtpd_epi64 (__m256i __W, __mmask8 __U, __m256d __A) {
+  return (__m256i) __builtin_ia32_cvtpd2qq256_mask ((__v4df) __A,
+                (__v4di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtpd_epi64 (__mmask8 __U, __m256d __A) {
+  return (__m256i) __builtin_ia32_cvtpd2qq256_mask ((__v4df) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtpd_epu64 (__m128d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2uqq128_mask ((__v2df) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtpd_epu64 (__m128i __W, __mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2uqq128_mask ((__v2df) __A,
+                (__v2di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtpd_epu64 (__mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2uqq128_mask ((__v2df) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtpd_epu64 (__m256d __A) {
+  return (__m256i) __builtin_ia32_cvtpd2uqq256_mask ((__v4df) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtpd_epu64 (__m256i __W, __mmask8 __U, __m256d __A) {
+  return (__m256i) __builtin_ia32_cvtpd2uqq256_mask ((__v4df) __A,
+                (__v4di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtpd_epu64 (__mmask8 __U, __m256d __A) {
+  return (__m256i) __builtin_ia32_cvtpd2uqq256_mask ((__v4df) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtps_epi64 (__m128 __A) {
+  return (__m128i) __builtin_ia32_cvtps2qq128_mask ((__v4sf) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtps_epi64 (__m128i __W, __mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvtps2qq128_mask ((__v4sf) __A,
+                (__v2di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtps_epi64 (__mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvtps2qq128_mask ((__v4sf) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtps_epi64 (__m128 __A) {
+  return (__m256i) __builtin_ia32_cvtps2qq256_mask ((__v4sf) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtps_epi64 (__m256i __W, __mmask8 __U, __m128 __A) {
+  return (__m256i) __builtin_ia32_cvtps2qq256_mask ((__v4sf) __A,
+                (__v4di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtps_epi64 (__mmask8 __U, __m128 __A) {
+  return (__m256i) __builtin_ia32_cvtps2qq256_mask ((__v4sf) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtps_epu64 (__m128 __A) {
+  return (__m128i) __builtin_ia32_cvtps2uqq128_mask ((__v4sf) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtps_epu64 (__m128i __W, __mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvtps2uqq128_mask ((__v4sf) __A,
+                (__v2di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtps_epu64 (__mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvtps2uqq128_mask ((__v4sf) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtps_epu64 (__m128 __A) {
+  return (__m256i) __builtin_ia32_cvtps2uqq256_mask ((__v4sf) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtps_epu64 (__m256i __W, __mmask8 __U, __m128 __A) {
+  return (__m256i) __builtin_ia32_cvtps2uqq256_mask ((__v4sf) __A,
+                (__v4di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtps_epu64 (__mmask8 __U, __m128 __A) {
+  return (__m256i) __builtin_ia32_cvtps2uqq256_mask ((__v4sf) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cvtepi64_pd (__m128i __A) {
+  return (__m128d) __builtin_ia32_cvtqq2pd128_mask ((__v2di) __A,
+                (__v2df) _mm_setzero_pd(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi64_pd (__m128d __W, __mmask8 __U, __m128i __A) {
+  return (__m128d) __builtin_ia32_cvtqq2pd128_mask ((__v2di) __A,
+                (__v2df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepi64_pd (__mmask8 __U, __m128i __A) {
+  return (__m128d) __builtin_ia32_cvtqq2pd128_mask ((__v2di) __A,
+                (__v2df) _mm_setzero_pd(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_cvtepi64_pd (__m256i __A) {
+  return (__m256d) __builtin_ia32_cvtqq2pd256_mask ((__v4di) __A,
+                (__v4df) _mm256_setzero_pd(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi64_pd (__m256d __W, __mmask8 __U, __m256i __A) {
+  return (__m256d) __builtin_ia32_cvtqq2pd256_mask ((__v4di) __A,
+                (__v4df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepi64_pd (__mmask8 __U, __m256i __A) {
+  return (__m256d) __builtin_ia32_cvtqq2pd256_mask ((__v4di) __A,
+                (__v4df) _mm256_setzero_pd(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtepi64_ps (__m128i __A) {
+  return (__m128) __builtin_ia32_cvtqq2ps128_mask ((__v2di) __A,
+                (__v4sf) _mm_setzero_ps(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi64_ps (__m128 __W, __mmask8 __U, __m128i __A) {
+  return (__m128) __builtin_ia32_cvtqq2ps128_mask ((__v2di) __A,
+                (__v4sf) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepi64_ps (__mmask8 __U, __m128i __A) {
+  return (__m128) __builtin_ia32_cvtqq2ps128_mask ((__v2di) __A,
+                (__v4sf) _mm_setzero_ps(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm256_cvtepi64_ps (__m256i __A) {
+  return (__m128) __builtin_ia32_cvtqq2ps256_mask ((__v4di) __A,
+                (__v4sf) _mm_setzero_ps(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi64_ps (__m128 __W, __mmask8 __U, __m256i __A) {
+  return (__m128) __builtin_ia32_cvtqq2ps256_mask ((__v4di) __A,
+                (__v4sf) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepi64_ps (__mmask8 __U, __m256i __A) {
+  return (__m128) __builtin_ia32_cvtqq2ps256_mask ((__v4di) __A,
+                (__v4sf) _mm_setzero_ps(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvttpd_epi64 (__m128d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2qq128_mask ((__v2df) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvttpd_epi64 (__m128i __W, __mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2qq128_mask ((__v2df) __A,
+                (__v2di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvttpd_epi64 (__mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2qq128_mask ((__v2df) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvttpd_epi64 (__m256d __A) {
+  return (__m256i) __builtin_ia32_cvttpd2qq256_mask ((__v4df) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvttpd_epi64 (__m256i __W, __mmask8 __U, __m256d __A) {
+  return (__m256i) __builtin_ia32_cvttpd2qq256_mask ((__v4df) __A,
+                (__v4di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvttpd_epi64 (__mmask8 __U, __m256d __A) {
+  return (__m256i) __builtin_ia32_cvttpd2qq256_mask ((__v4df) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvttpd_epu64 (__m128d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2uqq128_mask ((__v2df) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvttpd_epu64 (__m128i __W, __mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2uqq128_mask ((__v2df) __A,
+                (__v2di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvttpd_epu64 (__mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2uqq128_mask ((__v2df) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvttpd_epu64 (__m256d __A) {
+  return (__m256i) __builtin_ia32_cvttpd2uqq256_mask ((__v4df) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvttpd_epu64 (__m256i __W, __mmask8 __U, __m256d __A) {
+  return (__m256i) __builtin_ia32_cvttpd2uqq256_mask ((__v4df) __A,
+                (__v4di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvttpd_epu64 (__mmask8 __U, __m256d __A) {
+  return (__m256i) __builtin_ia32_cvttpd2uqq256_mask ((__v4df) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvttps_epi64 (__m128 __A) {
+  return (__m128i) __builtin_ia32_cvttps2qq128_mask ((__v4sf) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvttps_epi64 (__m128i __W, __mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvttps2qq128_mask ((__v4sf) __A,
+                (__v2di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvttps_epi64 (__mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvttps2qq128_mask ((__v4sf) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvttps_epi64 (__m128 __A) {
+  return (__m256i) __builtin_ia32_cvttps2qq256_mask ((__v4sf) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvttps_epi64 (__m256i __W, __mmask8 __U, __m128 __A) {
+  return (__m256i) __builtin_ia32_cvttps2qq256_mask ((__v4sf) __A,
+                (__v4di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvttps_epi64 (__mmask8 __U, __m128 __A) {
+  return (__m256i) __builtin_ia32_cvttps2qq256_mask ((__v4sf) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvttps_epu64 (__m128 __A) {
+  return (__m128i) __builtin_ia32_cvttps2uqq128_mask ((__v4sf) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvttps_epu64 (__m128i __W, __mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvttps2uqq128_mask ((__v4sf) __A,
+                (__v2di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvttps_epu64 (__mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvttps2uqq128_mask ((__v4sf) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvttps_epu64 (__m128 __A) {
+  return (__m256i) __builtin_ia32_cvttps2uqq256_mask ((__v4sf) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvttps_epu64 (__m256i __W, __mmask8 __U, __m128 __A) {
+  return (__m256i) __builtin_ia32_cvttps2uqq256_mask ((__v4sf) __A,
+                (__v4di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvttps_epu64 (__mmask8 __U, __m128 __A) {
+  return (__m256i) __builtin_ia32_cvttps2uqq256_mask ((__v4sf) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cvtepu64_pd (__m128i __A) {
+  return (__m128d) __builtin_ia32_cvtuqq2pd128_mask ((__v2di) __A,
+                (__v2df) _mm_setzero_pd(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_cvtepu64_pd (__m128d __W, __mmask8 __U, __m128i __A) {
+  return (__m128d) __builtin_ia32_cvtuqq2pd128_mask ((__v2di) __A,
+                (__v2df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepu64_pd (__mmask8 __U, __m128i __A) {
+  return (__m128d) __builtin_ia32_cvtuqq2pd128_mask ((__v2di) __A,
+                (__v2df) _mm_setzero_pd(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_cvtepu64_pd (__m256i __A) {
+  return (__m256d) __builtin_ia32_cvtuqq2pd256_mask ((__v4di) __A,
+                (__v4df) _mm256_setzero_pd(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepu64_pd (__m256d __W, __mmask8 __U, __m256i __A) {
+  return (__m256d) __builtin_ia32_cvtuqq2pd256_mask ((__v4di) __A,
+                (__v4df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepu64_pd (__mmask8 __U, __m256i __A) {
+  return (__m256d) __builtin_ia32_cvtuqq2pd256_mask ((__v4di) __A,
+                (__v4df) _mm256_setzero_pd(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtepu64_ps (__m128i __A) {
+  return (__m128) __builtin_ia32_cvtuqq2ps128_mask ((__v2di) __A,
+                (__v4sf) _mm_setzero_ps(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_cvtepu64_ps (__m128 __W, __mmask8 __U, __m128i __A) {
+  return (__m128) __builtin_ia32_cvtuqq2ps128_mask ((__v2di) __A,
+                (__v4sf) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepu64_ps (__mmask8 __U, __m128i __A) {
+  return (__m128) __builtin_ia32_cvtuqq2ps128_mask ((__v2di) __A,
+                (__v4sf) _mm_setzero_ps(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm256_cvtepu64_ps (__m256i __A) {
+  return (__m128) __builtin_ia32_cvtuqq2ps256_mask ((__v4di) __A,
+                (__v4sf) _mm_setzero_ps(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepu64_ps (__m128 __W, __mmask8 __U, __m256i __A) {
+  return (__m128) __builtin_ia32_cvtuqq2ps256_mask ((__v4di) __A,
+                (__v4sf) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepu64_ps (__mmask8 __U, __m256i __A) {
+  return (__m128) __builtin_ia32_cvtuqq2ps256_mask ((__v4di) __A,
+                (__v4sf) _mm_setzero_ps(),
+                (__mmask8) __U);
+}
+
+#define _mm_range_pd(__A, __B, __C) __extension__ ({                         \
+  (__m128d) __builtin_ia32_rangepd128_mask ((__v2df) __A, (__v2df) __B, __C, \
+                (__v2df) _mm_setzero_pd(), (__mmask8) -1); })
+
+#define _mm_mask_range_pd(__W, __U, __A, __B, __C) __extension__ ({          \
+  (__m128d) __builtin_ia32_rangepd128_mask ((__v2df) __A, (__v2df) __B, __C, \
+                (__v2df) __W, (__mmask8) __U); })
+
+#define _mm_maskz_range_pd(__U, __A, __B, __C) __extension__ ({              \
+  (__m128d) __builtin_ia32_rangepd128_mask ((__v2df) __A, (__v2df) __B, __C, \
+                (__v2df) _mm_setzero_pd(), (__mmask8) __U); })
+
+#define _mm256_range_pd(__A, __B, __C) __extension__ ({                      \
+  (__m256d) __builtin_ia32_rangepd256_mask ((__v4df) __A, (__v4df) __B, __C, \
+                (__v4df) _mm256_setzero_pd(), (__mmask8) -1); })
+
+#define _mm256_mask_range_pd(__W, __U, __A, __B, __C) __extension__ ({       \
+  (__m256d) __builtin_ia32_rangepd256_mask ((__v4df) __A, (__v4df) __B, __C, \
+                (__v4df) __W, (__mmask8) __U); })
+
+#define _mm256_maskz_range_pd(__U, __A, __B, __C) __extension__ ({           \
+  (__m256d) __builtin_ia32_rangepd256_mask ((__v4df) __A, (__v4df) __B, __C, \
+                (__v4df) _mm256_setzero_pd(), (__mmask8) __U); })
+
+#define _mm_range_ps(__A, __B, __C) __extension__ ({                         \
+  (__m128) __builtin_ia32_rangeps128_mask ((__v4sf) __A, (__v4sf) __B, __C,  \
+                (__v4sf) _mm_setzero_ps(), (__mmask8) -1); })
+
+#define _mm_mask_range_ps(__W, __U, __A, __B, __C) __extension__ ({          \
+  (__m128) __builtin_ia32_rangeps128_mask ((__v4sf) __A, (__v4sf) __B, __C,  \
+                (__v4sf) __W, (__mmask8) __U); })
+
+#define _mm_maskz_range_ps(__U, __A, __B, __C) __extension__ ({              \
+  (__m128) __builtin_ia32_rangeps128_mask ((__v4sf) __A, (__v4sf) __B, __C,  \
+                (__v4sf) _mm_setzero_ps(), (__mmask8) __U); })
+
+#define _mm256_range_ps(__A, __B, __C) __extension__ ({                      \
+  (__m256) __builtin_ia32_rangeps256_mask ((__v8sf) __A, (__v8sf) __B, __C,  \
+                (__v8sf) _mm256_setzero_ps(), (__mmask8) -1); })
+
+#define _mm256_mask_range_ps(__W, __U, __A, __B, __C) __extension__ ({       \
+  (__m256) __builtin_ia32_rangeps256_mask ((__v8sf) __A, (__v8sf) __B, __C,  \
+                (__v8sf) __W, (__mmask8) __U); })
+
+#define _mm256_maskz_range_ps(__U, __A, __B, __C) __extension__ ({           \
+  (__m256) __builtin_ia32_rangeps256_mask ((__v8sf) __A, (__v8sf) __B, __C,  \
+                (__v8sf) _mm256_setzero_ps(), (__mmask8) __U); })
+
+#define _mm_reduce_pd(__A, __B) __extension__ ({                \
+  (__m128d) __builtin_ia32_reducepd128_mask ((__v2df) __A, __B, \
+                (__v2df) _mm_setzero_pd(), (__mmask8) -1); })
+
+#define _mm_mask_reduce_pd(__W, __U, __A, __B) __extension__ ({ \
+  (__m128d) __builtin_ia32_reducepd128_mask ((__v2df) __A, __B, \
+                (__v2df) __W, (__mmask8) __U); })
+
+#define _mm_maskz_reduce_pd(__U, __A, __B) __extension__ ({     \
+  (__m128d) __builtin_ia32_reducepd128_mask ((__v2df) __A, __B, \
+                (__v2df) _mm_setzero_pd(), (__mmask8) __U); })
+
+#define _mm256_reduce_pd(__A, __B) __extension__ ({                \
+  (__m256d) __builtin_ia32_reducepd256_mask ((__v4df) __A, __B,    \
+                (__v4df) _mm256_setzero_pd(), (__mmask8) -1); })
+
+#define _mm256_mask_reduce_pd(__W, __U, __A, __B) __extension__ ({ \
+  (__m256d) __builtin_ia32_reducepd256_mask ((__v4df) __A, __B,    \
+                (__v4df) __W, (__mmask8) __U); })
+
+#define _mm256_maskz_reduce_pd(__U, __A, __B) __extension__ ({     \
+  (__m256d) __builtin_ia32_reducepd256_mask ((__v4df) __A, __B,    \
+                (__v4df) _mm256_setzero_pd(), (__mmask8) __U); })
+
+#define _mm_reduce_ps(__A, __B) __extension__ ({                   \
+  (__m128) __builtin_ia32_reduceps128_mask ((__v4sf) __A, __B,     \
+                (__v4sf) _mm_setzero_ps(), (__mmask8) -1); })
+
+#define _mm_mask_reduce_ps(__W, __U, __A, __B) __extension__ ({    \
+  (__m128) __builtin_ia32_reduceps128_mask ((__v4sf) __A, __B,     \
+                (__v4sf) __W, (__mmask8) __U); })
+
+#define _mm_maskz_reduce_ps(__U, __A, __B) __extension__ ({        \
+  (__m128) __builtin_ia32_reduceps128_mask ((__v4sf) __A, __B,     \
+                (__v4sf) _mm_setzero_ps(), (__mmask8) __U); })
+
+#define _mm256_reduce_ps(__A, __B) __extension__ ({                \
+  (__m256) __builtin_ia32_reduceps256_mask ((__v8sf) __A, __B,     \
+                (__v8sf) _mm256_setzero_ps(), (__mmask8) -1); })
+
+#define _mm256_mask_reduce_ps(__W, __U, __A, __B) __extension__ ({ \
+  (__m256) __builtin_ia32_reduceps256_mask ((__v8sf) __A, __B,     \
+                (__v8sf) __W, (__mmask8) __U); })
+
+#define _mm256_maskz_reduce_ps(__U, __A, __B) __extension__ ({     \
+  (__m256) __builtin_ia32_reduceps256_mask ((__v8sf) __A, __B,     \
+                (__v8sf) _mm256_setzero_ps(), (__mmask8) __U); })
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
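Likewise, a hedged sketch of the AVX512VL+DQ truncating conversion and 64-bit multiply added in avx512vldqintrin.h; the function below and the -mavx512vl -mavx512dq flags are illustrative assumptions, not content of the patch.

#include <immintrin.h>

/* Hypothetical helper: truncate the selected double lanes to signed 64-bit
 * integers and scale them; lanes not selected by `lanes` remain zero because
 * both maskz variants zero-fill unselected elements. */
static inline __m256i truncate_and_scale(__m256d values, __m256i scale,
                                         __mmask8 lanes) {
  __m256i q = _mm256_maskz_cvttpd_epi64(lanes, values);
  return _mm256_maskz_mullo_epi64(lanes, q, scale);
}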
diff --git a/25.0.2/clang-include/avx512vlintrin.h b/25.0.2/clang-include/avx512vlintrin.h
new file mode 100644
index 0000000..8f13536
--- /dev/null
+++ b/25.0.2/clang-include/avx512vlintrin.h
@@ -0,0 +1,4606 @@
+/*===---- avx512vlintrin.h - AVX512VL intrinsics ---------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512vlintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512VLINTRIN_H
+#define __AVX512VLINTRIN_H
+
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl")))
+#define __DEFAULT_FN_ATTRS_BOTH __attribute__((__always_inline__, __nodebug__, __target__("avx512vl, avx512bw")))
+
+/* Integer compare */
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS_BOTH
+_mm_cmpeq_epi32_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_pcmpeqd128_mask((__v4si)__a, (__v4si)__b,
+                                                  (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS_BOTH
+_mm_mask_cmpeq_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_pcmpeqd128_mask((__v4si)__a, (__v4si)__b,
+                                                  __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpeq_epu32_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 0,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpeq_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 0,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS_BOTH
+_mm256_cmpeq_epi32_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_pcmpeqd256_mask((__v8si)__a, (__v8si)__b,
+                                                  (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS_BOTH
+_mm256_mask_cmpeq_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_pcmpeqd256_mask((__v8si)__a, (__v8si)__b,
+                                                  __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmpeq_epu32_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 0,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpeq_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 0,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS_BOTH
+_mm_cmpeq_epi64_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_pcmpeqq128_mask((__v2di)__a, (__v2di)__b,
+                                                  (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS_BOTH
+_mm_mask_cmpeq_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_pcmpeqq128_mask((__v2di)__a, (__v2di)__b,
+                                                  __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpeq_epu64_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 0,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpeq_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 0,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS_BOTH
+_mm256_cmpeq_epi64_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_pcmpeqq256_mask((__v4di)__a, (__v4di)__b,
+                                                  (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS_BOTH
+_mm256_mask_cmpeq_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_pcmpeqq256_mask((__v4di)__a, (__v4di)__b,
+                                                  __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmpeq_epu64_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 0,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpeq_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 0,
+                                                __u);
+}
+
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpge_epi32_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 5,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpge_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 5,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpge_epu32_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 5,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpge_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 5,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmpge_epi32_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 5,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpge_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 5,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmpge_epu32_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 5,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpge_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 5,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpge_epi64_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 5,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpge_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 5,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpge_epu64_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 5,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpge_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 5,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmpge_epi64_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 5,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpge_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 5,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmpge_epu64_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 5,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpge_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 5,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS_BOTH
+_mm_cmpgt_epi32_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_pcmpgtd128_mask((__v4si)__a, (__v4si)__b,
+                                                  (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS_BOTH
+_mm_mask_cmpgt_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_pcmpgtd128_mask((__v4si)__a, (__v4si)__b,
+                                                  __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpgt_epu32_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 6,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpgt_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 6,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS_BOTH
+_mm256_cmpgt_epi32_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_pcmpgtd256_mask((__v8si)__a, (__v8si)__b,
+                                                  (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS_BOTH
+_mm256_mask_cmpgt_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_pcmpgtd256_mask((__v8si)__a, (__v8si)__b,
+                                                  __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmpgt_epu32_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 6,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpgt_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 6,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS_BOTH
+_mm_cmpgt_epi64_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_pcmpgtq128_mask((__v2di)__a, (__v2di)__b,
+                                                  (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS_BOTH
+_mm_mask_cmpgt_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_pcmpgtq128_mask((__v2di)__a, (__v2di)__b,
+                                                  __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpgt_epu64_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 6,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpgt_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 6,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS_BOTH
+_mm256_cmpgt_epi64_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_pcmpgtq256_mask((__v4di)__a, (__v4di)__b,
+                                                  (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS_BOTH
+_mm256_mask_cmpgt_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_pcmpgtq256_mask((__v4di)__a, (__v4di)__b,
+                                                  __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmpgt_epu64_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 6,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpgt_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 6,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmple_epi32_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 2,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmple_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 2,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmple_epu32_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 2,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmple_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 2,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmple_epi32_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 2,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmple_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 2,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmple_epu32_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 2,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmple_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 2,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmple_epi64_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 2,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmple_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 2,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmple_epu64_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 2,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmple_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 2,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmple_epi64_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 2,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmple_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 2,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmple_epu64_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 2,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmple_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 2,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmplt_epi32_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 1,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmplt_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 1,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmplt_epu32_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 1,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmplt_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 1,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmplt_epi32_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 1,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmplt_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 1,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmplt_epu32_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 1,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmplt_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 1,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmplt_epi64_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 1,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmplt_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 1,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmplt_epu64_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 1,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmplt_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 1,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmplt_epi64_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 1,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmplt_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 1,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmplt_epu64_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 1,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmplt_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 1,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpneq_epi32_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 4,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpneq_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 4,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpneq_epu32_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 4,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpneq_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 4,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmpneq_epi32_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 4,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpneq_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 4,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmpneq_epu32_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 4,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpneq_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 4,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpneq_epi64_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 4,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpneq_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 4,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpneq_epu64_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 4,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpneq_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 4,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmpneq_epi64_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 4,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpneq_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 4,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmpneq_epu64_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 4,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpneq_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 4,
+                                                __u);
+}
+
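+/* The masked arithmetic wrappers below follow the usual AVX-512 convention:
+ * the _mask_ forms merge (lanes whose mask bit is zero keep the value of the
+ * pass-through operand __W), while the _maskz_ forms zero those lanes.
+ * A minimal illustrative sketch (not part of this header): add only the even
+ * 32-bit lanes of two vectors, leaving the odd lanes of acc unchanged:
+ *
+ *   __m256i acc, x, y;
+ *   ...
+ *   acc = _mm256_mask_add_epi32(acc, (__mmask8)0x55, x, y);
+ */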
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_add_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+           __m256i __B)
+{
+  return (__m256i) __builtin_ia32_paddd256_mask ((__v8si) __A,
+             (__v8si) __B,
+             (__v8si) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_add_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_paddd256_mask ((__v8si) __A,
+             (__v8si) __B,
+             (__v8si)
+             _mm256_setzero_si256 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_add_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+           __m256i __B)
+{
+  return (__m256i) __builtin_ia32_paddq256_mask ((__v4di) __A,
+             (__v4di) __B,
+             (__v4di) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_add_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_paddq256_mask ((__v4di) __A,
+             (__v4di) __B,
+             (__v4di)
+             _mm256_setzero_si256 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_sub_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+           __m256i __B)
+{
+  return (__m256i) __builtin_ia32_psubd256_mask ((__v8si) __A,
+             (__v8si) __B,
+             (__v8si) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_sub_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_psubd256_mask ((__v8si) __A,
+             (__v8si) __B,
+             (__v8si)
+             _mm256_setzero_si256 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_sub_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+           __m256i __B)
+{
+  return (__m256i) __builtin_ia32_psubq256_mask ((__v4di) __A,
+             (__v4di) __B,
+             (__v4di) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_sub_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_psubq256_mask ((__v4di) __A,
+             (__v4di) __B,
+             (__v4di)
+             _mm256_setzero_si256 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_add_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
+        __m128i __B)
+{
+  return (__m128i) __builtin_ia32_paddd128_mask ((__v4si) __A,
+             (__v4si) __B,
+             (__v4si) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_add_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_paddd128_mask ((__v4si) __A,
+             (__v4si) __B,
+             (__v4si)
+             _mm_setzero_si128 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_add_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+        __m128i __B)
+{
+  return (__m128i) __builtin_ia32_paddq128_mask ((__v2di) __A,
+             (__v2di) __B,
+             (__v2di) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_add_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_paddq128_mask ((__v2di) __A,
+             (__v2di) __B,
+             (__v2di)
+             _mm_setzero_si128 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_sub_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
+        __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psubd128_mask ((__v4si) __A,
+             (__v4si) __B,
+             (__v4si) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_sub_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psubd128_mask ((__v4si) __A,
+             (__v4si) __B,
+             (__v4si)
+             _mm_setzero_si128 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_sub_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+        __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psubq128_mask ((__v2di) __A,
+             (__v2di) __B,
+             (__v2di) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_sub_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psubq128_mask ((__v2di) __A,
+             (__v2di) __B,
+             (__v2di)
+             _mm_setzero_si128 (),
+             (__mmask8) __U);
+}
+
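+/* Note on the mul_epi32 / mul_epu32 wrappers that follow: as with the
+ * unmasked SSE4.1/AVX2 forms, they multiply only the low (even-indexed)
+ * 32-bit element of each 64-bit lane and produce full 64-bit products, so
+ * the mask selects 64-bit result lanes (2 bits used for the 128-bit forms,
+ * 4 bits for the 256-bit forms).  */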
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_mul_epi32 (__m256i __W, __mmask8 __M, __m256i __X,
+           __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_pmuldq256_mask ((__v8si) __X,
+              (__v8si) __Y,
+              (__v4di) __W, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_mul_epi32 (__mmask8 __M, __m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_pmuldq256_mask ((__v8si) __X,
+              (__v8si) __Y,
+              (__v4di)
+              _mm256_setzero_si256 (),
+              __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_mul_epi32 (__m128i __W, __mmask8 __M, __m128i __X,
+        __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_pmuldq128_mask ((__v4si) __X,
+              (__v4si) __Y,
+              (__v2di) __W, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_mul_epi32 (__mmask8 __M, __m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_pmuldq128_mask ((__v4si) __X,
+              (__v4si) __Y,
+              (__v2di)
+              _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_mul_epu32 (__m256i __W, __mmask8 __M, __m256i __X,
+           __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_pmuludq256_mask ((__v8si) __X,
+               (__v8si) __Y,
+               (__v4di) __W, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_mul_epu32 (__mmask8 __M, __m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_pmuludq256_mask ((__v8si) __X,
+               (__v8si) __Y,
+               (__v4di)
+               _mm256_setzero_si256 (),
+               __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_mul_epu32 (__m128i __W, __mmask8 __M, __m128i __X,
+        __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_pmuludq128_mask ((__v4si) __X,
+               (__v4si) __Y,
+               (__v2di) __W, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_mul_epu32 (__mmask8 __M, __m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_pmuludq128_mask ((__v4si) __X,
+               (__v4si) __Y,
+               (__v2di)
+               _mm_setzero_si128 (),
+               __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_mullo_epi32 (__mmask8 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmulld256_mask ((__v8si) __A,
+              (__v8si) __B,
+              (__v8si)
+              _mm256_setzero_si256 (),
+              __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_mullo_epi32 (__m256i __W, __mmask8 __M, __m256i __A,
+       __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmulld256_mask ((__v8si) __A,
+              (__v8si) __B,
+              (__v8si) __W, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_mullo_epi32 (__mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmulld128_mask ((__v4si) __A,
+              (__v4si) __B,
+              (__v4si)
+              _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_mullo_epi32 (__m128i __W, __mmask8 __M, __m128i __A,
+          __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmulld128_mask ((__v4si) __A,
+              (__v4si) __B,
+              (__v4si) __W, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_and_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+           __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pandd256_mask ((__v8si) __A,
+             (__v8si) __B,
+             (__v8si) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_and_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pandd256_mask ((__v8si) __A,
+             (__v8si) __B,
+             (__v8si)
+             _mm256_setzero_si256 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_and_epi32 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pandd128_mask ((__v4si) __A,
+             (__v4si) __B,
+             (__v4si) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_and_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pandd128_mask ((__v4si) __A,
+             (__v4si) __B,
+             (__v4si)
+             _mm_setzero_si128 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_andnot_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+        __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pandnd256_mask ((__v8si) __A,
+              (__v8si) __B,
+              (__v8si) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_andnot_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pandnd256_mask ((__v8si) __A,
+              (__v8si) __B,
+              (__v8si)
+              _mm256_setzero_si256 (),
+              (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_andnot_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
+           __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pandnd128_mask ((__v4si) __A,
+              (__v4si) __B,
+              (__v4si) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_andnot_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pandnd128_mask ((__v4si) __A,
+              (__v4si) __B,
+              (__v4si)
+              _mm_setzero_si128 (),
+              (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_or_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+          __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pord256_mask ((__v8si) __A,
+            (__v8si) __B,
+            (__v8si) __W,
+            (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_or_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pord256_mask ((__v8si) __A,
+            (__v8si) __B,
+            (__v8si)
+            _mm256_setzero_si256 (),
+            (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_or_epi32 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pord128_mask ((__v4si) __A,
+            (__v4si) __B,
+            (__v4si) __W,
+            (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_or_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pord128_mask ((__v4si) __A,
+            (__v4si) __B,
+            (__v4si)
+            _mm_setzero_si128 (),
+            (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_xor_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+           __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pxord256_mask ((__v8si) __A,
+             (__v8si) __B,
+             (__v8si) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_xor_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pxord256_mask ((__v8si) __A,
+             (__v8si) __B,
+             (__v8si)
+             _mm256_setzero_si256 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_xor_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
+        __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pxord128_mask ((__v4si) __A,
+             (__v4si) __B,
+             (__v4si) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_xor_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pxord128_mask ((__v4si) __A,
+             (__v4si) __B,
+             (__v4si)
+             _mm_setzero_si128 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_and_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+           __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pandq256_mask ((__v4di) __A,
+             (__v4di) __B,
+             (__v4di) __W, __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_and_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pandq256_mask ((__v4di) __A,
+             (__v4di) __B,
+             (__v4di)
+             _mm256_setzero_si256 (),
+             __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_and_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+        __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pandq128_mask ((__v2di) __A,
+             (__v2di) __B,
+             (__v2di) __W, __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_and_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pandq128_mask ((__v2di) __A,
+             (__v2di) __B,
+             (__v2di)
+             _mm_setzero_si128 (),
+             __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_andnot_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+        __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pandnq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di) __W, __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_andnot_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pandnq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di)
+              _mm256_setzero_si256 (),
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_andnot_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+           __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pandnq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di) __W, __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_andnot_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pandnq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di)
+              _mm_setzero_si128 (),
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_or_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+          __m256i __B)
+{
+  return (__m256i) __builtin_ia32_porq256_mask ((__v4di) __A,
+            (__v4di) __B,
+            (__v4di) __W,
+            (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_or_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_porq256_mask ((__v4di) __A,
+            (__v4di) __B,
+            (__v4di)
+            _mm256_setzero_si256 (),
+            (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_or_epi64 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_porq128_mask ((__v2di) __A,
+            (__v2di) __B,
+            (__v2di) __W,
+            (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_or_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_porq128_mask ((__v2di) __A,
+            (__v2di) __B,
+            (__v2di)
+            _mm_setzero_si128 (),
+            (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_xor_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+           __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pxorq256_mask ((__v4di) __A,
+             (__v4di) __B,
+             (__v4di) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_xor_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pxorq256_mask ((__v4di) __A,
+             (__v4di) __B,
+             (__v4di)
+             _mm256_setzero_si256 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_xor_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+        __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pxorq128_mask ((__v2di) __A,
+             (__v2di) __B,
+             (__v2di) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_xor_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pxorq128_mask ((__v2di) __A,
+             (__v2di) __B,
+             (__v2di)
+             _mm_setzero_si128 (),
+             (__mmask8) __U);
+}
+
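+/* The generic compare entry points below are macros rather than inline
+ * functions because the underlying builtins require the predicate (p) to be
+ * an integer constant expression.  A minimal usage sketch (illustrative;
+ * assumes the _MM_CMPINT_* constants from avx512fintrin.h, where
+ * _MM_CMPINT_LE == 2):
+ *
+ *   __mmask8 m = _mm256_cmp_epi32_mask(a, b, 2);  // per-lane test: a <= b
+ */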
+#define _mm_cmp_epi32_mask(a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)(__m128i)(a), \
+                                        (__v4si)(__m128i)(b), \
+                                        (p), (__mmask8)-1); })
+
+#define _mm_mask_cmp_epi32_mask(m, a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)(__m128i)(a), \
+                                        (__v4si)(__m128i)(b), \
+                                        (p), (__mmask8)(m)); })
+
+#define _mm_cmp_epu32_mask(a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)(__m128i)(a), \
+                                         (__v4si)(__m128i)(b), \
+                                         (p), (__mmask8)-1); })
+
+#define _mm_mask_cmp_epu32_mask(m, a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)(__m128i)(a), \
+                                         (__v4si)(__m128i)(b), \
+                                         (p), (__mmask8)(m)); })
+
+#define _mm256_cmp_epi32_mask(a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)(__m256i)(a), \
+                                        (__v8si)(__m256i)(b), \
+                                        (p), (__mmask8)-1); })
+
+#define _mm256_mask_cmp_epi32_mask(m, a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)(__m256i)(a), \
+                                        (__v8si)(__m256i)(b), \
+                                        (p), (__mmask8)(m)); })
+
+#define _mm256_cmp_epu32_mask(a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)(__m256i)(a), \
+                                         (__v8si)(__m256i)(b), \
+                                         (p), (__mmask8)-1); })
+
+#define _mm256_mask_cmp_epu32_mask(m, a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)(__m256i)(a), \
+                                         (__v8si)(__m256i)(b), \
+                                         (p), (__mmask8)(m)); })
+
+#define _mm_cmp_epi64_mask(a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)(__m128i)(a), \
+                                        (__v2di)(__m128i)(b), \
+                                        (p), (__mmask8)-1); })
+
+#define _mm_mask_cmp_epi64_mask(m, a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)(__m128i)(a), \
+                                        (__v2di)(__m128i)(b), \
+                                        (p), (__mmask8)(m)); })
+
+#define _mm_cmp_epu64_mask(a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)(__m128i)(a), \
+                                         (__v2di)(__m128i)(b), \
+                                         (p), (__mmask8)-1); })
+
+#define _mm_mask_cmp_epu64_mask(m, a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)(__m128i)(a), \
+                                         (__v2di)(__m128i)(b), \
+                                         (p), (__mmask8)(m)); })
+
+#define _mm256_cmp_epi64_mask(a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)(__m256i)(a), \
+                                        (__v4di)(__m256i)(b), \
+                                        (p), (__mmask8)-1); })
+
+#define _mm256_mask_cmp_epi64_mask(m, a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)(__m256i)(a), \
+                                        (__v4di)(__m256i)(b), \
+                                        (p), (__mmask8)(m)); })
+
+#define _mm256_cmp_epu64_mask(a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)(__m256i)(a), \
+                                         (__v4di)(__m256i)(b), \
+                                         (p), (__mmask8)-1); })
+
+#define _mm256_mask_cmp_epu64_mask(m, a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)(__m256i)(a), \
+                                         (__v4di)(__m256i)(b), \
+                                         (p), (__mmask8)(m)); })
+
+#define _mm256_cmp_ps_mask(a, b, p)  __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpps256_mask((__v8sf)(__m256)(a), \
+                                         (__v8sf)(__m256)(b), \
+                                         (p), (__mmask8)-1); })
+
+#define _mm256_mask_cmp_ps_mask(m, a, b, p)  __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpps256_mask((__v8sf)(__m256)(a), \
+                                         (__v8sf)(__m256)(b), \
+                                         (p), (__mmask8)(m)); })
+
+#define _mm256_cmp_pd_mask(a, b, p)  __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmppd256_mask((__v4df)(__m256d)(a), \
+                                         (__v4df)(__m256d)(b), \
+                                         (p), (__mmask8)-1); })
+
+#define _mm256_mask_cmp_pd_mask(m, a, b, p)  __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmppd256_mask((__v4df)(__m256d)(a), \
+                                         (__v4df)(__m256d)(b), \
+                                         (p), (__mmask8)(m)); })
+
+#define _mm128_cmp_ps_mask(a, b, p)  __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpps128_mask((__v4sf)(__m128)(a), \
+                                         (__v4sf)(__m128)(b), \
+                                         (p), (__mmask8)-1); })
+
+#define _mm128_mask_cmp_ps_mask(m, a, b, p)  __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpps128_mask((__v4sf)(__m128)(a), \
+                                         (__v4sf)(__m128)(b), \
+                                         (p), (__mmask8)(m)); })
+
+#define _mm128_cmp_pd_mask(a, b, p)  __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmppd128_mask((__v2df)(__m128d)(a), \
+                                         (__v2df)(__m128d)(b), \
+                                         (p), (__mmask8)-1); })
+
+#define _mm128_mask_cmp_pd_mask(m, a, b, p)  __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmppd128_mask((__v2df)(__m128d)(a), \
+                                         (__v2df)(__m128d)(b), \
+                                         (p), (__mmask8)(m)); })
+
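+/* Masked FMA convention used below (standard AVX-512 semantics): a lane
+ * whose mask bit is zero is not computed; instead it holds, in the result,
+ *   - the corresponding lane of the first operand __A for the _mask_ forms,
+ *   - the corresponding lane of the addend __C for the _mask3_ forms,
+ *   - zero for the _maskz_ forms.
+ * The fmsub / fnmadd / fnmsub wrappers reuse the vfmadd builtins and simply
+ * negate __C and/or __A, as the unary minus on the casts below shows.  */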
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_fmadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddpd128_mask ((__v2df) __A,
+                                                    (__v2df) __B,
+                                                    (__v2df) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask3_fmadd_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U)
+{
+  return (__m128d) __builtin_ia32_vfmaddpd128_mask3 ((__v2df) __A,
+                                                     (__v2df) __B,
+                                                     (__v2df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_fmadd_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddpd128_maskz ((__v2df) __A,
+                                                     (__v2df) __B,
+                                                     (__v2df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_fmsub_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddpd128_mask ((__v2df) __A,
+                                                    (__v2df) __B,
+                                                    -(__v2df) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_fmsub_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddpd128_maskz ((__v2df) __A,
+                                                     (__v2df) __B,
+                                                     -(__v2df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask3_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U)
+{
+  return (__m128d) __builtin_ia32_vfmaddpd128_mask3 (-(__v2df) __A,
+                                                     (__v2df) __B,
+                                                     (__v2df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_fnmadd_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddpd128_maskz (-(__v2df) __A,
+                                                     (__v2df) __B,
+                                                     (__v2df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_fnmsub_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddpd128_maskz (-(__v2df) __A,
+                                                     (__v2df) __B,
+                                                     -(__v2df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_fmadd_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddpd256_mask ((__v4df) __A,
+                                                    (__v4df) __B,
+                                                    (__v4df) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask3_fmadd_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U)
+{
+  return (__m256d) __builtin_ia32_vfmaddpd256_mask3 ((__v4df) __A,
+                                                     (__v4df) __B,
+                                                     (__v4df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_fmadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddpd256_maskz ((__v4df) __A,
+                                                     (__v4df) __B,
+                                                     (__v4df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_fmsub_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddpd256_mask ((__v4df) __A,
+                                                    (__v4df) __B,
+                                                    -(__v4df) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_fmsub_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddpd256_maskz ((__v4df) __A,
+                                                     (__v4df) __B,
+                                                     -(__v4df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask3_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U)
+{
+  return (__m256d) __builtin_ia32_vfmaddpd256_mask3 (-(__v4df) __A,
+                                                     (__v4df) __B,
+                                                     (__v4df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_fnmadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddpd256_maskz (-(__v4df) __A,
+                                                     (__v4df) __B,
+                                                     (__v4df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_fnmsub_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddpd256_maskz (-(__v4df) __A,
+                                                     (__v4df) __B,
+                                                     -(__v4df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_fmadd_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddps128_mask ((__v4sf) __A,
+                                                   (__v4sf) __B,
+                                                   (__v4sf) __C,
+                                                   (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask3_fmadd_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
+{
+  return (__m128) __builtin_ia32_vfmaddps128_mask3 ((__v4sf) __A,
+                                                    (__v4sf) __B,
+                                                    (__v4sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_fmadd_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddps128_maskz ((__v4sf) __A,
+                                                    (__v4sf) __B,
+                                                    (__v4sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_fmsub_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddps128_mask ((__v4sf) __A,
+                                                   (__v4sf) __B,
+                                                   -(__v4sf) __C,
+                                                   (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_fmsub_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddps128_maskz ((__v4sf) __A,
+                                                    (__v4sf) __B,
+                                                    -(__v4sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask3_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
+{
+  return (__m128) __builtin_ia32_vfmaddps128_mask3 (-(__v4sf) __A,
+                                                    (__v4sf) __B,
+                                                    (__v4sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_fnmadd_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddps128_maskz (-(__v4sf) __A,
+                                                    (__v4sf) __B,
+                                                    (__v4sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_fnmsub_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddps128_maskz (-(__v4sf) __A,
+                                                    (__v4sf) __B,
+                                                    -(__v4sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_fmadd_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddps256_mask ((__v8sf) __A,
+                                                   (__v8sf) __B,
+                                                   (__v8sf) __C,
+                                                   (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask3_fmadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U)
+{
+  return (__m256) __builtin_ia32_vfmaddps256_mask3 ((__v8sf) __A,
+                                                    (__v8sf) __B,
+                                                    (__v8sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_fmadd_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddps256_maskz ((__v8sf) __A,
+                                                    (__v8sf) __B,
+                                                    (__v8sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_fmsub_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddps256_mask ((__v8sf) __A,
+                                                   (__v8sf) __B,
+                                                   -(__v8sf) __C,
+                                                   (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_fmsub_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddps256_maskz ((__v8sf) __A,
+                                                    (__v8sf) __B,
+                                                    -(__v8sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask3_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U)
+{
+  return (__m256) __builtin_ia32_vfmaddps256_mask3 (-(__v8sf) __A,
+                                                    (__v8sf) __B,
+                                                    (__v8sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_fnmadd_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddps256_maskz (-(__v8sf) __A,
+                                                    (__v8sf) __B,
+                                                    (__v8sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_fnmsub_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddps256_maskz (-(__v8sf) __A,
+                                                    (__v8sf) __B,
+                                                    -(__v8sf) __C,
+                                                    (__mmask8) __U);
+}
+
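+/* The fmaddsub / fmsubadd forms alternate per lane: fmaddsub subtracts __C
+ * in even-indexed lanes and adds it in odd-indexed lanes, while fmsubadd
+ * swaps the two.  Masking follows the same _mask_ / _mask3_ / _maskz_
+ * convention described above.  */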
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_fmaddsub_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddsubpd128_mask ((__v2df) __A,
+                                                       (__v2df) __B,
+                                                       (__v2df) __C,
+                                                       (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask3_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U)
+{
+  return (__m128d) __builtin_ia32_vfmaddsubpd128_mask3 ((__v2df) __A,
+                                                        (__v2df) __B,
+                                                        (__v2df) __C,
+                                                        (__mmask8)
+                                                        __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_fmaddsub_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddsubpd128_maskz ((__v2df) __A,
+                                                        (__v2df) __B,
+                                                        (__v2df) __C,
+                                                        (__mmask8)
+                                                        __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_fmsubadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddsubpd128_mask ((__v2df) __A,
+                                                       (__v2df) __B,
+                                                       -(__v2df) __C,
+                                                       (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_fmsubadd_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddsubpd128_maskz ((__v2df) __A,
+                                                        (__v2df) __B,
+                                                        -(__v2df) __C,
+                                                        (__mmask8)
+                                                        __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_fmaddsub_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddsubpd256_mask ((__v4df) __A,
+                                                       (__v4df) __B,
+                                                       (__v4df) __C,
+                                                       (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask3_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U)
+{
+  return (__m256d) __builtin_ia32_vfmaddsubpd256_mask3 ((__v4df) __A,
+                                                        (__v4df) __B,
+                                                        (__v4df) __C,
+                                                        (__mmask8)
+                                                        __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_fmaddsub_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddsubpd256_maskz ((__v4df) __A,
+                                                        (__v4df) __B,
+                                                        (__v4df) __C,
+                                                        (__mmask8)
+                                                        __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_fmsubadd_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddsubpd256_mask ((__v4df) __A,
+                                                       (__v4df) __B,
+                                                       -(__v4df) __C,
+                                                       (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_fmsubadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddsubpd256_maskz ((__v4df) __A,
+                                                        (__v4df) __B,
+                                                        -(__v4df) __C,
+                                                        (__mmask8)
+                                                        __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_fmaddsub_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddsubps128_mask ((__v4sf) __A,
+                                                      (__v4sf) __B,
+                                                      (__v4sf) __C,
+                                                      (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask3_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
+{
+  return (__m128) __builtin_ia32_vfmaddsubps128_mask3 ((__v4sf) __A,
+                                                       (__v4sf) __B,
+                                                       (__v4sf) __C,
+                                                       (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_fmaddsub_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddsubps128_maskz ((__v4sf) __A,
+                                                       (__v4sf) __B,
+                                                       (__v4sf) __C,
+                                                       (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_fmsubadd_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddsubps128_mask ((__v4sf) __A,
+                                                      (__v4sf) __B,
+                                                      -(__v4sf) __C,
+                                                      (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_fmsubadd_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddsubps128_maskz ((__v4sf) __A,
+                                                       (__v4sf) __B,
+                                                       -(__v4sf) __C,
+                                                       (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_fmaddsub_ps(__m256 __A, __mmask8 __U, __m256 __B,
+                         __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddsubps256_mask ((__v8sf) __A,
+                                                      (__v8sf) __B,
+                                                      (__v8sf) __C,
+                                                      (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask3_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U)
+{
+  return (__m256) __builtin_ia32_vfmaddsubps256_mask3 ((__v8sf) __A,
+                                                       (__v8sf) __B,
+                                                       (__v8sf) __C,
+                                                       (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_fmaddsub_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddsubps256_maskz ((__v8sf) __A,
+                                                       (__v8sf) __B,
+                                                       (__v8sf) __C,
+                                                       (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_fmsubadd_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddsubps256_mask ((__v8sf) __A,
+                                                      (__v8sf) __B,
+                                                      -(__v8sf) __C,
+                                                      (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_fmsubadd_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddsubps256_maskz ((__v8sf) __A,
+                                                       (__v8sf) __B,
+                                                       -(__v8sf) __C,
+                                                       (__mmask8) __U);
+}
+
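+/* mask3 fused multiply-subtract: the result is merged into __C under mask __U. */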
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask3_fmsub_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U)
+{
+  return (__m128d) __builtin_ia32_vfmsubpd128_mask3 ((__v2df) __A,
+                                                     (__v2df) __B,
+                                                     (__v2df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask3_fmsub_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U)
+{
+  return (__m256d) __builtin_ia32_vfmsubpd256_mask3 ((__v4df) __A,
+                                                     (__v4df) __B,
+                                                     (__v4df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask3_fmsub_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
+{
+  return (__m128) __builtin_ia32_vfmsubps128_mask3 ((__v4sf) __A,
+                                                    (__v4sf) __B,
+                                                    (__v4sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask3_fmsub_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U)
+{
+  return (__m256) __builtin_ia32_vfmsubps256_mask3 ((__v8sf) __A,
+                                                    (__v8sf) __B,
+                                                    (__v8sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask3_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U)
+{
+  return (__m128d) __builtin_ia32_vfmsubaddpd128_mask3 ((__v2df) __A,
+                                                        (__v2df) __B,
+                                                        (__v2df) __C,
+                                                        (__mmask8)
+                                                        __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask3_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U)
+{
+  return (__m256d) __builtin_ia32_vfmsubaddpd256_mask3 ((__v4df) __A,
+                                                        (__v4df) __B,
+                                                        (__v4df) __C,
+                                                        (__mmask8)
+                                                        __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask3_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
+{
+  return (__m128) __builtin_ia32_vfmsubaddps128_mask3 ((__v4sf) __A,
+                                                       (__v4sf) __B,
+                                                       (__v4sf) __C,
+                                                       (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask3_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U)
+{
+  return (__m256) __builtin_ia32_vfmsubaddps256_mask3 ((__v8sf) __A,
+                                                       (__v8sf) __B,
+                                                       (__v8sf) __C,
+                                                       (__mmask8) __U);
+}
+
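+/* Masked fused negated multiply-add and multiply-subtract. */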
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_fnmadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfnmaddpd128_mask ((__v2df) __A,
+                                                     (__v2df) __B,
+                                                     (__v2df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_fnmadd_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfnmaddpd256_mask ((__v4df) __A,
+                                                     (__v4df) __B,
+                                                     (__v4df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_fnmadd_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfnmaddps128_mask ((__v4sf) __A,
+                                                    (__v4sf) __B,
+                                                    (__v4sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_fnmadd_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfnmaddps256_mask ((__v8sf) __A,
+                                                    (__v8sf) __B,
+                                                    (__v8sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_fnmsub_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfnmsubpd128_mask ((__v2df) __A,
+                                                     (__v2df) __B,
+                                                     (__v2df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask3_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U)
+{
+  return (__m128d) __builtin_ia32_vfnmsubpd128_mask3 ((__v2df) __A,
+                                                      (__v2df) __B,
+                                                      (__v2df) __C,
+                                                      (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_fnmsub_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfnmsubpd256_mask ((__v4df) __A,
+                                                     (__v4df) __B,
+                                                     (__v4df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask3_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U)
+{
+  return (__m256d) __builtin_ia32_vfnmsubpd256_mask3 ((__v4df) __A,
+                                                      (__v4df) __B,
+                                                      (__v4df) __C,
+                                                      (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_fnmsub_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfnmsubps128_mask ((__v4sf) __A,
+                                                    (__v4sf) __B,
+                                                    (__v4sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask3_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
+{
+  return (__m128) __builtin_ia32_vfnmsubps128_mask3 ((__v4sf) __A,
+                                                     (__v4sf) __B,
+                                                     (__v4sf) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_fnmsub_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfnmsubps256_mask ((__v8sf) __A,
+                                                    (__v8sf) __B,
+                                                    (__v8sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask3_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U)
+{
+  return (__m256) __builtin_ia32_vfnmsubps256_mask3 ((__v8sf) __A,
+                                                     (__v8sf) __B,
+                                                     (__v8sf) __C,
+                                                     (__mmask8) __U);
+}
+
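+/* Masked addition. */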
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_add_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_addpd128_mask ((__v2df) __A,
+             (__v2df) __B,
+             (__v2df) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_add_pd (__mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_addpd128_mask ((__v2df) __A,
+             (__v2df) __B,
+             (__v2df)
+             _mm_setzero_pd (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_add_pd (__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d) __builtin_ia32_addpd256_mask ((__v4df) __A,
+             (__v4df) __B,
+             (__v4df) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_add_pd (__mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d) __builtin_ia32_addpd256_mask ((__v4df) __A,
+             (__v4df) __B,
+             (__v4df)
+             _mm256_setzero_pd (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_add_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_addps128_mask ((__v4sf) __A,
+            (__v4sf) __B,
+            (__v4sf) __W,
+            (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_add_ps (__mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_addps128_mask ((__v4sf) __A,
+            (__v4sf) __B,
+            (__v4sf)
+            _mm_setzero_ps (),
+            (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_add_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256) __builtin_ia32_addps256_mask ((__v8sf) __A,
+            (__v8sf) __B,
+            (__v8sf) __W,
+            (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_add_ps (__mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256) __builtin_ia32_addps256_mask ((__v8sf) __A,
+            (__v8sf) __B,
+            (__v8sf)
+            _mm256_setzero_ps (),
+            (__mmask8) __U);
+}
+
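+/* Masked blend: select elements from __W where the mask bit is set, otherwise from __A. */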
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_blend_epi32 (__mmask8 __U, __m128i __A, __m128i __W) {
+  return (__m128i) __builtin_ia32_blendmd_128_mask ((__v4si) __A,
+                (__v4si) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_blend_epi32 (__mmask8 __U, __m256i __A, __m256i __W) {
+  return (__m256i) __builtin_ia32_blendmd_256_mask ((__v8si) __A,
+                (__v8si) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_blend_pd (__mmask8 __U, __m128d __A, __m128d __W) {
+  return (__m128d) __builtin_ia32_blendmpd_128_mask ((__v2df) __A,
+                 (__v2df) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_blend_pd (__mmask8 __U, __m256d __A, __m256d __W) {
+  return (__m256d) __builtin_ia32_blendmpd_256_mask ((__v4df) __A,
+                 (__v4df) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_blend_ps (__mmask8 __U, __m128 __A, __m128 __W) {
+  return (__m128) __builtin_ia32_blendmps_128_mask ((__v4sf) __A,
+                (__v4sf) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_blend_ps (__mmask8 __U, __m256 __A, __m256 __W) {
+  return (__m256) __builtin_ia32_blendmps_256_mask ((__v8sf) __A,
+                (__v8sf) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_blend_epi64 (__mmask8 __U, __m128i __A, __m128i __W) {
+  return (__m128i) __builtin_ia32_blendmq_128_mask ((__v2di) __A,
+                (__v2di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_blend_epi64 (__mmask8 __U, __m256i __A, __m256i __W) {
+  return (__m256i) __builtin_ia32_blendmq_256_mask ((__v4di) __A,
+                (__v4di) __W,
+                (__mmask8) __U);
+}
+
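+/* Masked compress: pack the active elements of __A into the low part of the result. */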
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_compress_pd (__m128d __W, __mmask8 __U, __m128d __A) {
+  return (__m128d) __builtin_ia32_compressdf128_mask ((__v2df) __A,
+                  (__v2df) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_compress_pd (__mmask8 __U, __m128d __A) {
+  return (__m128d) __builtin_ia32_compressdf128_mask ((__v2df) __A,
+                  (__v2df)
+                  _mm_setzero_pd (),
+                  (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_compress_pd (__m256d __W, __mmask8 __U, __m256d __A) {
+  return (__m256d) __builtin_ia32_compressdf256_mask ((__v4df) __A,
+                  (__v4df) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_compress_pd (__mmask8 __U, __m256d __A) {
+  return (__m256d) __builtin_ia32_compressdf256_mask ((__v4df) __A,
+                  (__v4df)
+                  _mm256_setzero_pd (),
+                  (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_compress_epi64 (__m128i __W, __mmask8 __U, __m128i __A) {
+  return (__m128i) __builtin_ia32_compressdi128_mask ((__v2di) __A,
+                  (__v2di) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_compress_epi64 (__mmask8 __U, __m128i __A) {
+  return (__m128i) __builtin_ia32_compressdi128_mask ((__v2di) __A,
+                  (__v2di)
+                  _mm_setzero_si128 (),
+                  (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_compress_epi64 (__m256i __W, __mmask8 __U, __m256i __A) {
+  return (__m256i) __builtin_ia32_compressdi256_mask ((__v4di) __A,
+                  (__v4di) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_compress_epi64 (__mmask8 __U, __m256i __A) {
+  return (__m256i) __builtin_ia32_compressdi256_mask ((__v4di) __A,
+                  (__v4di)
+                  _mm256_setzero_si256 (),
+                  (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_compress_ps (__m128 __W, __mmask8 __U, __m128 __A) {
+  return (__m128) __builtin_ia32_compresssf128_mask ((__v4sf) __A,
+                 (__v4sf) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_compress_ps (__mmask8 __U, __m128 __A) {
+  return (__m128) __builtin_ia32_compresssf128_mask ((__v4sf) __A,
+                 (__v4sf)
+                 _mm_setzero_ps (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_compress_ps (__m256 __W, __mmask8 __U, __m256 __A) {
+  return (__m256) __builtin_ia32_compresssf256_mask ((__v8sf) __A,
+                 (__v8sf) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_compress_ps (__mmask8 __U, __m256 __A) {
+  return (__m256) __builtin_ia32_compresssf256_mask ((__v8sf) __A,
+                 (__v8sf)
+                 _mm256_setzero_ps (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_compress_epi32 (__m128i __W, __mmask8 __U, __m128i __A) {
+  return (__m128i) __builtin_ia32_compresssi128_mask ((__v4si) __A,
+                  (__v4si) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_compress_epi32 (__mmask8 __U, __m128i __A) {
+  return (__m128i) __builtin_ia32_compresssi128_mask ((__v4si) __A,
+                  (__v4si)
+                  _mm_setzero_si128 (),
+                  (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_compress_epi32 (__m256i __W, __mmask8 __U, __m256i __A) {
+  return (__m256i) __builtin_ia32_compresssi256_mask ((__v8si) __A,
+                  (__v8si) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_compress_epi32 (__mmask8 __U, __m256i __A) {
+  return (__m256i) __builtin_ia32_compresssi256_mask ((__v8si) __A,
+                  (__v8si)
+                  _mm256_setzero_si256 (),
+                  (__mmask8) __U);
+}
+
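+/* Masked compress-store: write the active elements of __A contiguously to unaligned memory at __P. */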
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_compressstoreu_pd (void *__P, __mmask8 __U, __m128d __A) {
+  __builtin_ia32_compressstoredf128_mask ((__v2df *) __P,
+            (__v2df) __A,
+            (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_compressstoreu_pd (void *__P, __mmask8 __U, __m256d __A) {
+  __builtin_ia32_compressstoredf256_mask ((__v4df *) __P,
+            (__v4df) __A,
+            (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_compressstoreu_epi64 (void *__P, __mmask8 __U, __m128i __A) {
+  __builtin_ia32_compressstoredi128_mask ((__v2di *) __P,
+            (__v2di) __A,
+            (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_compressstoreu_epi64 (void *__P, __mmask8 __U, __m256i __A) {
+  __builtin_ia32_compressstoredi256_mask ((__v4di *) __P,
+            (__v4di) __A,
+            (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_compressstoreu_ps (void *__P, __mmask8 __U, __m128 __A) {
+  __builtin_ia32_compressstoresf128_mask ((__v4sf *) __P,
+            (__v4sf) __A,
+            (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_compressstoreu_ps (void *__P, __mmask8 __U, __m256 __A) {
+  __builtin_ia32_compressstoresf256_mask ((__v8sf *) __P,
+            (__v8sf) __A,
+            (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_compressstoreu_epi32 (void *__P, __mmask8 __U, __m128i __A) {
+  __builtin_ia32_compressstoresi128_mask ((__v4si *) __P,
+            (__v4si) __A,
+            (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_compressstoreu_epi32 (void *__P, __mmask8 __U, __m256i __A) {
+  __builtin_ia32_compressstoresi256_mask ((__v8si *) __P,
+            (__v8si) __A,
+            (__mmask8) __U);
+}
+
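+/* Masked packed conversions (cvt and truncating cvtt variants). */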
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi32_pd (__m128d __W, __mmask8 __U, __m128i __A) {
+  return (__m128d) __builtin_ia32_cvtdq2pd128_mask ((__v4si) __A,
+                (__v2df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepi32_pd (__mmask8 __U, __m128i __A) {
+  return (__m128d) __builtin_ia32_cvtdq2pd128_mask ((__v4si) __A,
+                (__v2df)
+                _mm_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi32_pd (__m256d __W, __mmask8 __U, __m128i __A) {
+  return (__m256d) __builtin_ia32_cvtdq2pd256_mask ((__v4si) __A,
+                (__v4df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepi32_pd (__mmask8 __U, __m128i __A) {
+  return (__m256d) __builtin_ia32_cvtdq2pd256_mask ((__v4si) __A,
+                (__v4df)
+                _mm256_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi32_ps (__m128 __W, __mmask8 __U, __m128i __A) {
+  return (__m128) __builtin_ia32_cvtdq2ps128_mask ((__v4si) __A,
+               (__v4sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepi32_ps (__mmask8 __U, __m128i __A) {
+  return (__m128) __builtin_ia32_cvtdq2ps128_mask ((__v4si) __A,
+               (__v4sf)
+               _mm_setzero_ps (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi32_ps (__m256 __W, __mmask8 __U, __m256i __A) {
+  return (__m256) __builtin_ia32_cvtdq2ps256_mask ((__v8si) __A,
+               (__v8sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepi32_ps (__mmask8 __U, __m256i __A) {
+  return (__m256) __builtin_ia32_cvtdq2ps256_mask ((__v8si) __A,
+               (__v8sf)
+               _mm256_setzero_ps (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtpd_epi32 (__m128i __W, __mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2dq128_mask ((__v2df) __A,
+                (__v4si) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtpd_epi32 (__mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2dq128_mask ((__v2df) __A,
+                (__v4si)
+                _mm_setzero_si128 (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtpd_epi32 (__m128i __W, __mmask8 __U, __m256d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2dq256_mask ((__v4df) __A,
+                (__v4si) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtpd_epi32 (__mmask8 __U, __m256d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2dq256_mask ((__v4df) __A,
+                (__v4si)
+                _mm_setzero_si128 (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_cvtpd_ps (__m128 __W, __mmask8 __U, __m128d __A) {
+  return (__m128) __builtin_ia32_cvtpd2ps_mask ((__v2df) __A,
+            (__v4sf) __W,
+            (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_cvtpd_ps (__mmask8 __U, __m128d __A) {
+  return (__m128) __builtin_ia32_cvtpd2ps_mask ((__v2df) __A,
+            (__v4sf)
+            _mm_setzero_ps (),
+            (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm256_mask_cvtpd_ps (__m128 __W, __mmask8 __U, __m256d __A) {
+  return (__m128) __builtin_ia32_cvtpd2ps256_mask ((__v4df) __A,
+               (__v4sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtpd_ps (__mmask8 __U, __m256d __A) {
+  return (__m128) __builtin_ia32_cvtpd2ps256_mask ((__v4df) __A,
+               (__v4sf)
+               _mm_setzero_ps (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtpd_epu32 (__m128d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2udq128_mask ((__v2df) __A,
+                 (__v4si)
+                 _mm_setzero_si128 (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtpd_epu32 (__m128i __W, __mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2udq128_mask ((__v2df) __A,
+                 (__v4si) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtpd_epu32 (__mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2udq128_mask ((__v2df) __A,
+                 (__v4si)
+                 _mm_setzero_si128 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtpd_epu32 (__m256d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2udq256_mask ((__v4df) __A,
+                 (__v4si)
+                 _mm_setzero_si128 (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtpd_epu32 (__m128i __W, __mmask8 __U, __m256d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2udq256_mask ((__v4df) __A,
+                 (__v4si) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtpd_epu32 (__mmask8 __U, __m256d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2udq256_mask ((__v4df) __A,
+                 (__v4si)
+                 _mm_setzero_si128 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtps_epi32 (__m128i __W, __mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvtps2dq128_mask ((__v4sf) __A,
+                (__v4si) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtps_epi32 (__mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvtps2dq128_mask ((__v4sf) __A,
+                (__v4si)
+                _mm_setzero_si128 (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtps_epi32 (__m256i __W, __mmask8 __U, __m256 __A) {
+  return (__m256i) __builtin_ia32_cvtps2dq256_mask ((__v8sf) __A,
+                (__v8si) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtps_epi32 (__mmask8 __U, __m256 __A) {
+  return (__m256i) __builtin_ia32_cvtps2dq256_mask ((__v8sf) __A,
+                (__v8si)
+                _mm256_setzero_si256 (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_cvtps_pd (__m128d __W, __mmask8 __U, __m128 __A) {
+  return (__m128d) __builtin_ia32_cvtps2pd128_mask ((__v4sf) __A,
+                (__v2df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_cvtps_pd (__mmask8 __U, __m128 __A) {
+  return (__m128d) __builtin_ia32_cvtps2pd128_mask ((__v4sf) __A,
+                (__v2df)
+                _mm_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_cvtps_pd (__m256d __W, __mmask8 __U, __m128 __A) {
+  return (__m256d) __builtin_ia32_cvtps2pd256_mask ((__v4sf) __A,
+                (__v4df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtps_pd (__mmask8 __U, __m128 __A) {
+  return (__m256d) __builtin_ia32_cvtps2pd256_mask ((__v4sf) __A,
+                (__v4df)
+                _mm256_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtps_epu32 (__m128 __A) {
+  return (__m128i) __builtin_ia32_cvtps2udq128_mask ((__v4sf) __A,
+                 (__v4si)
+                 _mm_setzero_si128 (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtps_epu32 (__m128i __W, __mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvtps2udq128_mask ((__v4sf) __A,
+                 (__v4si) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtps_epu32 (__mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvtps2udq128_mask ((__v4sf) __A,
+                 (__v4si)
+                 _mm_setzero_si128 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtps_epu32 (__m256 __A) {
+  return (__m256i) __builtin_ia32_cvtps2udq256_mask ((__v8sf) __A,
+                 (__v8si)
+                 _mm256_setzero_si256 (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtps_epu32 (__m256i __W, __mmask8 __U, __m256 __A) {
+  return (__m256i) __builtin_ia32_cvtps2udq256_mask ((__v8sf) __A,
+                 (__v8si) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtps_epu32 (__mmask8 __U, __m256 __A) {
+  return (__m256i) __builtin_ia32_cvtps2udq256_mask ((__v8sf) __A,
+                 (__v8si)
+                 _mm256_setzero_si256 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvttpd_epi32 (__m128i __W, __mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2dq128_mask ((__v2df) __A,
+                 (__v4si) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvttpd_epi32 (__mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2dq128_mask ((__v2df) __A,
+                 (__v4si)
+                 _mm_setzero_si128 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvttpd_epi32 (__m128i __W, __mmask8 __U, __m256d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2dq256_mask ((__v4df) __A,
+                 (__v4si) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvttpd_epi32 (__mmask8 __U, __m256d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2dq256_mask ((__v4df) __A,
+                 (__v4si)
+                 _mm_setzero_si128 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvttpd_epu32 (__m128d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2udq128_mask ((__v2df) __A,
+                  (__v4si)
+                  _mm_setzero_si128 (),
+                  (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvttpd_epu32 (__m128i __W, __mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2udq128_mask ((__v2df) __A,
+                  (__v4si) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvttpd_epu32 (__mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2udq128_mask ((__v2df) __A,
+                  (__v4si)
+                  _mm_setzero_si128 (),
+                  (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvttpd_epu32 (__m256d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2udq256_mask ((__v4df) __A,
+                  (__v4si)
+                  _mm_setzero_si128 (),
+                  (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvttpd_epu32 (__m128i __W, __mmask8 __U, __m256d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2udq256_mask ((__v4df) __A,
+                  (__v4si) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvttpd_epu32 (__mmask8 __U, __m256d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2udq256_mask ((__v4df) __A,
+                  (__v4si)
+                  _mm_setzero_si128 (),
+                  (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvttps_epi32 (__m128i __W, __mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvttps2dq128_mask ((__v4sf) __A,
+                 (__v4si) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvttps_epi32 (__mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvttps2dq128_mask ((__v4sf) __A,
+                 (__v4si)
+                 _mm_setzero_si128 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvttps_epi32 (__m256i __W, __mmask8 __U, __m256 __A) {
+  return (__m256i) __builtin_ia32_cvttps2dq256_mask ((__v8sf) __A,
+                 (__v8si) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvttps_epi32 (__mmask8 __U, __m256 __A) {
+  return (__m256i) __builtin_ia32_cvttps2dq256_mask ((__v8sf) __A,
+                 (__v8si)
+                 _mm256_setzero_si256 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvttps_epu32 (__m128 __A) {
+  return (__m128i) __builtin_ia32_cvttps2udq128_mask ((__v4sf) __A,
+                  (__v4si)
+                  _mm_setzero_si128 (),
+                  (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvttps_epu32 (__m128i __W, __mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvttps2udq128_mask ((__v4sf) __A,
+                  (__v4si) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvttps_epu32 (__mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvttps2udq128_mask ((__v4sf) __A,
+                  (__v4si)
+                  _mm_setzero_si128 (),
+                  (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvttps_epu32 (__m256 __A) {
+  return (__m256i) __builtin_ia32_cvttps2udq256_mask ((__v8sf) __A,
+                  (__v8si)
+                  _mm256_setzero_si256 (),
+                  (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvttps_epu32 (__m256i __W, __mmask8 __U, __m256 __A) {
+  return (__m256i) __builtin_ia32_cvttps2udq256_mask ((__v8sf) __A,
+                  (__v8si) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvttps_epu32 (__mmask8 __U, __m256 __A) {
+  return (__m256i) __builtin_ia32_cvttps2udq256_mask ((__v8sf) __A,
+                  (__v8si)
+                  _mm256_setzero_si256 (),
+                  (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cvtepu32_pd (__m128i __A) {
+  return (__m128d) __builtin_ia32_cvtudq2pd128_mask ((__v4si) __A,
+                 (__v2df)
+                 _mm_setzero_pd (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_cvtepu32_pd (__m128d __W, __mmask8 __U, __m128i __A) {
+  return (__m128d) __builtin_ia32_cvtudq2pd128_mask ((__v4si) __A,
+                 (__v2df) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepu32_pd (__mmask8 __U, __m128i __A) {
+  return (__m128d) __builtin_ia32_cvtudq2pd128_mask ((__v4si) __A,
+                 (__v2df)
+                 _mm_setzero_pd (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_cvtepu32_pd (__m128i __A) {
+  return (__m256d) __builtin_ia32_cvtudq2pd256_mask ((__v4si) __A,
+                 (__v4df)
+                 _mm256_setzero_pd (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepu32_pd (__m256d __W, __mmask8 __U, __m128i __A) {
+  return (__m256d) __builtin_ia32_cvtudq2pd256_mask ((__v4si) __A,
+                 (__v4df) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepu32_pd (__mmask8 __U, __m128i __A) {
+  return (__m256d) __builtin_ia32_cvtudq2pd256_mask ((__v4si) __A,
+                 (__v4df)
+                 _mm256_setzero_pd (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtepu32_ps (__m128i __A) {
+  return (__m128) __builtin_ia32_cvtudq2ps128_mask ((__v4si) __A,
+                (__v4sf)
+                _mm_setzero_ps (),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_cvtepu32_ps (__m128 __W, __mmask8 __U, __m128i __A) {
+  return (__m128) __builtin_ia32_cvtudq2ps128_mask ((__v4si) __A,
+                (__v4sf) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepu32_ps (__mmask8 __U, __m128i __A) {
+  return (__m128) __builtin_ia32_cvtudq2ps128_mask ((__v4si) __A,
+                (__v4sf)
+                _mm_setzero_ps (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_cvtepu32_ps (__m256i __A) {
+  return (__m256) __builtin_ia32_cvtudq2ps256_mask ((__v8si) __A,
+                (__v8sf)
+                _mm256_setzero_ps (),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepu32_ps (__m256 __W, __mmask8 __U, __m256i __A) {
+  return (__m256) __builtin_ia32_cvtudq2ps256_mask ((__v8si) __A,
+                (__v8sf) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepu32_ps (__mmask8 __U, __m256i __A) {
+  return (__m256) __builtin_ia32_cvtudq2ps256_mask ((__v8si) __A,
+                (__v8sf)
+                _mm256_setzero_ps (),
+                (__mmask8) __U);
+}
+
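+/* Masked division. */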
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_div_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_divpd_mask ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_div_pd (__mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_divpd_mask ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df)
+                _mm_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_div_pd (__m256d __W, __mmask8 __U, __m256d __A,
+        __m256d __B) {
+  return (__m256d) __builtin_ia32_divpd256_mask ((__v4df) __A,
+             (__v4df) __B,
+             (__v4df) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_div_pd (__mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d) __builtin_ia32_divpd256_mask ((__v4df) __A,
+             (__v4df) __B,
+             (__v4df)
+             _mm256_setzero_pd (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_div_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_divps_mask ((__v4sf) __A,
+               (__v4sf) __B,
+               (__v4sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_div_ps (__mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_divps_mask ((__v4sf) __A,
+               (__v4sf) __B,
+               (__v4sf)
+               _mm_setzero_ps (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_div_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256) __builtin_ia32_divps256_mask ((__v8sf) __A,
+            (__v8sf) __B,
+            (__v8sf) __W,
+            (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_div_ps (__mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256) __builtin_ia32_divps256_mask ((__v8sf) __A,
+            (__v8sf) __B,
+            (__v8sf)
+            _mm256_setzero_ps (),
+            (__mmask8) __U);
+}
+
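+/* Masked expand: distribute the low elements of __A to the positions selected by the mask. */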
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_expand_pd (__m128d __W, __mmask8 __U, __m128d __A) {
+  return (__m128d) __builtin_ia32_expanddf128_mask ((__v2df) __A,
+                (__v2df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_expand_pd (__mmask8 __U, __m128d __A) {
+  return (__m128d) __builtin_ia32_expanddf128_mask ((__v2df) __A,
+                 (__v2df)
+                 _mm_setzero_pd (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_expand_pd (__m256d __W, __mmask8 __U, __m256d __A) {
+  return (__m256d) __builtin_ia32_expanddf256_mask ((__v4df) __A,
+                (__v4df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_expand_pd (__mmask8 __U, __m256d __A) {
+  return (__m256d) __builtin_ia32_expanddf256_mask ((__v4df) __A,
+                 (__v4df)
+                 _mm256_setzero_pd (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_expand_epi64 (__m128i __W, __mmask8 __U, __m128i __A) {
+  return (__m128i) __builtin_ia32_expanddi128_mask ((__v2di) __A,
+                (__v2di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_expand_epi64 (__mmask8 __U, __m128i __A) {
+  return (__m128i) __builtin_ia32_expanddi128_mask ((__v2di) __A,
+                 (__v2di)
+                 _mm_setzero_si128 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_expand_epi64 (__m256i __W, __mmask8 __U, __m256i __A) {
+  return (__m256i) __builtin_ia32_expanddi256_mask ((__v4di) __A,
+                (__v4di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_expand_epi64 (__mmask8 __U, __m256i __A) {
+  return (__m256i) __builtin_ia32_expanddi256_mask ((__v4di) __A,
+                 (__v4di)
+                 _mm256_setzero_si256 (),
+                 (__mmask8) __U);
+}
+
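+/* Masked expand-load: read elements from unaligned memory at __P into the positions selected by the mask. */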
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_expandloadu_pd (__m128d __W, __mmask8 __U, void const *__P) {
+  return (__m128d) __builtin_ia32_expandloaddf128_mask ((__v2df *) __P,
+              (__v2df) __W,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_expandloadu_pd (__mmask8 __U, void const *__P) {
+  return (__m128d) __builtin_ia32_expandloaddf128_mask ((__v2df *) __P,
+               (__v2df)
+               _mm_setzero_pd (),
+               (__mmask8)
+               __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_expandloadu_pd (__m256d __W, __mmask8 __U, void const *__P) {
+  return (__m256d) __builtin_ia32_expandloaddf256_mask ((__v4df *) __P,
+              (__v4df) __W,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_expandloadu_pd (__mmask8 __U, void const *__P) {
+  return (__m256d) __builtin_ia32_expandloaddf256_mask ((__v4df *) __P,
+               (__v4df)
+               _mm256_setzero_pd (),
+               (__mmask8)
+               __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_expandloadu_epi64 (__m128i __W, __mmask8 __U, void const *__P) {
+  return (__m128i) __builtin_ia32_expandloaddi128_mask ((__v2di *) __P,
+              (__v2di) __W,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_expandloadu_epi64 (__mmask8 __U, void const *__P) {
+  return (__m128i) __builtin_ia32_expandloaddi128_mask ((__v2di *) __P,
+               (__v2di)
+               _mm_setzero_si128 (),
+               (__mmask8)
+               __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_expandloadu_epi64 (__m256i __W, __mmask8 __U,
+             void const *__P) {
+  return (__m256i) __builtin_ia32_expandloaddi256_mask ((__v4di *) __P,
+              (__v4di) __W,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_expandloadu_epi64 (__mmask8 __U, void const *__P) {
+  return (__m256i) __builtin_ia32_expandloaddi256_mask ((__v4di *) __P,
+               (__v4di)
+               _mm256_setzero_si256 (),
+               (__mmask8)
+               __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_expandloadu_ps (__m128 __W, __mmask8 __U, void const *__P) {
+  return (__m128) __builtin_ia32_expandloadsf128_mask ((__v4sf *) __P,
+                   (__v4sf) __W,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_expandloadu_ps (__mmask8 __U, void const *__P) {
+  return (__m128) __builtin_ia32_expandloadsf128_mask ((__v4sf *) __P,
+              (__v4sf)
+              _mm_setzero_ps (),
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_expandloadu_ps (__m256 __W, __mmask8 __U, void const *__P) {
+  return (__m256) __builtin_ia32_expandloadsf256_mask ((__v8sf *) __P,
+                   (__v8sf) __W,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_expandloadu_ps (__mmask8 __U, void const *__P) {
+  return (__m256) __builtin_ia32_expandloadsf256_mask ((__v8sf *) __P,
+              (__v8sf)
+              _mm256_setzero_ps (),
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_expandloadu_epi32 (__m128i __W, __mmask8 __U, void const *__P) {
+  return (__m128i) __builtin_ia32_expandloadsi128_mask ((__v4si *) __P,
+              (__v4si) __W,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_expandloadu_epi32 (__mmask8 __U, void const *__P) {
+  return (__m128i) __builtin_ia32_expandloadsi128_mask ((__v4si *) __P,
+               (__v4si)
+               _mm_setzero_si128 (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_expandloadu_epi32 (__m256i __W, __mmask8 __U,
+             void const *__P) {
+  return (__m256i) __builtin_ia32_expandloadsi256_mask ((__v8si *) __P,
+              (__v8si) __W,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_expandloadu_epi32 (__mmask8 __U, void const *__P) {
+  return (__m256i) __builtin_ia32_expandloadsi256_mask ((__v8si *) __P,
+               (__v8si)
+               _mm256_setzero_si256 (),
+               (__mmask8)
+               __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_expand_ps (__m128 __W, __mmask8 __U, __m128 __A) {
+  return (__m128) __builtin_ia32_expandsf128_mask ((__v4sf) __A,
+               (__v4sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_expand_ps (__mmask8 __U, __m128 __A) {
+  return (__m128) __builtin_ia32_expandsf128_mask ((__v4sf) __A,
+                (__v4sf)
+                _mm_setzero_ps (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_expand_ps (__m256 __W, __mmask8 __U, __m256 __A) {
+  return (__m256) __builtin_ia32_expandsf256_mask ((__v8sf) __A,
+               (__v8sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_expand_ps (__mmask8 __U, __m256 __A) {
+  return (__m256) __builtin_ia32_expandsf256_mask ((__v8sf) __A,
+                (__v8sf)
+                _mm256_setzero_ps (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_expand_epi32 (__m128i __W, __mmask8 __U, __m128i __A) {
+  return (__m128i) __builtin_ia32_expandsi128_mask ((__v4si) __A,
+                (__v4si) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_expand_epi32 (__mmask8 __U, __m128i __A) {
+  return (__m128i) __builtin_ia32_expandsi128_mask ((__v4si) __A,
+                 (__v4si)
+                 _mm_setzero_si128 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_expand_epi32 (__m256i __W, __mmask8 __U, __m256i __A) {
+  return (__m256i) __builtin_ia32_expandsi256_mask ((__v8si) __A,
+                (__v8si) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_expand_epi32 (__mmask8 __U, __m256i __A) {
+  return (__m256i) __builtin_ia32_expandsi256_mask ((__v8si) __A,
+                 (__v8si)
+                 _mm256_setzero_si256 (),
+                 (__mmask8) __U);
+}
+
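+/* getexp: extract the exponent of each element as a floating-point value. */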
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_getexp_pd (__m128d __A) {
+  return (__m128d) __builtin_ia32_getexppd128_mask ((__v2df) __A,
+                (__v2df)
+                _mm_setzero_pd (),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_getexp_pd (__m128d __W, __mmask8 __U, __m128d __A) {
+  return (__m128d) __builtin_ia32_getexppd128_mask ((__v2df) __A,
+                (__v2df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_getexp_pd (__mmask8 __U, __m128d __A) {
+  return (__m128d) __builtin_ia32_getexppd128_mask ((__v2df) __A,
+                (__v2df)
+                _mm_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_getexp_pd (__m256d __A) {
+  return (__m256d) __builtin_ia32_getexppd256_mask ((__v4df) __A,
+                (__v4df)
+                _mm256_setzero_pd (),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_getexp_pd (__m256d __W, __mmask8 __U, __m256d __A) {
+  return (__m256d) __builtin_ia32_getexppd256_mask ((__v4df) __A,
+                (__v4df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_getexp_pd (__mmask8 __U, __m256d __A) {
+  return (__m256d) __builtin_ia32_getexppd256_mask ((__v4df) __A,
+                (__v4df)
+                _mm256_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_getexp_ps (__m128 __A) {
+  return (__m128) __builtin_ia32_getexpps128_mask ((__v4sf) __A,
+               (__v4sf)
+               _mm_setzero_ps (),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_getexp_ps (__m128 __W, __mmask8 __U, __m128 __A) {
+  return (__m128) __builtin_ia32_getexpps128_mask ((__v4sf) __A,
+               (__v4sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_getexp_ps (__mmask8 __U, __m128 __A) {
+  return (__m128) __builtin_ia32_getexpps128_mask ((__v4sf) __A,
+               (__v4sf)
+               _mm_setzero_ps (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_getexp_ps (__m256 __A) {
+  return (__m256) __builtin_ia32_getexpps256_mask ((__v8sf) __A,
+               (__v8sf)
+               _mm256_setzero_ps (),
+               (__mmask8) -1);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_getexp_ps (__m256 __W, __mmask8 __U, __m256 __A) {
+  return (__m256) __builtin_ia32_getexpps256_mask ((__v8sf) __A,
+               (__v8sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_getexp_ps (__mmask8 __U, __m256 __A) {
+  return (__m256) __builtin_ia32_getexpps256_mask ((__v8sf) __A,
+               (__v8sf)
+               _mm256_setzero_ps (),
+               (__mmask8) __U);
+}
+
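+/* Masked maximum and minimum. */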
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_max_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_maxpd_mask ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_max_pd (__mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_maxpd_mask ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df)
+                _mm_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_max_pd (__m256d __W, __mmask8 __U, __m256d __A,
+        __m256d __B) {
+  return (__m256d) __builtin_ia32_maxpd256_mask ((__v4df) __A,
+             (__v4df) __B,
+             (__v4df) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_max_pd (__mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d) __builtin_ia32_maxpd256_mask ((__v4df) __A,
+             (__v4df) __B,
+             (__v4df)
+             _mm256_setzero_pd (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_max_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_maxps_mask ((__v4sf) __A,
+               (__v4sf) __B,
+               (__v4sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_max_ps (__mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_maxps_mask ((__v4sf) __A,
+               (__v4sf) __B,
+               (__v4sf)
+               _mm_setzero_ps (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_max_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256) __builtin_ia32_maxps256_mask ((__v8sf) __A,
+            (__v8sf) __B,
+            (__v8sf) __W,
+            (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_max_ps (__mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256) __builtin_ia32_maxps256_mask ((__v8sf) __A,
+            (__v8sf) __B,
+            (__v8sf)
+            _mm256_setzero_ps (),
+            (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_min_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_minpd_mask ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_min_pd (__mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_minpd_mask ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df)
+                _mm_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_min_pd (__m256d __W, __mmask8 __U, __m256d __A,
+        __m256d __B) {
+  return (__m256d) __builtin_ia32_minpd256_mask ((__v4df) __A,
+             (__v4df) __B,
+             (__v4df) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_min_pd (__mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d) __builtin_ia32_minpd256_mask ((__v4df) __A,
+             (__v4df) __B,
+             (__v4df)
+             _mm256_setzero_pd (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_min_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_minps_mask ((__v4sf) __A,
+               (__v4sf) __B,
+               (__v4sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_min_ps (__mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_minps_mask ((__v4sf) __A,
+               (__v4sf) __B,
+               (__v4sf)
+               _mm_setzero_ps (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_min_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256) __builtin_ia32_minps256_mask ((__v8sf) __A,
+            (__v8sf) __B,
+            (__v8sf) __W,
+            (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_min_ps (__mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256) __builtin_ia32_minps256_mask ((__v8sf) __A,
+            (__v8sf) __B,
+            (__v8sf)
+            _mm256_setzero_ps (),
+            (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_mul_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_mulpd_mask ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_mul_pd (__mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_mulpd_mask ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df)
+                _mm_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_mul_pd (__m256d __W, __mmask8 __U, __m256d __A,
+        __m256d __B) {
+  return (__m256d) __builtin_ia32_mulpd256_mask ((__v4df) __A,
+             (__v4df) __B,
+             (__v4df) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_mul_pd (__mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d) __builtin_ia32_mulpd256_mask ((__v4df) __A,
+             (__v4df) __B,
+             (__v4df)
+             _mm256_setzero_pd (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_mul_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_mulps_mask ((__v4sf) __A,
+               (__v4sf) __B,
+               (__v4sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_mul_ps (__mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_mulps_mask ((__v4sf) __A,
+               (__v4sf) __B,
+               (__v4sf)
+               _mm_setzero_ps (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_mul_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256) __builtin_ia32_mulps256_mask ((__v8sf) __A,
+            (__v8sf) __B,
+            (__v8sf) __W,
+            (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_mul_ps (__mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256) __builtin_ia32_mulps256_mask ((__v8sf) __A,
+            (__v8sf) __B,
+            (__v8sf)
+            _mm256_setzero_ps (),
+            (__mmask8) __U);
+}
+
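+/* A minimal usage sketch for the masked arithmetic above, assuming
+ * <immintrin.h> is included and the translation unit is built with
+ * AVX-512VL enabled (e.g. -mavx512vl): the _mask_ variants merge the
+ * result into __W wherever the corresponding bit of __U is set and keep
+ * __W elsewhere, while the _maskz_ variants zero the unselected lanes.
+ *
+ *   __m256d a   = _mm256_set1_pd(2.0);
+ *   __m256d b   = _mm256_set1_pd(3.0);
+ *   __m256d src = _mm256_setzero_pd();
+ *   __m256d r0 = _mm256_mask_mul_pd(src, 0x5, a, b);  // lanes 0,2 = 6.0; lanes 1,3 kept from src
+ *   __m256d r1 = _mm256_maskz_mul_pd(0x5, a, b);      // lanes 0,2 = 6.0; lanes 1,3 zeroed
+ */
+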
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_abs_epi32 (__m128i __W, __mmask8 __U, __m128i __A) {
+  return (__m128i) __builtin_ia32_pabsd128_mask ((__v4si) __A,
+             (__v4si) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_abs_epi32 (__mmask8 __U, __m128i __A) {
+  return (__m128i) __builtin_ia32_pabsd128_mask ((__v4si) __A,
+             (__v4si)
+             _mm_setzero_si128 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_abs_epi32 (__m256i __W, __mmask8 __U, __m256i __A) {
+  return (__m256i) __builtin_ia32_pabsd256_mask ((__v8si) __A,
+             (__v8si) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_abs_epi32 (__mmask8 __U, __m256i __A) {
+  return (__m256i) __builtin_ia32_pabsd256_mask ((__v8si) __A,
+             (__v8si)
+             _mm256_setzero_si256 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_abs_epi64 (__m128i __A) {
+  return (__m128i) __builtin_ia32_pabsq128_mask ((__v2di) __A,
+             (__v2di)
+             _mm_setzero_si128 (),
+             (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_abs_epi64 (__m128i __W, __mmask8 __U, __m128i __A) {
+  return (__m128i) __builtin_ia32_pabsq128_mask ((__v2di) __A,
+             (__v2di) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_abs_epi64 (__mmask8 __U, __m128i __A) {
+  return (__m128i) __builtin_ia32_pabsq128_mask ((__v2di) __A,
+             (__v2di)
+             _mm_setzero_si128 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_abs_epi64 (__m256i __A) {
+  return (__m256i) __builtin_ia32_pabsq256_mask ((__v4di) __A,
+             (__v4di)
+             _mm256_setzero_si256 (),
+             (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_abs_epi64 (__m256i __W, __mmask8 __U, __m256i __A) {
+  return (__m256i) __builtin_ia32_pabsq256_mask ((__v4di) __A,
+             (__v4di) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_abs_epi64 (__mmask8 __U, __m256i __A) {
+  return (__m256i) __builtin_ia32_pabsq256_mask ((__v4di) __A,
+             (__v4di)
+             _mm256_setzero_si256 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_max_epi32 (__mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pmaxsd128_mask ((__v4si) __A,
+              (__v4si) __B,
+              (__v4si)
+              _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_max_epi32 (__m128i __W, __mmask8 __M, __m128i __A,
+        __m128i __B) {
+  return (__m128i) __builtin_ia32_pmaxsd128_mask ((__v4si) __A,
+              (__v4si) __B,
+              (__v4si) __W, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_max_epi32 (__mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pmaxsd256_mask ((__v8si) __A,
+              (__v8si) __B,
+              (__v8si)
+              _mm256_setzero_si256 (),
+              __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_max_epi32 (__m256i __W, __mmask8 __M, __m256i __A,
+           __m256i __B) {
+  return (__m256i) __builtin_ia32_pmaxsd256_mask ((__v8si) __A,
+              (__v8si) __B,
+              (__v8si) __W, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_max_epi64 (__mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pmaxsq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di)
+              _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_max_epi64 (__m128i __W, __mmask8 __M, __m128i __A,
+        __m128i __B) {
+  return (__m128i) __builtin_ia32_pmaxsq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di) __W, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_max_epi64 (__m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pmaxsq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di)
+              _mm_setzero_si128 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_max_epi64 (__mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pmaxsq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di)
+              _mm256_setzero_si256 (),
+              __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_max_epi64 (__m256i __W, __mmask8 __M, __m256i __A,
+           __m256i __B) {
+  return (__m256i) __builtin_ia32_pmaxsq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di) __W, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_max_epi64 (__m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pmaxsq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di)
+              _mm256_setzero_si256 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_max_epu32 (__mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pmaxud128_mask ((__v4si) __A,
+              (__v4si) __B,
+              (__v4si)
+              _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_max_epu32 (__m128i __W, __mmask8 __M, __m128i __A,
+        __m128i __B) {
+  return (__m128i) __builtin_ia32_pmaxud128_mask ((__v4si) __A,
+              (__v4si) __B,
+              (__v4si) __W, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_max_epu32 (__mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pmaxud256_mask ((__v8si) __A,
+              (__v8si) __B,
+              (__v8si)
+              _mm256_setzero_si256 (),
+              __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_max_epu32 (__m256i __W, __mmask8 __M, __m256i __A,
+           __m256i __B) {
+  return (__m256i) __builtin_ia32_pmaxud256_mask ((__v8si) __A,
+              (__v8si) __B,
+              (__v8si) __W, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_max_epu64 (__mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pmaxuq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di)
+              _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_max_epu64 (__m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pmaxuq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di)
+              _mm_setzero_si128 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_max_epu64 (__m128i __W, __mmask8 __M, __m128i __A,
+        __m128i __B) {
+  return (__m128i) __builtin_ia32_pmaxuq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di) __W, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_max_epu64 (__mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pmaxuq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di)
+              _mm256_setzero_si256 (),
+              __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_max_epu64 (__m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pmaxuq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di)
+              _mm256_setzero_si256 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_max_epu64 (__m256i __W, __mmask8 __M, __m256i __A,
+           __m256i __B) {
+  return (__m256i) __builtin_ia32_pmaxuq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di) __W, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_min_epi32 (__mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pminsd128_mask ((__v4si) __A,
+              (__v4si) __B,
+              (__v4si)
+              _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_min_epi32 (__m128i __W, __mmask8 __M, __m128i __A,
+        __m128i __B) {
+  return (__m128i) __builtin_ia32_pminsd128_mask ((__v4si) __A,
+              (__v4si) __B,
+              (__v4si) __W, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_min_epi32 (__mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pminsd256_mask ((__v8si) __A,
+              (__v8si) __B,
+              (__v8si)
+              _mm256_setzero_si256 (),
+              __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_min_epi32 (__m256i __W, __mmask8 __M, __m256i __A,
+           __m256i __B) {
+  return (__m256i) __builtin_ia32_pminsd256_mask ((__v8si) __A,
+              (__v8si) __B,
+              (__v8si) __W, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_min_epi64 (__m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pminsq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di)
+              _mm_setzero_si128 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_min_epi64 (__m128i __W, __mmask8 __M, __m128i __A,
+        __m128i __B) {
+  return (__m128i) __builtin_ia32_pminsq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di) __W, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_min_epi64 (__mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pminsq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di)
+              _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_min_epi64 (__m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pminsq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di)
+              _mm256_setzero_si256 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_min_epi64 (__m256i __W, __mmask8 __M, __m256i __A,
+           __m256i __B) {
+  return (__m256i) __builtin_ia32_pminsq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di) __W, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_min_epi64 (__mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pminsq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di)
+              _mm256_setzero_si256 (),
+              __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_min_epu32 (__mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pminud128_mask ((__v4si) __A,
+              (__v4si) __B,
+              (__v4si)
+              _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_min_epu32 (__m128i __W, __mmask8 __M, __m128i __A,
+        __m128i __B) {
+  return (__m128i) __builtin_ia32_pminud128_mask ((__v4si) __A,
+              (__v4si) __B,
+              (__v4si) __W, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_min_epu32 (__mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pminud256_mask ((__v8si) __A,
+              (__v8si) __B,
+              (__v8si)
+              _mm256_setzero_si256 (),
+              __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_min_epu32 (__m256i __W, __mmask8 __M, __m256i __A,
+           __m256i __B) {
+  return (__m256i) __builtin_ia32_pminud256_mask ((__v8si) __A,
+              (__v8si) __B,
+              (__v8si) __W, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_min_epu64 (__m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pminuq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di)
+              _mm_setzero_si128 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_min_epu64 (__m128i __W, __mmask8 __M, __m128i __A,
+        __m128i __B) {
+  return (__m128i) __builtin_ia32_pminuq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di) __W, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_min_epu64 (__mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pminuq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di)
+              _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_min_epu64 (__m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pminuq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di)
+              _mm256_setzero_si256 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_min_epu64 (__m256i __W, __mmask8 __M, __m256i __A,
+           __m256i __B) {
+  return (__m256i) __builtin_ia32_pminuq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di) __W, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_min_epu64 (__mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pminuq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di)
+              _mm256_setzero_si256 (),
+              __M);
+}
+
+#define _mm_roundscale_pd(__A, __imm) __extension__ ({ \
+  (__m128d) __builtin_ia32_rndscalepd_128_mask ((__v2df) __A, \
+                   __imm, (__v2df) _mm_setzero_pd (), (__mmask8) -1); })
+
+
+#define _mm_mask_roundscale_pd(__W, __U, __A, __imm) __extension__ ({ \
+  (__m128d) __builtin_ia32_rndscalepd_128_mask ((__v2df) __A, __imm, \
+                   (__v2df) __W, (__mmask8) __U); })
+
+
+#define _mm_maskz_roundscale_pd(__U, __A, __imm) __extension__ ({ \
+  (__m128d) __builtin_ia32_rndscalepd_128_mask ((__v2df) __A, __imm, \
+                   (__v2df) _mm_setzero_pd (), (__mmask8) __U); })
+
+
+#define _mm256_roundscale_pd(__A, __imm) __extension__ ({ \
+  (__m256d) __builtin_ia32_rndscalepd_256_mask ((__v4df) __A, __imm, \
+                   (__v4df) _mm256_setzero_pd (), (__mmask8) -1); })
+
+
+#define _mm256_mask_roundscale_pd(__W, __U, __A, __imm) __extension__ ({ \
+  (__m256d) __builtin_ia32_rndscalepd_256_mask ((__v4df) __A, __imm, \
+                   (__v4df) __W, (__mmask8) __U); })
+
+
+#define _mm256_maskz_roundscale_pd(__U, __A, __imm)  __extension__ ({ \
+  (__m256d) __builtin_ia32_rndscalepd_256_mask ((__v4df) __A, __imm, \
+                   (__v4df) _mm256_setzero_pd(), (__mmask8) __U); })
+
+#define _mm_roundscale_ps(__A, __imm)  __extension__ ({ \
+  (__m128) __builtin_ia32_rndscaleps_128_mask ((__v4sf) __A, __imm, \
+                  (__v4sf) _mm_setzero_ps(), (__mmask8) -1); })
+
+
+#define _mm_mask_roundscale_ps(__W, __U, __A, __imm)  __extension__ ({ \
+  (__m128) __builtin_ia32_rndscaleps_128_mask ((__v4sf) __A, __imm, \
+                  (__v4sf) __W, (__mmask8) __U); })
+
+
+#define _mm_maskz_roundscale_ps(__U, __A, __imm)  __extension__ ({ \
+  (__m128) __builtin_ia32_rndscaleps_128_mask ((__v4sf) __A, __imm, \
+                  (__v4sf) _mm_setzero_ps(), (__mmask8) __U); })
+
+#define _mm256_roundscale_ps(__A, __imm)  __extension__ ({ \
+  (__m256) __builtin_ia32_rndscaleps_256_mask ((__v8sf) __A,__imm, \
+                  (__v8sf) _mm256_setzero_ps(), (__mmask8) -1); })
+
+#define _mm256_mask_roundscale_ps(__W, __U, __A,__imm)  __extension__ ({ \
+  (__m256) __builtin_ia32_rndscaleps_256_mask ((__v8sf) __A, __imm, \
+                  (__v8sf) __W, (__mmask8) __U); })
+
+
+#define _mm256_maskz_roundscale_ps(__U, __A, __imm)  __extension__ ({ \
+  (__m256) __builtin_ia32_rndscaleps_256_mask ((__v8sf) __A, __imm, \
+                  (__v8sf) _mm256_setzero_ps(), (__mmask8) __U); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_scalef_pd (__m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_scalefpd128_mask ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df)
+                _mm_setzero_pd (),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_scalef_pd (__m128d __W, __mmask8 __U, __m128d __A,
+        __m128d __B) {
+  return (__m128d) __builtin_ia32_scalefpd128_mask ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_scalef_pd (__mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_scalefpd128_mask ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df)
+                _mm_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_scalef_pd (__m256d __A, __m256d __B) {
+  return (__m256d) __builtin_ia32_scalefpd256_mask ((__v4df) __A,
+                (__v4df) __B,
+                (__v4df)
+                _mm256_setzero_pd (),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_scalef_pd (__m256d __W, __mmask8 __U, __m256d __A,
+           __m256d __B) {
+  return (__m256d) __builtin_ia32_scalefpd256_mask ((__v4df) __A,
+                (__v4df) __B,
+                (__v4df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_scalef_pd (__mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d) __builtin_ia32_scalefpd256_mask ((__v4df) __A,
+                (__v4df) __B,
+                (__v4df)
+                _mm256_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_scalef_ps (__m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_scalefps128_mask ((__v4sf) __A,
+               (__v4sf) __B,
+               (__v4sf)
+               _mm_setzero_ps (),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_scalef_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_scalefps128_mask ((__v4sf) __A,
+               (__v4sf) __B,
+               (__v4sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_scalef_ps (__mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_scalefps128_mask ((__v4sf) __A,
+               (__v4sf) __B,
+               (__v4sf)
+               _mm_setzero_ps (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_scalef_ps (__m256 __A, __m256 __B) {
+  return (__m256) __builtin_ia32_scalefps256_mask ((__v8sf) __A,
+               (__v8sf) __B,
+               (__v8sf)
+               _mm256_setzero_ps (),
+               (__mmask8) -1);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_scalef_ps (__m256 __W, __mmask8 __U, __m256 __A,
+           __m256 __B) {
+  return (__m256) __builtin_ia32_scalefps256_mask ((__v8sf) __A,
+               (__v8sf) __B,
+               (__v8sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256) __builtin_ia32_scalefps256_mask ((__v8sf) __A,
+               (__v8sf) __B,
+               (__v8sf)
+               _mm256_setzero_ps (),
+               (__mmask8) __U);
+}
+
+#define _mm_i64scatter_pd(__addr,__index, __v1, __scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv2df(__addr, (__mmask8) 0xFF, (__v2di) __index, \
+                              (__v2df) __v1, __scale); })
+
+#define _mm_mask_i64scatter_pd(__addr, __mask, __index, __v1, \
+                               __scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv2df (__addr, __mask, (__v2di) __index, \
+                               (__v2df) __v1, __scale); })
+
+
+#define _mm_i64scatter_epi64(__addr, __index, __v1, __scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv2di (__addr, (__mmask8) 0xFF, \
+        (__v2di) __index, (__v2di) __v1, __scale); })
+
+#define _mm_mask_i64scatter_epi64(__addr, __mask, __index, __v1,\
+                                  __scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv2di (__addr, __mask, (__v2di) __index,\
+        (__v2di) __v1, __scale); })
+
+#define _mm256_i64scatter_pd(__addr, __index, __v1, __scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv4df (__addr, (__mmask8) 0xFF,\
+        (__v4di) __index, (__v4df) __v1, __scale); })
+
+#define _mm256_mask_i64scatter_pd(__addr, __mask, __index, __v1,\
+                                   __scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv4df (__addr, __mask, (__v4di) __index,\
+        (__v4df) __v1, __scale); })
+
+#define _mm256_i64scatter_epi64(__addr, __index, __v1, __scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv4di (__addr, (__mmask8) 0xFF, (__v4di) __index,\
+                               (__v4di) __v1, __scale); })
+
+#define _mm256_mask_i64scatter_epi64(__addr, __mask, __index, __v1,\
+                                      __scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv4di (__addr, __mask, (__v4di) __index,\
+        (__v4di) __v1, __scale); })
+
+#define _mm_i64scatter_ps(__addr, __index, __v1, __scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv4sf (__addr, (__mmask8) 0xFF,\
+        (__v2di) __index, (__v4sf) __v1, __scale); })
+
+#define _mm_mask_i64scatter_ps(__addr, __mask, __index, __v1, \
+                                __scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv4sf (__addr, __mask, (__v2di) __index,\
+        (__v4sf) __v1, __scale); })
+
+#define _mm_i64scatter_epi32(__addr, __index, __v1, \
+                              __scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv4si (__addr, (__mmask8) 0xFF,\
+        (__v2di) __index, (__v4si) __v1, __scale); })
+
+#define _mm_mask_i64scatter_epi32(__addr, __mask, __index, __v1,\
+         __scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv4si (__addr, __mask, (__v2di) __index,\
+        (__v4si) __v1, __scale); })
+
+#define _mm256_i64scatter_ps(__addr, __index, __v1, __scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv8sf (__addr, (__mmask8) 0xFF, (__v4di) __index, \
+                              (__v4sf) __v1, __scale); })
+
+#define _mm256_mask_i64scatter_ps(__addr, __mask, __index, __v1, \
+                                   __scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv8sf (__addr, __mask, (__v4di) __index, \
+        (__v4sf) __v1, __scale); })
+
+#define _mm256_i64scatter_epi32(__addr, __index, __v1, __scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv8si (__addr, (__mmask8) 0xFF, \
+        (__v4di) __index, (__v4si) __v1, __scale); })
+
+#define _mm256_mask_i64scatter_epi32(__addr, __mask, __index, __v1, \
+                                      __scale) __extension__ ({  \
+  __builtin_ia32_scatterdiv8si(__addr, __mask, (__v4di) __index, \
+        (__v4si) __v1, __scale); })
+
+#define _mm_i32scatter_pd(__addr, __index, __v1,         \
+                          __scale) __extension__ ({      \
+  __builtin_ia32_scattersiv2df (__addr, (__mmask8) 0xFF, \
+        (__v4si) __index, (__v2df) __v1, __scale); })
+
+#define _mm_mask_i32scatter_pd(__addr, __mask, __index, __v1,    \
+                                __scale) __extension__ ({        \
+  __builtin_ia32_scattersiv2df (__addr, __mask, (__v4si) __index,\
+         (__v2df) __v1, __scale); })
+
+#define _mm_i32scatter_epi64(__addr, __index, __v1, __scale) __extension__ ({ \
+  __builtin_ia32_scattersiv2di (__addr, (__mmask8) 0xFF,                       \
+        (__v4si) __index, (__v2di) __v1, __scale); })
+
+#define _mm_mask_i32scatter_epi64(__addr, __mask, __index, __v1, \
+         __scale) __extension__ ({                                \
+  __builtin_ia32_scattersiv2di (__addr, __mask, (__v4si) __index, \
+        (__v2di) __v1, __scale); })
+
+#define _mm256_i32scatter_pd(__addr, __index, __v1, __scale) __extension__ ({ \
+  __builtin_ia32_scattersiv4df (__addr, (__mmask8) 0xFF,                      \
+        (__v4si) __index, (__v4df) __v1, __scale); })
+
+#define _mm256_mask_i32scatter_pd(__addr, __mask, __index, __v1, \
+         __scale) __extension__ ({                                \
+  __builtin_ia32_scattersiv4df (__addr, __mask, (__v4si) __index, \
+        (__v4df) __v1, __scale); })
+
+#define _mm256_i32scatter_epi64(__addr, __index, __v1,    \
+                                __scale) __extension__ ({ \
+  __builtin_ia32_scattersiv4di (__addr, (__mmask8) 0xFF,  \
+        (__v4si) __index, (__v4di) __v1, __scale); })
+
+#define _mm256_mask_i32scatter_epi64(__addr, __mask, __index, __v1, \
+            __scale) __extension__ ({                               \
+  __builtin_ia32_scattersiv4di (__addr, __mask, (__v4si) __index,   \
+        (__v4di) __v1, __scale); })
+
+#define _mm_i32scatter_ps(__addr, __index, __v1, __scale) __extension__ ({ \
+  __builtin_ia32_scattersiv4sf (__addr, (__mmask8) 0xFF,                   \
+        (__v4si) __index, (__v4sf) __v1, __scale); })
+
+#define _mm_mask_i32scatter_ps(__addr, __mask, __index, __v1,     \
+                               __scale) __extension__ ({          \
+  __builtin_ia32_scattersiv4sf (__addr, __mask, (__v4si) __index, \
+        (__v4sf) __v1, __scale); })
+
+#define _mm_i32scatter_epi32(__addr, __index, __v1, __scale) __extension__ ({ \
+  __builtin_ia32_scattersiv4si (__addr, (__mmask8) 0xFF,                       \
+        (__v4si) __index, (__v4si) __v1, __scale); })
+
+#define _mm_mask_i32scatter_epi32(__addr, __mask, __index, __v1, \
+                                  __scale) __extension__ ({      \
+  __builtin_ia32_scattersiv4si (__addr, __mask, (__v4si) __index,\
+        (__v4si) __v1, __scale); })
+
+#define _mm256_i32scatter_ps(__addr, __index, __v1, __scale) __extension__ ({ \
+  __builtin_ia32_scattersiv8sf (__addr, (__mmask8) 0xFF,                      \
+        (__v8si) __index, (__v8sf) __v1, __scale); })
+
+#define _mm256_mask_i32scatter_ps(__addr, __mask, __index, __v1, \
+                                   __scale) __extension__ ({     \
+  __builtin_ia32_scattersiv8sf (__addr, __mask, (__v8si) __index,\
+        (__v8sf) __v1, __scale); })
+
+#define _mm256_i32scatter_epi32(__addr, __index, __v1, __scale) __extension__ ({ \
+  __builtin_ia32_scattersiv8si (__addr, (__mmask8) 0xFF,                         \
+        (__v8si) __index, (__v8si) __v1, __scale); })
+
+#define _mm256_mask_i32scatter_epi32(__addr, __mask, __index, __v1, \
+            __scale) __extension__ ({                                \
+  __builtin_ia32_scattersiv8si (__addr, __mask, (__v8si) __index,    \
+        (__v8si) __v1, __scale); })
+
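+/* A minimal usage sketch for the scatter macros above, assuming
+ * <immintrin.h> is included and AVX-512VL/AVX-512F code generation is
+ * enabled (e.g. -mavx512vl).  __scale is the byte stride applied to each
+ * index, so sizeof(double) scatters into a plain double array:
+ *
+ *   double buf[8] = {0};
+ *   __m128i idx = _mm_set_epi32(6, 4, 2, 0);   // elements 0..3 hold 0, 2, 4, 6
+ *   __m256d v   = _mm256_set1_pd(1.0);
+ *   _mm256_i32scatter_pd(buf, idx, v, 8);      // writes buf[0], buf[2], buf[4], buf[6]
+ */
+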
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_sqrt_pd (__m128d __W, __mmask8 __U, __m128d __A) {
+  return (__m128d) __builtin_ia32_sqrtpd128_mask ((__v2df) __A,
+              (__v2df) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_sqrt_pd (__mmask8 __U, __m128d __A) {
+  return (__m128d) __builtin_ia32_sqrtpd128_mask ((__v2df) __A,
+              (__v2df)
+              _mm_setzero_pd (),
+              (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_sqrt_pd (__m256d __W, __mmask8 __U, __m256d __A) {
+  return (__m256d) __builtin_ia32_sqrtpd256_mask ((__v4df) __A,
+              (__v4df) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_sqrt_pd (__mmask8 __U, __m256d __A) {
+  return (__m256d) __builtin_ia32_sqrtpd256_mask ((__v4df) __A,
+              (__v4df)
+              _mm256_setzero_pd (),
+              (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_sqrt_ps (__m128 __W, __mmask8 __U, __m128 __A) {
+  return (__m128) __builtin_ia32_sqrtps128_mask ((__v4sf) __A,
+             (__v4sf) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_sqrt_ps (__mmask8 __U, __m128 __A) {
+  return (__m128) __builtin_ia32_sqrtps128_mask ((__v4sf) __A,
+             (__v4sf)
+             _mm_setzero_ps (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_sqrt_ps (__m256 __W, __mmask8 __U, __m256 __A) {
+  return (__m256) __builtin_ia32_sqrtps256_mask ((__v8sf) __A,
+             (__v8sf) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_sqrt_ps (__mmask8 __U, __m256 __A) {
+  return (__m256) __builtin_ia32_sqrtps256_mask ((__v8sf) __A,
+             (__v8sf)
+             _mm256_setzero_ps (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_sub_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_subpd128_mask ((__v2df) __A,
+             (__v2df) __B,
+             (__v2df) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_sub_pd (__mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_subpd128_mask ((__v2df) __A,
+             (__v2df) __B,
+             (__v2df)
+             _mm_setzero_pd (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_sub_pd (__m256d __W, __mmask8 __U, __m256d __A,
+        __m256d __B) {
+  return (__m256d) __builtin_ia32_subpd256_mask ((__v4df) __A,
+             (__v4df) __B,
+             (__v4df) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_sub_pd (__mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d) __builtin_ia32_subpd256_mask ((__v4df) __A,
+             (__v4df) __B,
+             (__v4df)
+             _mm256_setzero_pd (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_sub_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_subps128_mask ((__v4sf) __A,
+            (__v4sf) __B,
+            (__v4sf) __W,
+            (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_sub_ps (__mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_subps128_mask ((__v4sf) __A,
+            (__v4sf) __B,
+            (__v4sf)
+            _mm_setzero_ps (),
+            (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_sub_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256) __builtin_ia32_subps256_mask ((__v8sf) __A,
+            (__v8sf) __B,
+            (__v8sf) __W,
+            (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_sub_ps (__mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256) __builtin_ia32_subps256_mask ((__v8sf) __A,
+            (__v8sf) __B,
+            (__v8sf)
+            _mm256_setzero_ps (),
+            (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask2_permutex2var_epi32 (__m128i __A, __m128i __I, __mmask8 __U,
+            __m128i __B) {
+  return (__m128i) __builtin_ia32_vpermi2vard128_mask ((__v4si) __A,
+                   (__v4si) __I
+                   /* idx */ ,
+                   (__v4si) __B,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask2_permutex2var_epi32 (__m256i __A, __m256i __I,
+         __mmask8 __U, __m256i __B) {
+  return (__m256i) __builtin_ia32_vpermi2vard256_mask ((__v8si) __A,
+                   (__v8si) __I
+                   /* idx */ ,
+                   (__v8si) __B,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask2_permutex2var_pd (__m128d __A, __m128i __I, __mmask8 __U,
+         __m128d __B) {
+  return (__m128d) __builtin_ia32_vpermi2varpd128_mask ((__v2df) __A,
+              (__v2di) __I
+              /* idx */ ,
+              (__v2df) __B,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask2_permutex2var_pd (__m256d __A, __m256i __I, __mmask8 __U,
+            __m256d __B) {
+  return (__m256d) __builtin_ia32_vpermi2varpd256_mask ((__v4df) __A,
+              (__v4di) __I
+              /* idx */ ,
+              (__v4df) __B,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask2_permutex2var_ps (__m128 __A, __m128i __I, __mmask8 __U,
+         __m128 __B) {
+  return (__m128) __builtin_ia32_vpermi2varps128_mask ((__v4sf) __A,
+                   (__v4si) __I
+                   /* idx */ ,
+                   (__v4sf) __B,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask2_permutex2var_ps (__m256 __A, __m256i __I, __mmask8 __U,
+            __m256 __B) {
+  return (__m256) __builtin_ia32_vpermi2varps256_mask ((__v8sf) __A,
+                   (__v8si) __I
+                   /* idx */ ,
+                   (__v8sf) __B,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask2_permutex2var_epi64 (__m128i __A, __m128i __I, __mmask8 __U,
+            __m128i __B) {
+  return (__m128i) __builtin_ia32_vpermi2varq128_mask ((__v2di) __A,
+                   (__v2di) __I
+                   /* idx */ ,
+                   (__v2di) __B,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask2_permutex2var_epi64 (__m256i __A, __m256i __I,
+         __mmask8 __U, __m256i __B) {
+  return (__m256i) __builtin_ia32_vpermi2varq256_mask ((__v4di) __A,
+                   (__v4di) __I
+                   /* idx */ ,
+                   (__v4di) __B,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_permutex2var_epi32 (__m128i __A, __m128i __I, __m128i __B) {
+  return (__m128i) __builtin_ia32_vpermt2vard128_mask ((__v4si) __I
+                   /* idx */ ,
+                   (__v4si) __A,
+                   (__v4si) __B,
+                   (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_permutex2var_epi32 (__m128i __A, __mmask8 __U, __m128i __I,
+           __m128i __B) {
+  return (__m128i) __builtin_ia32_vpermt2vard128_mask ((__v4si) __I
+                   /* idx */ ,
+                   (__v4si) __A,
+                   (__v4si) __B,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_permutex2var_epi32 (__mmask8 __U, __m128i __A, __m128i __I,
+            __m128i __B) {
+  return (__m128i) __builtin_ia32_vpermt2vard128_maskz ((__v4si) __I
+              /* idx */ ,
+              (__v4si) __A,
+              (__v4si) __B,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_permutex2var_epi32 (__m256i __A, __m256i __I, __m256i __B) {
+  return (__m256i) __builtin_ia32_vpermt2vard256_mask ((__v8si) __I
+                   /* idx */ ,
+                   (__v8si) __A,
+                   (__v8si) __B,
+                   (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_permutex2var_epi32 (__m256i __A, __mmask8 __U, __m256i __I,
+        __m256i __B) {
+  return (__m256i) __builtin_ia32_vpermt2vard256_mask ((__v8si) __I
+                   /* idx */ ,
+                   (__v8si) __A,
+                   (__v8si) __B,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_permutex2var_epi32 (__mmask8 __U, __m256i __A,
+         __m256i __I, __m256i __B) {
+  return (__m256i) __builtin_ia32_vpermt2vard256_maskz ((__v8si) __I
+              /* idx */ ,
+              (__v8si) __A,
+              (__v8si) __B,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_permutex2var_pd (__m128d __A, __m128i __I, __m128d __B) {
+  return (__m128d) __builtin_ia32_vpermt2varpd128_mask ((__v2di) __I
+              /* idx */ ,
+              (__v2df) __A,
+              (__v2df) __B,
+              (__mmask8) -1);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_permutex2var_pd (__m128d __A, __mmask8 __U, __m128i __I,
+        __m128d __B) {
+  return (__m128d) __builtin_ia32_vpermt2varpd128_mask ((__v2di) __I
+              /* idx */ ,
+              (__v2df) __A,
+              (__v2df) __B,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_permutex2var_pd (__mmask8 __U, __m128d __A, __m128i __I,
+         __m128d __B) {
+  return (__m128d) __builtin_ia32_vpermt2varpd128_maskz ((__v2di) __I
+               /* idx */ ,
+               (__v2df) __A,
+               (__v2df) __B,
+               (__mmask8)
+               __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_permutex2var_pd (__m256d __A, __m256i __I, __m256d __B) {
+  return (__m256d) __builtin_ia32_vpermt2varpd256_mask ((__v4di) __I
+              /* idx */ ,
+              (__v4df) __A,
+              (__v4df) __B,
+              (__mmask8) -1);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_permutex2var_pd (__m256d __A, __mmask8 __U, __m256i __I,
+           __m256d __B) {
+  return (__m256d) __builtin_ia32_vpermt2varpd256_mask ((__v4di) __I
+              /* idx */ ,
+              (__v4df) __A,
+              (__v4df) __B,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_permutex2var_pd (__mmask8 __U, __m256d __A, __m256i __I,
+            __m256d __B) {
+  return (__m256d) __builtin_ia32_vpermt2varpd256_maskz ((__v4di) __I
+               /* idx */ ,
+               (__v4df) __A,
+               (__v4df) __B,
+               (__mmask8)
+               __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_permutex2var_ps (__m128 __A, __m128i __I, __m128 __B) {
+  return (__m128) __builtin_ia32_vpermt2varps128_mask ((__v4si) __I
+                   /* idx */ ,
+                   (__v4sf) __A,
+                   (__v4sf) __B,
+                   (__mmask8) -1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_permutex2var_ps (__m128 __A, __mmask8 __U, __m128i __I,
+        __m128 __B) {
+  return (__m128) __builtin_ia32_vpermt2varps128_mask ((__v4si) __I
+                   /* idx */ ,
+                   (__v4sf) __A,
+                   (__v4sf) __B,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_permutex2var_ps (__mmask8 __U, __m128 __A, __m128i __I,
+         __m128 __B) {
+  return (__m128) __builtin_ia32_vpermt2varps128_maskz ((__v4si) __I
+              /* idx */ ,
+              (__v4sf) __A,
+              (__v4sf) __B,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_permutex2var_ps (__m256 __A, __m256i __I, __m256 __B) {
+  return (__m256) __builtin_ia32_vpermt2varps256_mask ((__v8si) __I
+                   /* idx */ ,
+                   (__v8sf) __A,
+                   (__v8sf) __B,
+                   (__mmask8) -1);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_permutex2var_ps (__m256 __A, __mmask8 __U, __m256i __I,
+           __m256 __B) {
+  return (__m256) __builtin_ia32_vpermt2varps256_mask ((__v8si) __I
+                   /* idx */ ,
+                   (__v8sf) __A,
+                   (__v8sf) __B,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_permutex2var_ps (__mmask8 __U, __m256 __A, __m256i __I,
+            __m256 __B) {
+  return (__m256) __builtin_ia32_vpermt2varps256_maskz ((__v8si) __I
+              /* idx */ ,
+              (__v8sf) __A,
+              (__v8sf) __B,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_permutex2var_epi64 (__m128i __A, __m128i __I, __m128i __B) {
+  return (__m128i) __builtin_ia32_vpermt2varq128_mask ((__v2di) __I
+                   /* idx */ ,
+                   (__v2di) __A,
+                   (__v2di) __B,
+                   (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_permutex2var_epi64 (__m128i __A, __mmask8 __U, __m128i __I,
+           __m128i __B) {
+  return (__m128i) __builtin_ia32_vpermt2varq128_mask ((__v2di) __I
+                   /* idx */ ,
+                   (__v2di) __A,
+                   (__v2di) __B,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_permutex2var_epi64 (__mmask8 __U, __m128i __A, __m128i __I,
+            __m128i __B) {
+  return (__m128i) __builtin_ia32_vpermt2varq128_maskz ((__v2di) __I
+              /* idx */ ,
+              (__v2di) __A,
+              (__v2di) __B,
+              (__mmask8)
+              __U);
+}
+
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_permutex2var_epi64 (__m256i __A, __m256i __I, __m256i __B) {
+  return (__m256i) __builtin_ia32_vpermt2varq256_mask ((__v4di) __I
+                   /* idx */ ,
+                   (__v4di) __A,
+                   (__v4di) __B,
+                   (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_permutex2var_epi64 (__m256i __A, __mmask8 __U, __m256i __I,
+        __m256i __B) {
+  return (__m256i) __builtin_ia32_vpermt2varq256_mask ((__v4di) __I
+                   /* idx */ ,
+                   (__v4di) __A,
+                   (__v4di) __B,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_permutex2var_epi64 (__mmask8 __U, __m256i __A,
+         __m256i __I, __m256i __B) {
+  return (__m256i) __builtin_ia32_vpermt2varq256_maskz ((__v4di) __I
+              /* idx */ ,
+              (__v4di) __A,
+              (__v4di) __B,
+              (__mmask8)
+              __U);
+}
+
+#undef __DEFAULT_FN_ATTRS
+#undef __DEFAULT_FN_ATTRS_BOTH
+
+#endif /* __AVX512VLINTRIN_H */
diff --git a/25.0.2/clang-include/avxintrin.h b/25.0.2/clang-include/avxintrin.h
new file mode 100644
index 0000000..6d1ca54
--- /dev/null
+++ b/25.0.2/clang-include/avxintrin.h
@@ -0,0 +1,1318 @@
+/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVXINTRIN_H
+#define __AVXINTRIN_H
+
+typedef double __v4df __attribute__ ((__vector_size__ (32)));
+typedef float __v8sf __attribute__ ((__vector_size__ (32)));
+typedef long long __v4di __attribute__ ((__vector_size__ (32)));
+typedef int __v8si __attribute__ ((__vector_size__ (32)));
+typedef short __v16hi __attribute__ ((__vector_size__ (32)));
+typedef char __v32qi __attribute__ ((__vector_size__ (32)));
+
+/* We need an explicitly signed variant for char. Note that this shouldn't
+ * appear in the interface though. */
+typedef signed char __v32qs __attribute__((__vector_size__(32)));
+
+typedef float __m256 __attribute__ ((__vector_size__ (32)));
+typedef double __m256d __attribute__((__vector_size__(32)));
+typedef long long __m256i __attribute__((__vector_size__(32)));
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx")))
+
+/* Arithmetic */
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_add_pd(__m256d __a, __m256d __b)
+{
+  return __a+__b;
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_add_ps(__m256 __a, __m256 __b)
+{
+  return __a+__b;
+}
+
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_sub_pd(__m256d __a, __m256d __b)
+{
+  return __a-__b;
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_sub_ps(__m256 __a, __m256 __b)
+{
+  return __a-__b;
+}
+
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_addsub_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b);
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_addsub_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b);
+}
+
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_div_pd(__m256d __a, __m256d __b)
+{
+  return __a / __b;
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_div_ps(__m256 __a, __m256 __b)
+{
+  return __a / __b;
+}
+
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_max_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b);
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_max_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b);
+}
+
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_min_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b);
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_min_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b);
+}
+
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_mul_pd(__m256d __a, __m256d __b)
+{
+  return __a * __b;
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_mul_ps(__m256 __a, __m256 __b)
+{
+  return __a * __b;
+}
+
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_sqrt_pd(__m256d __a)
+{
+  return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_sqrt_ps(__m256 __a)
+{
+  return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_rsqrt_ps(__m256 __a)
+{
+  return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a);
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_rcp_ps(__m256 __a)
+{
+  return (__m256)__builtin_ia32_rcpps256((__v8sf)__a);
+}
+
+#define _mm256_round_pd(V, M) __extension__ ({ \
+    (__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)); })
+
+#define _mm256_round_ps(V, M) __extension__ ({ \
+  (__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)); })
+
+#define _mm256_ceil_pd(V)  _mm256_round_pd((V), _MM_FROUND_CEIL)
+#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)
+#define _mm256_ceil_ps(V)  _mm256_round_ps((V), _MM_FROUND_CEIL)
+#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)
+
+/* Logical */
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_and_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)((__v4di)__a & (__v4di)__b);
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_and_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)((__v8si)__a & (__v8si)__b);
+}
+
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_andnot_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)(~(__v4di)__a & (__v4di)__b);
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_andnot_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)(~(__v8si)__a & (__v8si)__b);
+}
+
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_or_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)((__v4di)__a | (__v4di)__b);
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_or_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)((__v8si)__a | (__v8si)__b);
+}
+
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_xor_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)((__v4di)__a ^ (__v4di)__b);
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_xor_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)((__v8si)__a ^ (__v8si)__b);
+}
+
+/* Horizontal arithmetic */
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_hadd_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b);
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_hadd_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
+}
+
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_hsub_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_hsub_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b);
+}
+
+/* Vector permutations */
+static __inline __m128d __DEFAULT_FN_ATTRS
+_mm_permutevar_pd(__m128d __a, __m128i __c)
+{
+  return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c);
+}
+
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_permutevar_pd(__m256d __a, __m256i __c)
+{
+  return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c);
+}
+
+static __inline __m128 __DEFAULT_FN_ATTRS
+_mm_permutevar_ps(__m128 __a, __m128i __c)
+{
+  return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c);
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_permutevar_ps(__m256 __a, __m256i __c)
+{
+  return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c);
+}
+
+#define _mm_permute_pd(A, C) __extension__ ({ \
+  (__m128d)__builtin_shufflevector((__v2df)(__m128d)(A), \
+                                   (__v2df)_mm_setzero_pd(), \
+                                   (C) & 0x1, ((C) & 0x2) >> 1); })
+
+#define _mm256_permute_pd(A, C) __extension__ ({ \
+  (__m256d)__builtin_shufflevector((__v4df)(__m256d)(A), \
+                                   (__v4df)_mm256_setzero_pd(), \
+                                   (C) & 0x1, ((C) & 0x2) >> 1, \
+                                   2 + (((C) & 0x4) >> 2), \
+                                   2 + (((C) & 0x8) >> 3)); })
+
+#define _mm_permute_ps(A, C) __extension__ ({ \
+  (__m128)__builtin_shufflevector((__v4sf)(__m128)(A), \
+                                  (__v4sf)_mm_setzero_ps(), \
+                                   (C) & 0x3, ((C) & 0xc) >> 2, \
+                                   ((C) & 0x30) >> 4, ((C) & 0xc0) >> 6); })
+
+#define _mm256_permute_ps(A, C) __extension__ ({ \
+  (__m256)__builtin_shufflevector((__v8sf)(__m256)(A), \
+                                  (__v8sf)_mm256_setzero_ps(), \
+                                  (C) & 0x3, ((C) & 0xc) >> 2, \
+                                  ((C) & 0x30) >> 4, ((C) & 0xc0) >> 6, \
+                                  4 + (((C) & 0x03) >> 0), \
+                                  4 + (((C) & 0x0c) >> 2), \
+                                  4 + (((C) & 0x30) >> 4), \
+                                  4 + (((C) & 0xc0) >> 6)); })
+
+#define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ \
+  (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \
+                                           (__v4df)(__m256d)(V2), (M)); })
+
+#define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \
+  (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
+                                          (__v8sf)(__m256)(V2), (M)); })
+
+#define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \
+  (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
+                                           (__v8si)(__m256i)(V2), (M)); })
+
+/* Vector Blend */
+#define _mm256_blend_pd(V1, V2, M) __extension__ ({ \
+  (__m256d)__builtin_shufflevector((__v4df)(__m256d)(V1), \
+                                   (__v4df)(__m256d)(V2), \
+                                   (((M) & 0x01) ? 4 : 0), \
+                                   (((M) & 0x02) ? 5 : 1), \
+                                   (((M) & 0x04) ? 6 : 2), \
+                                   (((M) & 0x08) ? 7 : 3)); })
+
+#define _mm256_blend_ps(V1, V2, M) __extension__ ({ \
+  (__m256)__builtin_shufflevector((__v8sf)(__m256)(V1), \
+                                  (__v8sf)(__m256)(V2), \
+                                  (((M) & 0x01) ?  8 : 0), \
+                                  (((M) & 0x02) ?  9 : 1), \
+                                  (((M) & 0x04) ? 10 : 2), \
+                                  (((M) & 0x08) ? 11 : 3), \
+                                  (((M) & 0x10) ? 12 : 4), \
+                                  (((M) & 0x20) ? 13 : 5), \
+                                  (((M) & 0x40) ? 14 : 6), \
+                                  (((M) & 0x80) ? 15 : 7)); })
+
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
+{
+  return (__m256d)__builtin_ia32_blendvpd256(
+    (__v4df)__a, (__v4df)__b, (__v4df)__c);
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
+{
+  return (__m256)__builtin_ia32_blendvps256(
+    (__v8sf)__a, (__v8sf)__b, (__v8sf)__c);
+}
+
+/* Vector Dot Product */
+#define _mm256_dp_ps(V1, V2, M) __extension__ ({ \
+  (__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \
+                                 (__v8sf)(__m256)(V2), (M)); })
+
+/* Vector shuffle */
+#define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \
+        (__m256)__builtin_shufflevector((__v8sf)(__m256)(a), \
+                                        (__v8sf)(__m256)(b), \
+                                        (mask) & 0x3, \
+                                        ((mask) & 0xc) >> 2, \
+                                        (((mask) & 0x30) >> 4) + 8, \
+                                        (((mask) & 0xc0) >> 6) + 8, \
+                                        ((mask) & 0x3) + 4, \
+                                        (((mask) & 0xc) >> 2) + 4, \
+                                        (((mask) & 0x30) >> 4) + 12, \
+                                        (((mask) & 0xc0) >> 6) + 12); })
+
+#define _mm256_shuffle_pd(a, b, mask) __extension__ ({ \
+        (__m256d)__builtin_shufflevector((__v4df)(__m256d)(a), \
+                                         (__v4df)(__m256d)(b), \
+                                         (mask) & 0x1, \
+                                         (((mask) & 0x2) >> 1) + 4, \
+                                         (((mask) & 0x4) >> 2) + 2, \
+                                         (((mask) & 0x8) >> 3) + 6); })
+
+/* Compare */
+#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
+#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling)  */
+#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling)  */
+#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling)  */
+#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling)  */
+#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling)  */
+#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling)  */
+#define _CMP_ORD_Q    0x07 /* Ordered (non-signaling)  */
+#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling)  */
+#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unord, signaling)  */
+#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling)  */
+#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling)  */
+#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling)  */
+#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling)  */
+#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling)  */
+#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling)  */
+#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling)  */
+#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling)  */
+#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling)  */
+#define _CMP_UNORD_S  0x13 /* Unordered (signaling)  */
+#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling)  */
+#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling)  */
+#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unord, non-signaling)  */
+#define _CMP_ORD_S    0x17 /* Ordered (signaling)  */
+#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling)  */
+#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unord, non-sign)  */
+#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling)  */
+#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling)  */
+#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling)  */
+#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling)  */
+#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling)  */
+#define _CMP_TRUE_US  0x1f /* True (unordered, signaling)  */
+
+#define _mm_cmp_pd(a, b, c) __extension__ ({ \
+  (__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \
+                                (__v2df)(__m128d)(b), (c)); })
+
+#define _mm_cmp_ps(a, b, c) __extension__ ({ \
+  (__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \
+                               (__v4sf)(__m128)(b), (c)); })
+
+#define _mm256_cmp_pd(a, b, c) __extension__ ({ \
+  (__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
+                                   (__v4df)(__m256d)(b), (c)); })
+
+#define _mm256_cmp_ps(a, b, c) __extension__ ({ \
+  (__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
+                                  (__v8sf)(__m256)(b), (c)); })
+
+#define _mm_cmp_sd(a, b, c) __extension__ ({ \
+  (__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \
+                                (__v2df)(__m128d)(b), (c)); })
+
+#define _mm_cmp_ss(a, b, c) __extension__ ({ \
+  (__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \
+                               (__v4sf)(__m128)(b), (c)); })
+
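+/* Illustrative sketch, not part of the upstream header: the _CMP_* predicates
+ * above are intended to be passed as the constant third argument of the cmp
+ * macros.  The helper name below is hypothetical.
+ */
+static __inline __m256d __DEFAULT_FN_ATTRS
+__example_cmplt_pd(__m256d __a, __m256d __b)
+{
+  /* Ordered, non-signaling less-than compare of each double lane. */
+  return _mm256_cmp_pd(__a, __b, _CMP_LT_OQ);
+}
+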
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_extract_epi32(__m256i __a, const int __imm)
+{
+  __v8si __b = (__v8si)__a;
+  return __b[__imm & 7];
+}
+
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_extract_epi16(__m256i __a, const int __imm)
+{
+  __v16hi __b = (__v16hi)__a;
+  return __b[__imm & 15];
+}
+
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_extract_epi8(__m256i __a, const int __imm)
+{
+  __v32qi __b = (__v32qi)__a;
+  return __b[__imm & 31];
+}
+
+#ifdef __x86_64__
+static __inline long long  __DEFAULT_FN_ATTRS
+_mm256_extract_epi64(__m256i __a, const int __imm)
+{
+  __v4di __b = (__v4di)__a;
+  return __b[__imm & 3];
+}
+#endif
+
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_insert_epi32(__m256i __a, int __b, int const __imm)
+{
+  __v8si __c = (__v8si)__a;
+  __c[__imm & 7] = __b;
+  return (__m256i)__c;
+}
+
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_insert_epi16(__m256i __a, int __b, int const __imm)
+{
+  __v16hi __c = (__v16hi)__a;
+  __c[__imm & 15] = __b;
+  return (__m256i)__c;
+}
+
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_insert_epi8(__m256i __a, int __b, int const __imm)
+{
+  __v32qi __c = (__v32qi)__a;
+  __c[__imm & 31] = __b;
+  return (__m256i)__c;
+}
+
+#ifdef __x86_64__
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_insert_epi64(__m256i __a, long long __b, int const __imm)
+{
+  __v4di __c = (__v4di)__a;
+  __c[__imm & 3] = __b;
+  return (__m256i)__c;
+}
+#endif
+
+/* Conversion */
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_cvtepi32_pd(__m128i __a)
+{
+  return (__m256d)__builtin_ia32_cvtdq2pd256((__v4si) __a);
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_cvtepi32_ps(__m256i __a)
+{
+  return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) __a);
+}
+
+static __inline __m128 __DEFAULT_FN_ATTRS
+_mm256_cvtpd_ps(__m256d __a)
+{
+  return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a);
+}
+
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtps_epi32(__m256 __a)
+{
+  return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a);
+}
+
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_cvtps_pd(__m128 __a)
+{
+  return (__m256d)__builtin_ia32_cvtps2pd256((__v4sf) __a);
+}
+
+static __inline __m128i __DEFAULT_FN_ATTRS
+_mm256_cvttpd_epi32(__m256d __a)
+{
+  return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a);
+}
+
+static __inline __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtpd_epi32(__m256d __a)
+{
+  return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);
+}
+
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_cvttps_epi32(__m256 __a)
+{
+  return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a);
+}
+
+/* Vector replicate */
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_movehdup_ps(__m256 __a)
+{
+  return __builtin_shufflevector(__a, __a, 1, 1, 3, 3, 5, 5, 7, 7);
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_moveldup_ps(__m256 __a)
+{
+  return __builtin_shufflevector(__a, __a, 0, 0, 2, 2, 4, 4, 6, 6);
+}
+
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_movedup_pd(__m256d __a)
+{
+  return __builtin_shufflevector(__a, __a, 0, 0, 2, 2);
+}
+
+/* Unpack and Interleave */
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_unpackhi_pd(__m256d __a, __m256d __b)
+{
+  return __builtin_shufflevector(__a, __b, 1, 5, 1+2, 5+2);
+}
+
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_unpacklo_pd(__m256d __a, __m256d __b)
+{
+  return __builtin_shufflevector(__a, __b, 0, 4, 0+2, 4+2);
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_unpackhi_ps(__m256 __a, __m256 __b)
+{
+  return __builtin_shufflevector(__a, __b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_unpacklo_ps(__m256 __a, __m256 __b)
+{
+  return __builtin_shufflevector(__a, __b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
+}
+
+/* Bit Test */
+static __inline int __DEFAULT_FN_ATTRS
+_mm_testz_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b);
+}
+
+static __inline int __DEFAULT_FN_ATTRS
+_mm_testc_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b);
+}
+
+static __inline int __DEFAULT_FN_ATTRS
+_mm_testnzc_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b);
+}
+
+static __inline int __DEFAULT_FN_ATTRS
+_mm_testz_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b);
+}
+
+static __inline int __DEFAULT_FN_ATTRS
+_mm_testc_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b);
+}
+
+static __inline int __DEFAULT_FN_ATTRS
+_mm_testnzc_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b);
+}
+
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_testz_pd(__m256d __a, __m256d __b)
+{
+  return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b);
+}
+
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_testc_pd(__m256d __a, __m256d __b)
+{
+  return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b);
+}
+
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_testnzc_pd(__m256d __a, __m256d __b)
+{
+  return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b);
+}
+
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_testz_ps(__m256 __a, __m256 __b)
+{
+  return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b);
+}
+
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_testc_ps(__m256 __a, __m256 __b)
+{
+  return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b);
+}
+
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_testnzc_ps(__m256 __a, __m256 __b)
+{
+  return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b);
+}
+
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_testz_si256(__m256i __a, __m256i __b)
+{
+  return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b);
+}
+
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_testc_si256(__m256i __a, __m256i __b)
+{
+  return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b);
+}
+
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_testnzc_si256(__m256i __a, __m256i __b)
+{
+  return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b);
+}
+
+/* Vector extract sign mask */
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_movemask_pd(__m256d __a)
+{
+  return __builtin_ia32_movmskpd256((__v4df)__a);
+}
+
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_movemask_ps(__m256 __a)
+{
+  return __builtin_ia32_movmskps256((__v8sf)__a);
+}
+
+/* Vector zero */
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_zeroall(void)
+{
+  __builtin_ia32_vzeroall();
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_zeroupper(void)
+{
+  __builtin_ia32_vzeroupper();
+}
+
+/* Vector load with broadcast */
+static __inline __m128 __DEFAULT_FN_ATTRS
+_mm_broadcast_ss(float const *__a)
+{
+  float __f = *__a;
+  return (__m128)(__v4sf){ __f, __f, __f, __f };
+}
+
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_broadcast_sd(double const *__a)
+{
+  double __d = *__a;
+  return (__m256d)(__v4df){ __d, __d, __d, __d };
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_broadcast_ss(float const *__a)
+{
+  float __f = *__a;
+  return (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };
+}
+
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_broadcast_pd(__m128d const *__a)
+{
+  return (__m256d)__builtin_ia32_vbroadcastf128_pd256(__a);
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_broadcast_ps(__m128 const *__a)
+{
+  return (__m256)__builtin_ia32_vbroadcastf128_ps256(__a);
+}
+
+/* SIMD load ops */
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_load_pd(double const *__p)
+{
+  return *(__m256d *)__p;
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_load_ps(float const *__p)
+{
+  return *(__m256 *)__p;
+}
+
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_loadu_pd(double const *__p)
+{
+  struct __loadu_pd {
+    __m256d __v;
+  } __attribute__((__packed__, __may_alias__));
+  return ((struct __loadu_pd*)__p)->__v;
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_loadu_ps(float const *__p)
+{
+  struct __loadu_ps {
+    __m256 __v;
+  } __attribute__((__packed__, __may_alias__));
+  return ((struct __loadu_ps*)__p)->__v;
+}
+
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_load_si256(__m256i const *__p)
+{
+  return *__p;
+}
+
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_loadu_si256(__m256i const *__p)
+{
+  struct __loadu_si256 {
+    __m256i __v;
+  } __attribute__((__packed__, __may_alias__));
+  return ((struct __loadu_si256*)__p)->__v;
+}
+
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_lddqu_si256(__m256i const *__p)
+{
+  return (__m256i)__builtin_ia32_lddqu256((char const *)__p);
+}
+
+/* SIMD store ops */
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_store_pd(double *__p, __m256d __a)
+{
+  *(__m256d *)__p = __a;
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_store_ps(float *__p, __m256 __a)
+{
+  *(__m256 *)__p = __a;
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_storeu_pd(double *__p, __m256d __a)
+{
+  __builtin_ia32_storeupd256(__p, (__v4df)__a);
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_storeu_ps(float *__p, __m256 __a)
+{
+  __builtin_ia32_storeups256(__p, (__v8sf)__a);
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_store_si256(__m256i *__p, __m256i __a)
+{
+  *__p = __a;
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_storeu_si256(__m256i *__p, __m256i __a)
+{
+  __builtin_ia32_storedqu256((char *)__p, (__v32qi)__a);
+}
+
+/* Conditional load ops */
+static __inline __m128d __DEFAULT_FN_ATTRS
+_mm_maskload_pd(double const *__p, __m128i __m)
+{
+  return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m);
+}
+
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_maskload_pd(double const *__p, __m256i __m)
+{
+  return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p,
+                                               (__v4di)__m);
+}
+
+static __inline __m128 __DEFAULT_FN_ATTRS
+_mm_maskload_ps(float const *__p, __m128i __m)
+{
+  return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m);
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_maskload_ps(float const *__p, __m256i __m)
+{
+  return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8si)__m);
+}
+
+/* Conditional store ops */
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
+{
+  __builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a);
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
+{
+  __builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a);
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
+{
+  __builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a);
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
+{
+  __builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a);
+}
+
+/* Cacheability support ops */
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_stream_si256(__m256i *__a, __m256i __b)
+{
+  __builtin_ia32_movntdq256((__v4di *)__a, (__v4di)__b);
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_stream_pd(double *__a, __m256d __b)
+{
+  __builtin_ia32_movntpd256(__a, (__v4df)__b);
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_stream_ps(float *__p, __m256 __a)
+{
+  __builtin_ia32_movntps256(__p, (__v8sf)__a);
+}
+
+/* Create vectors */
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_undefined_pd()
+{
+  return (__m256d)__builtin_ia32_undef256();
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_undefined_ps()
+{
+  return (__m256)__builtin_ia32_undef256();
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_undefined_si256()
+{
+  return (__m256i)__builtin_ia32_undef256();
+}
+
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_set_pd(double __a, double __b, double __c, double __d)
+{
+  return (__m256d){ __d, __c, __b, __a };
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_set_ps(float __a, float __b, float __c, float __d,
+              float __e, float __f, float __g, float __h)
+{
+  return (__m256){ __h, __g, __f, __e, __d, __c, __b, __a };
+}
+
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
+                 int __i4, int __i5, int __i6, int __i7)
+{
+  return (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 };
+}
+
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
+                 short __w11, short __w10, short __w09, short __w08,
+                 short __w07, short __w06, short __w05, short __w04,
+                 short __w03, short __w02, short __w01, short __w00)
+{
+  return (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06,
+    __w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 };
+}
+
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
+                char __b27, char __b26, char __b25, char __b24,
+                char __b23, char __b22, char __b21, char __b20,
+                char __b19, char __b18, char __b17, char __b16,
+                char __b15, char __b14, char __b13, char __b12,
+                char __b11, char __b10, char __b09, char __b08,
+                char __b07, char __b06, char __b05, char __b04,
+                char __b03, char __b02, char __b01, char __b00)
+{
+  return (__m256i)(__v32qi){
+    __b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
+    __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
+    __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
+    __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31
+  };
+}
+
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
+{
+  return (__m256i)(__v4di){ __d, __c, __b, __a };
+}
+
+/* Create vectors with elements in reverse order */
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_setr_pd(double __a, double __b, double __c, double __d)
+{
+  return (__m256d){ __a, __b, __c, __d };
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_setr_ps(float __a, float __b, float __c, float __d,
+               float __e, float __f, float __g, float __h)
+{
+  return (__m256){ __a, __b, __c, __d, __e, __f, __g, __h };
+}
+
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
+                  int __i4, int __i5, int __i6, int __i7)
+{
+  return (__m256i)(__v8si){ __i0, __i1, __i2, __i3, __i4, __i5, __i6, __i7 };
+}
+
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
+       short __w11, short __w10, short __w09, short __w08,
+       short __w07, short __w06, short __w05, short __w04,
+       short __w03, short __w02, short __w01, short __w00)
+{
+  return (__m256i)(__v16hi){ __w15, __w14, __w13, __w12, __w11, __w10, __w09,
+    __w08, __w07, __w06, __w05, __w04, __w03, __w02, __w01, __w00 };
+}
+
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
+                 char __b27, char __b26, char __b25, char __b24,
+                 char __b23, char __b22, char __b21, char __b20,
+                 char __b19, char __b18, char __b17, char __b16,
+                 char __b15, char __b14, char __b13, char __b12,
+                 char __b11, char __b10, char __b09, char __b08,
+                 char __b07, char __b06, char __b05, char __b04,
+                 char __b03, char __b02, char __b01, char __b00)
+{
+  return (__m256i)(__v32qi){
+    __b31, __b30, __b29, __b28, __b27, __b26, __b25, __b24,
+    __b23, __b22, __b21, __b20, __b19, __b18, __b17, __b16,
+    __b15, __b14, __b13, __b12, __b11, __b10, __b09, __b08,
+    __b07, __b06, __b05, __b04, __b03, __b02, __b01, __b00 };
+}
+
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
+{
+  return (__m256i)(__v4di){ __a, __b, __c, __d };
+}
+
+/* Create vectors with repeated elements */
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_set1_pd(double __w)
+{
+  return (__m256d){ __w, __w, __w, __w };
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_set1_ps(float __w)
+{
+  return (__m256){ __w, __w, __w, __w, __w, __w, __w, __w };
+}
+
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_set1_epi32(int __i)
+{
+  return (__m256i)(__v8si){ __i, __i, __i, __i, __i, __i, __i, __i };
+}
+
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_set1_epi16(short __w)
+{
+  return (__m256i)(__v16hi){ __w, __w, __w, __w, __w, __w, __w, __w, __w, __w,
+    __w, __w, __w, __w, __w, __w };
+}
+
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_set1_epi8(char __b)
+{
+  return (__m256i)(__v32qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
+    __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
+    __b, __b, __b, __b, __b, __b, __b };
+}
+
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_set1_epi64x(long long __q)
+{
+  return (__m256i)(__v4di){ __q, __q, __q, __q };
+}
+
+/* Create zeroed vectors */
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_setzero_pd(void)
+{
+  return (__m256d){ 0, 0, 0, 0 };
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_setzero_ps(void)
+{
+  return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
+}
+
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_setzero_si256(void)
+{
+  return (__m256i){ 0LL, 0LL, 0LL, 0LL };
+}
+
+/* Cast between vector types */
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_castpd_ps(__m256d __a)
+{
+  return (__m256)__a;
+}
+
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_castpd_si256(__m256d __a)
+{
+  return (__m256i)__a;
+}
+
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_castps_pd(__m256 __a)
+{
+  return (__m256d)__a;
+}
+
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_castps_si256(__m256 __a)
+{
+  return (__m256i)__a;
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_castsi256_ps(__m256i __a)
+{
+  return (__m256)__a;
+}
+
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_castsi256_pd(__m256i __a)
+{
+  return (__m256d)__a;
+}
+
+static __inline __m128d __DEFAULT_FN_ATTRS
+_mm256_castpd256_pd128(__m256d __a)
+{
+  return __builtin_shufflevector(__a, __a, 0, 1);
+}
+
+static __inline __m128 __DEFAULT_FN_ATTRS
+_mm256_castps256_ps128(__m256 __a)
+{
+  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3);
+}
+
+static __inline __m128i __DEFAULT_FN_ATTRS
+_mm256_castsi256_si128(__m256i __a)
+{
+  return __builtin_shufflevector(__a, __a, 0, 1);
+}
+
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_castpd128_pd256(__m128d __a)
+{
+  return __builtin_shufflevector(__a, __a, 0, 1, -1, -1);
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_castps128_ps256(__m128 __a)
+{
+  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, -1, -1, -1, -1);
+}
+
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_castsi128_si256(__m128i __a)
+{
+  return __builtin_shufflevector(__a, __a, 0, 1, -1, -1);
+}
+
+/*
+   Vector insert.
+   We use macros rather than inlines because we only want to accept
+   invocations where the immediate M is a constant expression.
+*/
+#define _mm256_insertf128_ps(V1, V2, M) __extension__ ({ \
+  (__m256)__builtin_shufflevector( \
+    (__v8sf)(__m256)(V1), \
+    (__v8sf)_mm256_castps128_ps256((__m128)(V2)), \
+    (((M) & 1) ?  0 :  8), \
+    (((M) & 1) ?  1 :  9), \
+    (((M) & 1) ?  2 : 10), \
+    (((M) & 1) ?  3 : 11), \
+    (((M) & 1) ?  8 :  4), \
+    (((M) & 1) ?  9 :  5), \
+    (((M) & 1) ? 10 :  6), \
+    (((M) & 1) ? 11 :  7) );})
+
+#define _mm256_insertf128_pd(V1, V2, M) __extension__ ({ \
+  (__m256d)__builtin_shufflevector( \
+    (__v4df)(__m256d)(V1), \
+    (__v4df)_mm256_castpd128_pd256((__m128d)(V2)), \
+    (((M) & 1) ? 0 : 4), \
+    (((M) & 1) ? 1 : 5), \
+    (((M) & 1) ? 4 : 2), \
+    (((M) & 1) ? 5 : 3) );})
+
+#define _mm256_insertf128_si256(V1, V2, M) __extension__ ({ \
+  (__m256i)__builtin_shufflevector( \
+    (__v4di)(__m256i)(V1), \
+    (__v4di)_mm256_castsi128_si256((__m128i)(V2)), \
+    (((M) & 1) ? 0 : 4), \
+    (((M) & 1) ? 1 : 5), \
+    (((M) & 1) ? 4 : 2), \
+    (((M) & 1) ? 5 : 3) );})
+
+/*
+   Vector extract.
+   We use macros rather than inlines because we only want to accept
+   invocations where the immediate M is a constant expression.
+*/
+#define _mm256_extractf128_ps(V, M) __extension__ ({ \
+  (__m128)__builtin_shufflevector( \
+    (__v8sf)(__m256)(V), \
+    (__v8sf)(_mm256_setzero_ps()), \
+    (((M) & 1) ? 4 : 0), \
+    (((M) & 1) ? 5 : 1), \
+    (((M) & 1) ? 6 : 2), \
+    (((M) & 1) ? 7 : 3) );})
+
+#define _mm256_extractf128_pd(V, M) __extension__ ({ \
+  (__m128d)__builtin_shufflevector( \
+    (__v4df)(__m256d)(V), \
+    (__v4df)(_mm256_setzero_pd()), \
+    (((M) & 1) ? 2 : 0), \
+    (((M) & 1) ? 3 : 1) );})
+
+#define _mm256_extractf128_si256(V, M) __extension__ ({ \
+  (__m128i)__builtin_shufflevector( \
+    (__v4di)(__m256i)(V), \
+    (__v4di)(_mm256_setzero_si256()), \
+    (((M) & 1) ? 2 : 0), \
+    (((M) & 1) ? 3 : 1) );})
+
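+/* Illustrative sketch, not part of the upstream header: because the insert and
+ * extract operations above are macros, the lane selector must be an integer
+ * constant expression.  The hypothetical helper below swaps the two 128-bit
+ * halves of a vector using constant lane indices.
+ */
+static __inline __m256 __DEFAULT_FN_ATTRS
+__example_swap_halves_ps(__m256 __a)
+{
+  __m128 __hi = _mm256_extractf128_ps(__a, 1);
+  __m128 __lo = _mm256_castps256_ps128(__a);
+  /* Place the old high half in the low lane, then insert the old low half
+     into the high lane. */
+  return _mm256_insertf128_ps(_mm256_castps128_ps256(__hi), __lo, 1);
+}
+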
+/* SIMD load ops (unaligned) */
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
+{
+  struct __loadu_ps {
+    __m128 __v;
+  } __attribute__((__packed__, __may_alias__));
+
+  __m256 __v256 = _mm256_castps128_ps256(((struct __loadu_ps*)__addr_lo)->__v);
+  return _mm256_insertf128_ps(__v256, ((struct __loadu_ps*)__addr_hi)->__v, 1);
+}
+
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
+{
+  struct __loadu_pd {
+    __m128d __v;
+  } __attribute__((__packed__, __may_alias__));
+
+  __m256d __v256 = _mm256_castpd128_pd256(((struct __loadu_pd*)__addr_lo)->__v);
+  return _mm256_insertf128_pd(__v256, ((struct __loadu_pd*)__addr_hi)->__v, 1);
+}
+
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_loadu2_m128i(__m128i const *__addr_hi, __m128i const *__addr_lo)
+{
+  struct __loadu_si128 {
+    __m128i __v;
+  } __attribute__((__packed__, __may_alias__));
+  __m256i __v256 = _mm256_castsi128_si256(
+    ((struct __loadu_si128*)__addr_lo)->__v);
+  return _mm256_insertf128_si256(__v256,
+                                 ((struct __loadu_si128*)__addr_hi)->__v, 1);
+}
+
+/* SIMD store ops (unaligned) */
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
+{
+  __m128 __v128;
+
+  __v128 = _mm256_castps256_ps128(__a);
+  __builtin_ia32_storeups(__addr_lo, __v128);
+  __v128 = _mm256_extractf128_ps(__a, 1);
+  __builtin_ia32_storeups(__addr_hi, __v128);
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
+{
+  __m128d __v128;
+
+  __v128 = _mm256_castpd256_pd128(__a);
+  __builtin_ia32_storeupd(__addr_lo, __v128);
+  __v128 = _mm256_extractf128_pd(__a, 1);
+  __builtin_ia32_storeupd(__addr_hi, __v128);
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_storeu2_m128i(__m128i *__addr_hi, __m128i *__addr_lo, __m256i __a)
+{
+  __m128i __v128;
+
+  __v128 = _mm256_castsi256_si128(__a);
+  __builtin_ia32_storedqu((char *)__addr_lo, (__v16qi)__v128);
+  __v128 = _mm256_extractf128_si256(__a, 1);
+  __builtin_ia32_storedqu((char *)__addr_hi, (__v16qi)__v128);
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_set_m128 (__m128 __hi, __m128 __lo) {
+  return (__m256) __builtin_shufflevector(__lo, __hi, 0, 1, 2, 3, 4, 5, 6, 7);
+}
+
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_set_m128d (__m128d __hi, __m128d __lo) {
+  return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo);
+}
+
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_set_m128i (__m128i __hi, __m128i __lo) {
+  return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo);
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_setr_m128 (__m128 __lo, __m128 __hi) {
+  return _mm256_set_m128(__hi, __lo);
+}
+
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_setr_m128d (__m128d __lo, __m128d __hi) {
+  return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo);
+}
+
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_setr_m128i (__m128i __lo, __m128i __hi) {
+  return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __AVXINTRIN_H */
diff --git a/25.0.2/clang-include/bmi2intrin.h b/25.0.2/clang-include/bmi2intrin.h
new file mode 100644
index 0000000..fdae82c
--- /dev/null
+++ b/25.0.2/clang-include/bmi2intrin.h
@@ -0,0 +1,95 @@
+/*===---- bmi2intrin.h - BMI2 intrinsics -----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
+#error "Never use <bmi2intrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __BMI2INTRIN_H
+#define __BMI2INTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("bmi2")))
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_bzhi_u32(unsigned int __X, unsigned int __Y)
+{
+  return __builtin_ia32_bzhi_si(__X, __Y);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_pdep_u32(unsigned int __X, unsigned int __Y)
+{
+  return __builtin_ia32_pdep_si(__X, __Y);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_pext_u32(unsigned int __X, unsigned int __Y)
+{
+  return __builtin_ia32_pext_si(__X, __Y);
+}
+
+#ifdef  __x86_64__
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_bzhi_u64(unsigned long long __X, unsigned long long __Y)
+{
+  return __builtin_ia32_bzhi_di(__X, __Y);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_pdep_u64(unsigned long long __X, unsigned long long __Y)
+{
+  return __builtin_ia32_pdep_di(__X, __Y);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_pext_u64(unsigned long long __X, unsigned long long __Y)
+{
+  return __builtin_ia32_pext_di(__X, __Y);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_mulx_u64 (unsigned long long __X, unsigned long long __Y,
+	   unsigned long long *__P)
+{
+  unsigned __int128 __res = (unsigned __int128) __X * __Y;
+  *__P = (unsigned long long) (__res >> 64);
+  return (unsigned long long) __res;
+}
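+
+/* Illustrative sketch, not part of the upstream header: _mulx_u64 yields the
+ * low 64 bits of the full 128-bit product and writes the high 64 bits through
+ * its pointer argument.  The helper name below is hypothetical.
+ */
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__example_mulhi_u64(unsigned long long __X, unsigned long long __Y)
+{
+  unsigned long long __hi;
+  (void)_mulx_u64(__X, __Y, &__hi);
+  return __hi;
+}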
+
+#else /* !__x86_64__ */
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_mulx_u32 (unsigned int __X, unsigned int __Y, unsigned int *__P)
+{
+  unsigned long long __res = (unsigned long long) __X * __Y;
+  *__P = (unsigned int) (__res >> 32);
+  return (unsigned int) __res;
+}
+
+#endif /* !__x86_64__  */
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __BMI2INTRIN_H */
diff --git a/25.0.2/clang-include/bmiintrin.h b/25.0.2/clang-include/bmiintrin.h
new file mode 100644
index 0000000..da98792
--- /dev/null
+++ b/25.0.2/clang-include/bmiintrin.h
@@ -0,0 +1,155 @@
+/*===---- bmiintrin.h - BMI intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
+#error "Never use <bmiintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __BMIINTRIN_H
+#define __BMIINTRIN_H
+
+#define _tzcnt_u16(a)     (__tzcnt_u16((a)))
+#define _andn_u32(a, b)   (__andn_u32((a), (b)))
+/* _bextr_u32 != __bextr_u32 */
+#define _blsi_u32(a)      (__blsi_u32((a)))
+#define _blsmsk_u32(a)    (__blsmsk_u32((a)))
+#define _blsr_u32(a)      (__blsr_u32((a)))
+#define _tzcnt_u32(a)     (__tzcnt_u32((a)))
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("bmi")))
+
+/* Allow using the tzcnt intrinsics even for non-BMI targets. Since the TZCNT
+   instruction behaves as BSF on non-BMI targets, there is code that expects
+   to use it as a potentially faster version of BSF. */
+#define __RELAXED_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
+
+static __inline__ unsigned short __RELAXED_FN_ATTRS
+__tzcnt_u16(unsigned short __X)
+{
+  return __X ? __builtin_ctzs(__X) : 16;
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__andn_u32(unsigned int __X, unsigned int __Y)
+{
+  return ~__X & __Y;
+}
+
+/* AMD-specified, double-leading-underscore version of BEXTR */
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__bextr_u32(unsigned int __X, unsigned int __Y)
+{
+  return __builtin_ia32_bextr_u32(__X, __Y);
+}
+
+/* Intel-specified, single-leading-underscore version of BEXTR */
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_bextr_u32(unsigned int __X, unsigned int __Y, unsigned int __Z)
+{
+  return __builtin_ia32_bextr_u32 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
+}
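+
+/* Illustrative sketch, not part of the upstream header: the Intel-style
+ * _bextr_u32(x, start, len) packs its start and length operands into the
+ * single control word taken by the AMD-style __bextr_u32, so extracting the
+ * second byte of a hypothetical value __x can be written either way:
+ *
+ *   unsigned int __byte1 = _bextr_u32(__x, 8, 8);
+ *   unsigned int __same  = __bextr_u32(__x, 8 | (8 << 8));
+ */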
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__blsi_u32(unsigned int __X)
+{
+  return __X & -__X;
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__blsmsk_u32(unsigned int __X)
+{
+  return __X ^ (__X - 1);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__blsr_u32(unsigned int __X)
+{
+  return __X & (__X - 1);
+}
+
+static __inline__ unsigned int __RELAXED_FN_ATTRS
+__tzcnt_u32(unsigned int __X)
+{
+  return __X ? __builtin_ctz(__X) : 32;
+}
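+
+/* Illustrative sketch, not part of the upstream header: unlike a raw BSF,
+ * the tzcnt helpers above are well defined for a zero input, e.g.
+ *
+ *   __tzcnt_u32(0x08u) == 3
+ *   __tzcnt_u32(0u)    == 32
+ */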
+
+#ifdef __x86_64__
+
+#define _andn_u64(a, b)   (__andn_u64((a), (b)))
+/* _bextr_u64 != __bextr_u64 */
+#define _blsi_u64(a)      (__blsi_u64((a)))
+#define _blsmsk_u64(a)    (__blsmsk_u64((a)))
+#define _blsr_u64(a)      (__blsr_u64((a)))
+#define _tzcnt_u64(a)     (__tzcnt_u64((a)))
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__andn_u64 (unsigned long long __X, unsigned long long __Y)
+{
+  return ~__X & __Y;
+}
+
+/* AMD-specified, double-leading-underscore version of BEXTR */
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__bextr_u64(unsigned long long __X, unsigned long long __Y)
+{
+  return __builtin_ia32_bextr_u64(__X, __Y);
+}
+
+/* Intel-specified, single-leading-underscore version of BEXTR */
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_bextr_u64(unsigned long long __X, unsigned int __Y, unsigned int __Z)
+{
+  return __builtin_ia32_bextr_u64 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__blsi_u64(unsigned long long __X)
+{
+  return __X & -__X;
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__blsmsk_u64(unsigned long long __X)
+{
+  return __X ^ (__X - 1);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__blsr_u64(unsigned long long __X)
+{
+  return __X & (__X - 1);
+}
+
+static __inline__ unsigned long long __RELAXED_FN_ATTRS
+__tzcnt_u64(unsigned long long __X)
+{
+  return __X ? __builtin_ctzll(__X) : 64;
+}
+
+#endif /* __x86_64__ */
+
+#undef __DEFAULT_FN_ATTRS
+#undef __RELAXED_FN_ATTRS
+
+#endif /* __BMIINTRIN_H */
diff --git a/25.0.2/clang-include/cpuid.h b/25.0.2/clang-include/cpuid.h
new file mode 100644
index 0000000..5da02e0
--- /dev/null
+++ b/25.0.2/clang-include/cpuid.h
@@ -0,0 +1,209 @@
+/*===---- cpuid.h - X86 cpu model detection --------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !(__x86_64__ || __i386__)
+#error this header is for x86 only
+#endif
+
+/* Responses to the identification request made with %eax set to 0 */
+/* AMD:     "AuthenticAMD" */
+#define signature_AMD_ebx 0x68747541
+#define signature_AMD_edx 0x69746e65
+#define signature_AMD_ecx 0x444d4163
+/* CENTAUR: "CentaurHauls" */
+#define signature_CENTAUR_ebx 0x746e6543
+#define signature_CENTAUR_edx 0x48727561
+#define signature_CENTAUR_ecx 0x736c7561
+/* CYRIX:   "CyrixInstead" */
+#define signature_CYRIX_ebx 0x69727943
+#define signature_CYRIX_edx 0x736e4978
+#define signature_CYRIX_ecx 0x64616574
+/* INTEL:   "GenuineIntel" */
+#define signature_INTEL_ebx 0x756e6547
+#define signature_INTEL_edx 0x49656e69
+#define signature_INTEL_ecx 0x6c65746e
+/* TM1:     "TransmetaCPU" */
+#define signature_TM1_ebx 0x6e617254
+#define signature_TM1_edx 0x74656d73
+#define signature_TM1_ecx 0x55504361
+/* TM2:     "GenuineTMx86" */
+#define signature_TM2_ebx 0x756e6547
+#define signature_TM2_edx 0x54656e69
+#define signature_TM2_ecx 0x3638784d
+/* NSC:     "Geode by NSC" */
+#define signature_NSC_ebx 0x646f6547
+#define signature_NSC_edx 0x43534e20
+#define signature_NSC_ecx 0x79622065
+/* NEXGEN:  "NexGenDriven" */
+#define signature_NEXGEN_ebx 0x4778654e
+#define signature_NEXGEN_edx 0x72446e65
+#define signature_NEXGEN_ecx 0x6e657669
+/* RISE:    "RiseRiseRise" */
+#define signature_RISE_ebx 0x65736952
+#define signature_RISE_edx 0x65736952
+#define signature_RISE_ecx 0x65736952
+/* SIS:     "SiS SiS SiS " */
+#define signature_SIS_ebx 0x20536953
+#define signature_SIS_edx 0x20536953
+#define signature_SIS_ecx 0x20536953
+/* UMC:     "UMC UMC UMC " */
+#define signature_UMC_ebx 0x20434d55
+#define signature_UMC_edx 0x20434d55
+#define signature_UMC_ecx 0x20434d55
+/* VIA:     "VIA VIA VIA " */
+#define signature_VIA_ebx 0x20414956
+#define signature_VIA_edx 0x20414956
+#define signature_VIA_ecx 0x20414956
+/* VORTEX:  "Vortex86 SoC" */
+#define signature_VORTEX_ebx 0x74726f56
+#define signature_VORTEX_edx 0x36387865
+#define signature_VORTEX_ecx 0x436f5320
+
+/* Features in %ecx for level 1 */
+#define bit_SSE3        0x00000001
+#define bit_PCLMULQDQ   0x00000002
+#define bit_DTES64      0x00000004
+#define bit_MONITOR     0x00000008
+#define bit_DSCPL       0x00000010
+#define bit_VMX         0x00000020
+#define bit_SMX         0x00000040
+#define bit_EIST        0x00000080
+#define bit_TM2         0x00000100
+#define bit_SSSE3       0x00000200
+#define bit_CNXTID      0x00000400
+#define bit_FMA         0x00001000
+#define bit_CMPXCHG16B  0x00002000
+#define bit_xTPR        0x00004000
+#define bit_PDCM        0x00008000
+#define bit_PCID        0x00020000
+#define bit_DCA         0x00040000
+#define bit_SSE41       0x00080000
+#define bit_SSE42       0x00100000
+#define bit_x2APIC      0x00200000
+#define bit_MOVBE       0x00400000
+#define bit_POPCNT      0x00800000
+#define bit_TSCDeadline 0x01000000
+#define bit_AESNI       0x02000000
+#define bit_XSAVE       0x04000000
+#define bit_OSXSAVE     0x08000000
+#define bit_AVX         0x10000000
+#define bit_RDRND       0x40000000
+
+/* Features in %edx for level 1 */
+#define bit_FPU         0x00000001
+#define bit_VME         0x00000002
+#define bit_DE          0x00000004
+#define bit_PSE         0x00000008
+#define bit_TSC         0x00000010
+#define bit_MSR         0x00000020
+#define bit_PAE         0x00000040
+#define bit_MCE         0x00000080
+#define bit_CX8         0x00000100
+#define bit_APIC        0x00000200
+#define bit_SEP         0x00000800
+#define bit_MTRR        0x00001000
+#define bit_PGE         0x00002000
+#define bit_MCA         0x00004000
+#define bit_CMOV        0x00008000
+#define bit_PAT         0x00010000
+#define bit_PSE36       0x00020000
+#define bit_PSN         0x00040000
+#define bit_CLFSH       0x00080000
+#define bit_DS          0x00200000
+#define bit_ACPI        0x00400000
+#define bit_MMX         0x00800000
+#define bit_FXSR        0x01000000
+#define bit_FXSAVE      bit_FXSR    /* for gcc compat */
+#define bit_SSE         0x02000000
+#define bit_SSE2        0x04000000
+#define bit_SS          0x08000000
+#define bit_HTT         0x10000000
+#define bit_TM          0x20000000
+#define bit_PBE         0x80000000
+
+/* Features in %ebx for level 7 sub-leaf 0 */
+#define bit_FSGSBASE    0x00000001
+#define bit_SMEP        0x00000080
+#define bit_ENH_MOVSB   0x00000200
+
+#if __i386__
+#define __cpuid(__level, __eax, __ebx, __ecx, __edx) \
+    __asm("cpuid" : "=a"(__eax), "=b" (__ebx), "=c"(__ecx), "=d"(__edx) \
+                  : "0"(__level))
+
+#define __cpuid_count(__level, __count, __eax, __ebx, __ecx, __edx) \
+    __asm("cpuid" : "=a"(__eax), "=b" (__ebx), "=c"(__ecx), "=d"(__edx) \
+                  : "0"(__level), "2"(__count))
+#else
+/* x86-64 uses %rbx as the base register, so preserve it. */
+#define __cpuid(__level, __eax, __ebx, __ecx, __edx) \
+    __asm("  xchgq  %%rbx,%q1\n" \
+          "  cpuid\n" \
+          "  xchgq  %%rbx,%q1" \
+        : "=a"(__eax), "=r" (__ebx), "=c"(__ecx), "=d"(__edx) \
+        : "0"(__level))
+
+#define __cpuid_count(__level, __count, __eax, __ebx, __ecx, __edx) \
+    __asm("  xchgq  %%rbx,%q1\n" \
+          "  cpuid\n" \
+          "  xchgq  %%rbx,%q1" \
+        : "=a"(__eax), "=r" (__ebx), "=c"(__ecx), "=d"(__edx) \
+        : "0"(__level), "2"(__count))
+#endif
+
+static __inline int __get_cpuid (unsigned int __level, unsigned int *__eax,
+                                 unsigned int *__ebx, unsigned int *__ecx,
+                                 unsigned int *__edx) {
+    __cpuid(__level, *__eax, *__ebx, *__ecx, *__edx);
+    return 1;
+}
+
+static __inline int __get_cpuid_max (unsigned int __level, unsigned int *__sig)
+{
+    unsigned int __eax, __ebx, __ecx, __edx;
+#if __i386__
+    int __cpuid_supported;
+
+    __asm("  pushfl\n"
+          "  popl   %%eax\n"
+          "  movl   %%eax,%%ecx\n"
+          "  xorl   $0x00200000,%%eax\n"
+          "  pushl  %%eax\n"
+          "  popfl\n"
+          "  pushfl\n"
+          "  popl   %%eax\n"
+          "  movl   $0,%0\n"
+          "  cmpl   %%eax,%%ecx\n"
+          "  je     1f\n"
+          "  movl   $1,%0\n"
+          "1:"
+        : "=r" (__cpuid_supported) : : "eax", "ecx");
+    if (!__cpuid_supported)
+        return 0;
+#endif
+
+    __cpuid(__level, __eax, __ebx, __ecx, __edx);
+    if (__sig)
+        *__sig = __ebx;
+    return __eax;
+}
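+
+/* Illustrative sketch, not part of the upstream header: combining __get_cpuid
+ * with the feature bits defined above to test for SSE4.2 on the host CPU.
+ * The helper name is hypothetical.
+ */
+static __inline int __example_has_sse42 (void)
+{
+    unsigned int __eax, __ebx, __ecx, __edx;
+    if (!__get_cpuid (1, &__eax, &__ebx, &__ecx, &__edx))
+        return 0;
+    return (__ecx & bit_SSE42) != 0;
+}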
diff --git a/25.0.2/clang-include/cuda_builtin_vars.h b/25.0.2/clang-include/cuda_builtin_vars.h
new file mode 100644
index 0000000..901356b
--- /dev/null
+++ b/25.0.2/clang-include/cuda_builtin_vars.h
@@ -0,0 +1,110 @@
+/*===---- cuda_builtin_vars.h - CUDA built-in variables ---------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CUDA_BUILTIN_VARS_H
+#define __CUDA_BUILTIN_VARS_H
+
+// The file implements built-in CUDA variables using __declspec(property).
+// https://msdn.microsoft.com/en-us/library/yhfk0thd.aspx
+// All read accesses of built-in variable fields get converted into calls to a
+// getter function which in turn calls the appropriate builtin to fetch the
+// value.
+//
+// Example:
+//    int x = threadIdx.x;
+// IR output:
+//  %0 = call i32 @llvm.ptx.read.tid.x() #3
+// PTX output:
+//  mov.u32     %r2, %tid.x;
+
+#define __CUDA_DEVICE_BUILTIN(FIELD, INTRINSIC)                                \
+  __declspec(property(get = __fetch_builtin_##FIELD)) unsigned int FIELD;      \
+  static inline __attribute__((always_inline))                                 \
+      __attribute__((device)) unsigned int __fetch_builtin_##FIELD(void) {     \
+    return INTRINSIC;                                                          \
+  }
+
+#if __cplusplus >= 201103L
+#define __DELETE =delete
+#else
+#define __DELETE
+#endif
+
+// Make sure nobody can create instances of the special variable types.  nvcc
+// also disallows taking the address of special variables, so we disable the
+// address-of operator as well.
+#define __CUDA_DISALLOW_BUILTINVAR_ACCESS(TypeName)                            \
+  __attribute__((device)) TypeName() __DELETE;                                 \
+  __attribute__((device)) TypeName(const TypeName &) __DELETE;                 \
+  __attribute__((device)) void operator=(const TypeName &) const __DELETE;     \
+  __attribute__((device)) TypeName *operator&() const __DELETE
+
+struct __cuda_builtin_threadIdx_t {
+  __CUDA_DEVICE_BUILTIN(x,__builtin_ptx_read_tid_x());
+  __CUDA_DEVICE_BUILTIN(y,__builtin_ptx_read_tid_y());
+  __CUDA_DEVICE_BUILTIN(z,__builtin_ptx_read_tid_z());
+private:
+  __CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_threadIdx_t);
+};
+
+struct __cuda_builtin_blockIdx_t {
+  __CUDA_DEVICE_BUILTIN(x,__builtin_ptx_read_ctaid_x());
+  __CUDA_DEVICE_BUILTIN(y,__builtin_ptx_read_ctaid_y());
+  __CUDA_DEVICE_BUILTIN(z,__builtin_ptx_read_ctaid_z());
+private:
+  __CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_blockIdx_t);
+};
+
+struct __cuda_builtin_blockDim_t {
+  __CUDA_DEVICE_BUILTIN(x,__builtin_ptx_read_ntid_x());
+  __CUDA_DEVICE_BUILTIN(y,__builtin_ptx_read_ntid_y());
+  __CUDA_DEVICE_BUILTIN(z,__builtin_ptx_read_ntid_z());
+private:
+  __CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_blockDim_t);
+};
+
+struct __cuda_builtin_gridDim_t {
+  __CUDA_DEVICE_BUILTIN(x,__builtin_ptx_read_nctaid_x());
+  __CUDA_DEVICE_BUILTIN(y,__builtin_ptx_read_nctaid_y());
+  __CUDA_DEVICE_BUILTIN(z,__builtin_ptx_read_nctaid_z());
+private:
+  __CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_gridDim_t);
+};
+
+#define __CUDA_BUILTIN_VAR                                                     \
+  extern const __attribute__((device)) __attribute__((weak))
+__CUDA_BUILTIN_VAR __cuda_builtin_threadIdx_t threadIdx;
+__CUDA_BUILTIN_VAR __cuda_builtin_blockIdx_t blockIdx;
+__CUDA_BUILTIN_VAR __cuda_builtin_blockDim_t blockDim;
+__CUDA_BUILTIN_VAR __cuda_builtin_gridDim_t gridDim;
+
+// warpSize should translate to a read of %WARP_SZ, but there's currently no
+// builtin to do so. According to the PTX v4.2 docs, 'to date, all target
+// architectures have a WARP_SZ value of 32'.
+__attribute__((device)) const int warpSize = 32;
+
+#undef __CUDA_DEVICE_BUILTIN
+#undef __CUDA_BUILTIN_VAR
+#undef __CUDA_DISALLOW_BUILTINVAR_ACCESS
+
+#endif /* __CUDA_BUILTIN_VARS_H */
diff --git a/25.0.2/clang-include/emmintrin.h b/25.0.2/clang-include/emmintrin.h
new file mode 100644
index 0000000..cfc2c71
--- /dev/null
+++ b/25.0.2/clang-include/emmintrin.h
@@ -0,0 +1,1491 @@
+/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __EMMINTRIN_H
+#define __EMMINTRIN_H
+
+#include <xmmintrin.h>
+
+typedef double __m128d __attribute__((__vector_size__(16)));
+typedef long long __m128i __attribute__((__vector_size__(16)));
+
+/* Type defines.  */
+typedef double __v2df __attribute__ ((__vector_size__ (16)));
+typedef long long __v2di __attribute__ ((__vector_size__ (16)));
+typedef short __v8hi __attribute__((__vector_size__(16)));
+typedef char __v16qi __attribute__((__vector_size__(16)));
+
+/* We need an explicitly signed variant for char. Note that this shouldn't
+ * appear in the interface though. */
+typedef signed char __v16qs __attribute__((__vector_size__(16)));
+
+#include <f16cintrin.h>
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_add_sd(__m128d __a, __m128d __b)
+{
+  __a[0] += __b[0];
+  return __a;
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_add_pd(__m128d __a, __m128d __b)
+{
+  return __a + __b;
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_sub_sd(__m128d __a, __m128d __b)
+{
+  __a[0] -= __b[0];
+  return __a;
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_sub_pd(__m128d __a, __m128d __b)
+{
+  return __a - __b;
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mul_sd(__m128d __a, __m128d __b)
+{
+  __a[0] *= __b[0];
+  return __a;
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mul_pd(__m128d __a, __m128d __b)
+{
+  return __a * __b;
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_div_sd(__m128d __a, __m128d __b)
+{
+  __a[0] /= __b[0];
+  return __a;
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_div_pd(__m128d __a, __m128d __b)
+{
+  return __a / __b;
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_sqrt_sd(__m128d __a, __m128d __b)
+{
+  __m128d __c = __builtin_ia32_sqrtsd(__b);
+  return (__m128d) { __c[0], __a[1] };
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_sqrt_pd(__m128d __a)
+{
+  return __builtin_ia32_sqrtpd(__a);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_min_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_minsd(__a, __b);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_min_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_minpd(__a, __b);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_max_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_maxsd(__a, __b);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_max_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_maxpd(__a, __b);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_and_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)((__v4si)__a & (__v4si)__b);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_andnot_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)(~(__v4si)__a & (__v4si)__b);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_or_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)((__v4si)__a | (__v4si)__b);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_xor_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)((__v4si)__a ^ (__v4si)__b);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpeq_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpeqpd(__a, __b);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmplt_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpltpd(__a, __b);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmple_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmplepd(__a, __b);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpgt_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpltpd(__b, __a);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpge_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmplepd(__b, __a);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpord_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpordpd(__a, __b);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpunord_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpunordpd(__a, __b);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpneq_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpneqpd(__a, __b);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpnlt_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpnltpd(__a, __b);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpnle_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpnlepd(__a, __b);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpngt_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpnltpd(__b, __a);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpnge_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpnlepd(__b, __a);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpeq_sd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpeqsd(__a, __b);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmplt_sd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpltsd(__a, __b);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmple_sd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmplesd(__a, __b);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpgt_sd(__m128d __a, __m128d __b)
+{
+  __m128d __c = __builtin_ia32_cmpltsd(__b, __a);
+  return (__m128d) { __c[0], __a[1] };
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpge_sd(__m128d __a, __m128d __b)
+{
+  __m128d __c = __builtin_ia32_cmplesd(__b, __a);
+  return (__m128d) { __c[0], __a[1] };
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpord_sd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpordsd(__a, __b);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpunord_sd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpunordsd(__a, __b);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpneq_sd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpneqsd(__a, __b);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpnlt_sd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpnltsd(__a, __b);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpnle_sd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpnlesd(__a, __b);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpngt_sd(__m128d __a, __m128d __b)
+{
+  __m128d __c = __builtin_ia32_cmpnltsd(__b, __a);
+  return (__m128d) { __c[0], __a[1] };
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpnge_sd(__m128d __a, __m128d __b)
+{
+  __m128d __c = __builtin_ia32_cmpnlesd(__b, __a);
+  return (__m128d) { __c[0], __a[1] };
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_comieq_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_comisdeq(__a, __b);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_comilt_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_comisdlt(__a, __b);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_comile_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_comisdle(__a, __b);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_comigt_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_comisdgt(__a, __b);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_comige_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_comisdge(__a, __b);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_comineq_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_comisdneq(__a, __b);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_ucomieq_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_ucomisdeq(__a, __b);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_ucomilt_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_ucomisdlt(__a, __b);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_ucomile_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_ucomisdle(__a, __b);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_ucomigt_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_ucomisdgt(__a, __b);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_ucomige_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_ucomisdge(__a, __b);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_ucomineq_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_ucomisdneq(__a, __b);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtpd_ps(__m128d __a)
+{
+  return __builtin_ia32_cvtpd2ps(__a);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cvtps_pd(__m128 __a)
+{
+  return __builtin_ia32_cvtps2pd(__a);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cvtepi32_pd(__m128i __a)
+{
+  return __builtin_ia32_cvtdq2pd((__v4si)__a);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtpd_epi32(__m128d __a)
+{
+  return __builtin_ia32_cvtpd2dq(__a);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_cvtsd_si32(__m128d __a)
+{
+  return __builtin_ia32_cvtsd2si(__a);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtsd_ss(__m128 __a, __m128d __b)
+{
+  __a[0] = __b[0];
+  return __a;
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cvtsi32_sd(__m128d __a, int __b)
+{
+  __a[0] = __b;
+  return __a;
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cvtss_sd(__m128d __a, __m128 __b)
+{
+  __a[0] = __b[0];
+  return __a;
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvttpd_epi32(__m128d __a)
+{
+  return (__m128i)__builtin_ia32_cvttpd2dq(__a);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_cvttsd_si32(__m128d __a)
+{
+  return __a[0];
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cvtpd_pi32(__m128d __a)
+{
+  return (__m64)__builtin_ia32_cvtpd2pi(__a);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cvttpd_pi32(__m128d __a)
+{
+  return (__m64)__builtin_ia32_cvttpd2pi(__a);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cvtpi32_pd(__m64 __a)
+{
+  return __builtin_ia32_cvtpi2pd((__v2si)__a);
+}
+
+static __inline__ double __DEFAULT_FN_ATTRS
+_mm_cvtsd_f64(__m128d __a)
+{
+  return __a[0];
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_load_pd(double const *__dp)
+{
+  return *(__m128d*)__dp;
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_load1_pd(double const *__dp)
+{
+  struct __mm_load1_pd_struct {
+    double __u;
+  } __attribute__((__packed__, __may_alias__));
+  double __u = ((struct __mm_load1_pd_struct*)__dp)->__u;
+  return (__m128d){ __u, __u };
+}
+
+#define        _mm_load_pd1(dp)        _mm_load1_pd(dp)
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_loadr_pd(double const *__dp)
+{
+  __m128d __u = *(__m128d*)__dp;
+  return __builtin_shufflevector(__u, __u, 1, 0);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_loadu_pd(double const *__dp)
+{
+  struct __loadu_pd {
+    __m128d __v;
+  } __attribute__((__packed__, __may_alias__));
+  return ((struct __loadu_pd*)__dp)->__v;
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_load_sd(double const *__dp)
+{
+  struct __mm_load_sd_struct {
+    double __u;
+  } __attribute__((__packed__, __may_alias__));
+  double __u = ((struct __mm_load_sd_struct*)__dp)->__u;
+  return (__m128d){ __u, 0 };
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_loadh_pd(__m128d __a, double const *__dp)
+{
+  struct __mm_loadh_pd_struct {
+    double __u;
+  } __attribute__((__packed__, __may_alias__));
+  double __u = ((struct __mm_loadh_pd_struct*)__dp)->__u;
+  return (__m128d){ __a[0], __u };
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_loadl_pd(__m128d __a, double const *__dp)
+{
+  struct __mm_loadl_pd_struct {
+    double __u;
+  } __attribute__((__packed__, __may_alias__));
+  double __u = ((struct __mm_loadl_pd_struct*)__dp)->__u;
+  return (__m128d){ __u, __a[1] };
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_undefined_pd()
+{
+  return (__m128d)__builtin_ia32_undef128();
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_set_sd(double __w)
+{
+  return (__m128d){ __w, 0 };
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_set1_pd(double __w)
+{
+  return (__m128d){ __w, __w };
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_set_pd(double __w, double __x)
+{
+  return (__m128d){ __x, __w };
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_setr_pd(double __w, double __x)
+{
+  return (__m128d){ __w, __x };
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_setzero_pd(void)
+{
+  return (__m128d){ 0, 0 };
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_move_sd(__m128d __a, __m128d __b)
+{
+  return (__m128d){ __b[0], __a[1] };
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_store_sd(double *__dp, __m128d __a)
+{
+  struct __mm_store_sd_struct {
+    double __u;
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __mm_store_sd_struct*)__dp)->__u = __a[0];
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_store1_pd(double *__dp, __m128d __a)
+{
+  struct __mm_store1_pd_struct {
+    double __u[2];
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __mm_store1_pd_struct*)__dp)->__u[0] = __a[0];
+  ((struct __mm_store1_pd_struct*)__dp)->__u[1] = __a[0];
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_store_pd(double *__dp, __m128d __a)
+{
+  *(__m128d *)__dp = __a;
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_storeu_pd(double *__dp, __m128d __a)
+{
+  __builtin_ia32_storeupd(__dp, __a);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_storer_pd(double *__dp, __m128d __a)
+{
+  __a = __builtin_shufflevector(__a, __a, 1, 0);
+  *(__m128d *)__dp = __a;
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_storeh_pd(double *__dp, __m128d __a)
+{
+  struct __mm_storeh_pd_struct {
+    double __u;
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1];
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_storel_pd(double *__dp, __m128d __a)
+{
+  struct __mm_storeh_pd_struct {
+    double __u;
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0];
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_add_epi8(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v16qi)__a + (__v16qi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_add_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v8hi)__a + (__v8hi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_add_epi32(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v4si)__a + (__v4si)__b);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_add_si64(__m64 __a, __m64 __b)
+{
+  return (__m64)__builtin_ia32_paddq(__a, __b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_add_epi64(__m128i __a, __m128i __b)
+{
+  return __a + __b;
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_adds_epi8(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_adds_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_adds_epu8(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_adds_epu16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_avg_epu8(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_avg_epu16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_madd_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_max_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_max_epu8(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_min_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_min_epu8(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mulhi_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mulhi_epu16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mullo_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v8hi)__a * (__v8hi)__b);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_mul_su32(__m64 __a, __m64 __b)
+{
+  return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mul_epu32(__m128i __a, __m128i __b)
+{
+  return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sad_epu8(__m128i __a, __m128i __b)
+{
+  return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sub_epi8(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v16qi)__a - (__v16qi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sub_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v8hi)__a - (__v8hi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sub_epi32(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v4si)__a - (__v4si)__b);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_sub_si64(__m64 __a, __m64 __b)
+{
+  return (__m64)__builtin_ia32_psubq(__a, __b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sub_epi64(__m128i __a, __m128i __b)
+{
+  return __a - __b;
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_subs_epi8(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_psubsb128((__v16qi)__a, (__v16qi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_subs_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_psubsw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_subs_epu8(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_psubusb128((__v16qi)__a, (__v16qi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_subs_epu16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_psubusw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_and_si128(__m128i __a, __m128i __b)
+{
+  return __a & __b;
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_andnot_si128(__m128i __a, __m128i __b)
+{
+  return ~__a & __b;
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_or_si128(__m128i __a, __m128i __b)
+{
+  return __a | __b;
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_xor_si128(__m128i __a, __m128i __b)
+{
+  return __a ^ __b;
+}
+
+#define _mm_slli_si128(a, imm) __extension__ ({                         \
+  (__m128i)__builtin_shufflevector((__v16qi)_mm_setzero_si128(),        \
+                                   (__v16qi)(__m128i)(a),               \
+                                   ((imm)&0xF0) ? 0 : 16 - ((imm)&0xF), \
+                                   ((imm)&0xF0) ? 0 : 17 - ((imm)&0xF), \
+                                   ((imm)&0xF0) ? 0 : 18 - ((imm)&0xF), \
+                                   ((imm)&0xF0) ? 0 : 19 - ((imm)&0xF), \
+                                   ((imm)&0xF0) ? 0 : 20 - ((imm)&0xF), \
+                                   ((imm)&0xF0) ? 0 : 21 - ((imm)&0xF), \
+                                   ((imm)&0xF0) ? 0 : 22 - ((imm)&0xF), \
+                                   ((imm)&0xF0) ? 0 : 23 - ((imm)&0xF), \
+                                   ((imm)&0xF0) ? 0 : 24 - ((imm)&0xF), \
+                                   ((imm)&0xF0) ? 0 : 25 - ((imm)&0xF), \
+                                   ((imm)&0xF0) ? 0 : 26 - ((imm)&0xF), \
+                                   ((imm)&0xF0) ? 0 : 27 - ((imm)&0xF), \
+                                   ((imm)&0xF0) ? 0 : 28 - ((imm)&0xF), \
+                                   ((imm)&0xF0) ? 0 : 29 - ((imm)&0xF), \
+                                   ((imm)&0xF0) ? 0 : 30 - ((imm)&0xF), \
+                                   ((imm)&0xF0) ? 0 : 31 - ((imm)&0xF)); })
+
+#define _mm_bslli_si128(a, imm) \
+  _mm_slli_si128((a), (imm))
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_slli_epi16(__m128i __a, int __count)
+{
+  return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sll_epi16(__m128i __a, __m128i __count)
+{
+  return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_slli_epi32(__m128i __a, int __count)
+{
+  return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sll_epi32(__m128i __a, __m128i __count)
+{
+  return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_slli_epi64(__m128i __a, int __count)
+{
+  return __builtin_ia32_psllqi128(__a, __count);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sll_epi64(__m128i __a, __m128i __count)
+{
+  return __builtin_ia32_psllq128(__a, __count);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_srai_epi16(__m128i __a, int __count)
+{
+  return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sra_epi16(__m128i __a, __m128i __count)
+{
+  return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_srai_epi32(__m128i __a, int __count)
+{
+  return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sra_epi32(__m128i __a, __m128i __count)
+{
+  return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
+}
+
+#define _mm_srli_si128(a, imm) __extension__ ({                          \
+  (__m128i)__builtin_shufflevector((__v16qi)(__m128i)(a),                \
+                                   (__v16qi)_mm_setzero_si128(),         \
+                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 0,  \
+                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 1,  \
+                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 2,  \
+                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 3,  \
+                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 4,  \
+                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 5,  \
+                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 6,  \
+                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 7,  \
+                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 8,  \
+                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 9,  \
+                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 10, \
+                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 11, \
+                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 12, \
+                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 13, \
+                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 14, \
+                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 15); })
+
+#define _mm_bsrli_si128(a, imm) \
+  _mm_srli_si128((a), (imm))
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_srli_epi16(__m128i __a, int __count)
+{
+  return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_srl_epi16(__m128i __a, __m128i __count)
+{
+  return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_srli_epi32(__m128i __a, int __count)
+{
+  return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_srl_epi32(__m128i __a, __m128i __count)
+{
+  return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_srli_epi64(__m128i __a, int __count)
+{
+  return __builtin_ia32_psrlqi128(__a, __count);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_srl_epi64(__m128i __a, __m128i __count)
+{
+  return __builtin_ia32_psrlq128(__a, __count);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cmpeq_epi8(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v16qi)__a == (__v16qi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cmpeq_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v8hi)__a == (__v8hi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cmpeq_epi32(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v4si)__a == (__v4si)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cmpgt_epi8(__m128i __a, __m128i __b)
+{
+  /* This function always performs a signed comparison, but __v16qi is a char
+     which may be signed or unsigned, so use __v16qs. */
+  return (__m128i)((__v16qs)__a > (__v16qs)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cmpgt_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v8hi)__a > (__v8hi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cmpgt_epi32(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v4si)__a > (__v4si)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cmplt_epi8(__m128i __a, __m128i __b)
+{
+  return _mm_cmpgt_epi8(__b, __a);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cmplt_epi16(__m128i __a, __m128i __b)
+{
+  return _mm_cmpgt_epi16(__b, __a);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cmplt_epi32(__m128i __a, __m128i __b)
+{
+  return _mm_cmpgt_epi32(__b, __a);
+}
+
+#ifdef __x86_64__
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cvtsi64_sd(__m128d __a, long long __b)
+{
+  __a[0] = __b;
+  return __a;
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm_cvtsd_si64(__m128d __a)
+{
+  return __builtin_ia32_cvtsd2si64(__a);
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm_cvttsd_si64(__m128d __a)
+{
+  return __a[0];
+}
+#endif
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtepi32_ps(__m128i __a)
+{
+  return __builtin_ia32_cvtdq2ps((__v4si)__a);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtps_epi32(__m128 __a)
+{
+  return (__m128i)__builtin_ia32_cvtps2dq(__a);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvttps_epi32(__m128 __a)
+{
+  return (__m128i)__builtin_ia32_cvttps2dq(__a);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtsi32_si128(int __a)
+{
+  return (__m128i)(__v4si){ __a, 0, 0, 0 };
+}
+
+#ifdef __x86_64__
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtsi64_si128(long long __a)
+{
+  return (__m128i){ __a, 0 };
+}
+#endif
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_cvtsi128_si32(__m128i __a)
+{
+  __v4si __b = (__v4si)__a;
+  return __b[0];
+}
+
+#ifdef __x86_64__
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm_cvtsi128_si64(__m128i __a)
+{
+  return __a[0];
+}
+#endif
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_load_si128(__m128i const *__p)
+{
+  return *__p;
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_loadu_si128(__m128i const *__p)
+{
+  struct __loadu_si128 {
+    __m128i __v;
+  } __attribute__((__packed__, __may_alias__));
+  return ((struct __loadu_si128*)__p)->__v;
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_loadl_epi64(__m128i const *__p)
+{
+  struct __mm_loadl_epi64_struct {
+    long long __u;
+  } __attribute__((__packed__, __may_alias__));
+  return (__m128i) { ((struct __mm_loadl_epi64_struct*)__p)->__u, 0};
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_undefined_si128()
+{
+  return (__m128i)__builtin_ia32_undef128();
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_set_epi64x(long long __q1, long long __q0)
+{
+  return (__m128i){ __q0, __q1 };
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_set_epi64(__m64 __q1, __m64 __q0)
+{
+  return (__m128i){ (long long)__q0, (long long)__q1 };
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_set_epi32(int __i3, int __i2, int __i1, int __i0)
+{
+  return (__m128i)(__v4si){ __i0, __i1, __i2, __i3};
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, short __w2, short __w1, short __w0)
+{
+  return (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 };
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0)
+{
+  return (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 };
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_set1_epi64x(long long __q)
+{
+  return (__m128i){ __q, __q };
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_set1_epi64(__m64 __q)
+{
+  return (__m128i){ (long long)__q, (long long)__q };
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_set1_epi32(int __i)
+{
+  return (__m128i)(__v4si){ __i, __i, __i, __i };
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_set1_epi16(short __w)
+{
+  return (__m128i)(__v8hi){ __w, __w, __w, __w, __w, __w, __w, __w };
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_set1_epi8(char __b)
+{
+  return (__m128i)(__v16qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b };
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_setr_epi64(__m64 __q0, __m64 __q1)
+{
+  return (__m128i){ (long long)__q0, (long long)__q1 };
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_setr_epi32(int __i0, int __i1, int __i2, int __i3)
+{
+  return (__m128i)(__v4si){ __i0, __i1, __i2, __i3};
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, short __w5, short __w6, short __w7)
+{
+  return (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 };
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7, char __b8, char __b9, char __b10, char __b11, char __b12, char __b13, char __b14, char __b15)
+{
+  return (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 };
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_setzero_si128(void)
+{
+  return (__m128i){ 0LL, 0LL };
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_store_si128(__m128i *__p, __m128i __b)
+{
+  *__p = __b;
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_storeu_si128(__m128i *__p, __m128i __b)
+{
+  __builtin_ia32_storedqu((char *)__p, (__v16qi)__b);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
+{
+  __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_storel_epi64(__m128i *__p, __m128i __a)
+{
+  struct __mm_storel_epi64_struct {
+    long long __u;
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __mm_storel_epi64_struct*)__p)->__u = __a[0];
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_stream_pd(double *__p, __m128d __a)
+{
+  __builtin_ia32_movntpd(__p, __a);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_stream_si128(__m128i *__p, __m128i __a)
+{
+  __builtin_ia32_movntdq(__p, __a);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_stream_si32(int *__p, int __a)
+{
+  __builtin_ia32_movnti(__p, __a);
+}
+
+#ifdef __x86_64__
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_stream_si64(long long *__p, long long __a)
+{
+  __builtin_ia32_movnti64(__p, __a);
+}
+#endif
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_clflush(void const *__p)
+{
+  __builtin_ia32_clflush(__p);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_lfence(void)
+{
+  __builtin_ia32_lfence();
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mfence(void)
+{
+  __builtin_ia32_mfence();
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_packs_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_packs_epi32(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_packus_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_extract_epi16(__m128i __a, int __imm)
+{
+  __v8hi __b = (__v8hi)__a;
+  return (unsigned short)__b[__imm & 7];
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_insert_epi16(__m128i __a, int __b, int __imm)
+{
+  __v8hi __c = (__v8hi)__a;
+  __c[__imm & 7] = __b;
+  return (__m128i)__c;
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_movemask_epi8(__m128i __a)
+{
+  return __builtin_ia32_pmovmskb128((__v16qi)__a);
+}
+
+#define _mm_shuffle_epi32(a, imm) __extension__ ({ \
+  (__m128i)__builtin_shufflevector((__v4si)(__m128i)(a), \
+                                   (__v4si)_mm_setzero_si128(), \
+                                   (imm) & 0x3, ((imm) & 0xc) >> 2, \
+                                   ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6); })
+
+#define _mm_shufflelo_epi16(a, imm) __extension__ ({ \
+  (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \
+                                   (__v8hi)_mm_setzero_si128(), \
+                                   (imm) & 0x3, ((imm) & 0xc) >> 2, \
+                                   ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \
+                                   4, 5, 6, 7); })
+
+#define _mm_shufflehi_epi16(a, imm) __extension__ ({ \
+  (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \
+                                   (__v8hi)_mm_setzero_si128(), \
+                                   0, 1, 2, 3, \
+                                   4 + (((imm) & 0x03) >> 0), \
+                                   4 + (((imm) & 0x0c) >> 2), \
+                                   4 + (((imm) & 0x30) >> 4), \
+                                   4 + (((imm) & 0xc0) >> 6)); })
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_unpackhi_epi8(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_unpackhi_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_unpackhi_epi32(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_unpackhi_epi64(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_shufflevector(__a, __b, 1, 2+1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_unpacklo_epi8(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_unpacklo_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_unpacklo_epi32(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_unpacklo_epi64(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_shufflevector(__a, __b, 0, 2+0);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_movepi64_pi64(__m128i __a)
+{
+  return (__m64)__a[0];
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_movpi64_epi64(__m64 __a)
+{
+  return (__m128i){ (long long)__a, 0 };
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_move_epi64(__m128i __a)
+{
+  return __builtin_shufflevector(__a, (__m128i){ 0 }, 0, 2);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_unpackhi_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_shufflevector(__a, __b, 1, 2+1);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_unpacklo_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_shufflevector(__a, __b, 0, 2+0);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_movemask_pd(__m128d __a)
+{
+  return __builtin_ia32_movmskpd(__a);
+}
+
+#define _mm_shuffle_pd(a, b, i) __extension__ ({ \
+  (__m128d)__builtin_shufflevector((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \
+                                   (i) & 1, (((i) & 2) >> 1) + 2); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_castpd_ps(__m128d __a)
+{
+  return (__m128)__a;
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_castpd_si128(__m128d __a)
+{
+  return (__m128i)__a;
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_castps_pd(__m128 __a)
+{
+  return (__m128d)__a;
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_castps_si128(__m128 __a)
+{
+  return (__m128i)__a;
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_castsi128_ps(__m128i __a)
+{
+  return (__m128)__a;
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_castsi128_pd(__m128i __a)
+{
+  return (__m128d)__a;
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_pause(void)
+{
+  __builtin_ia32_pause();
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
+
+#endif /* __EMMINTRIN_H */
diff --git a/25.0.2/clang-include/f16cintrin.h b/25.0.2/clang-include/f16cintrin.h
new file mode 100644
index 0000000..c655d98
--- /dev/null
+++ b/25.0.2/clang-include/f16cintrin.h
@@ -0,0 +1,45 @@
+/*===---- f16cintrin.h - F16C intrinsics -----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined __X86INTRIN_H && !defined __EMMINTRIN_H && !defined __IMMINTRIN_H
+#error "Never use <f16cintrin.h> directly; include <emmintrin.h> instead."
+#endif
+
+#ifndef __F16CINTRIN_H
+#define __F16CINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("f16c")))
+
+#define _mm_cvtps_ph(a, imm) __extension__ ({ \
+ (__m128i)__builtin_ia32_vcvtps2ph((__v4sf)(__m128)(a), (imm)); })
+
+static __inline __m128 __DEFAULT_FN_ATTRS
+_mm_cvtph_ps(__m128i __a)
+{
+  return (__m128)__builtin_ia32_vcvtph2ps((__v8hi)__a);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __F16CINTRIN_H */
diff --git a/25.0.2/clang-include/float.h b/25.0.2/clang-include/float.h
new file mode 100644
index 0000000..238cf76
--- /dev/null
+++ b/25.0.2/clang-include/float.h
@@ -0,0 +1,124 @@
+/*===---- float.h - Characteristics of floating point types ----------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __FLOAT_H
+#define __FLOAT_H
+
+/* If we're on MinGW, fall back to the system's float.h, which might have
+ * additional definitions provided for Windows.
+ * For more details see http://msdn.microsoft.com/en-us/library/y0ybw9fy.aspx
+ */
+#if (defined(__MINGW32__) || defined(_MSC_VER)) && __STDC_HOSTED__ && \
+    __has_include_next(<float.h>)
+#  include_next <float.h>
+
+/* Undefine anything that we'll be redefining below. */
+#  undef FLT_EVAL_METHOD
+#  undef FLT_ROUNDS
+#  undef FLT_RADIX
+#  undef FLT_MANT_DIG
+#  undef DBL_MANT_DIG
+#  undef LDBL_MANT_DIG
+#  undef DECIMAL_DIG
+#  undef FLT_DIG
+#  undef DBL_DIG
+#  undef LDBL_DIG
+#  undef FLT_MIN_EXP
+#  undef DBL_MIN_EXP
+#  undef LDBL_MIN_EXP
+#  undef FLT_MIN_10_EXP
+#  undef DBL_MIN_10_EXP
+#  undef LDBL_MIN_10_EXP
+#  undef FLT_MAX_EXP
+#  undef DBL_MAX_EXP
+#  undef LDBL_MAX_EXP
+#  undef FLT_MAX_10_EXP
+#  undef DBL_MAX_10_EXP
+#  undef LDBL_MAX_10_EXP
+#  undef FLT_MAX
+#  undef DBL_MAX
+#  undef LDBL_MAX
+#  undef FLT_EPSILON
+#  undef DBL_EPSILON
+#  undef LDBL_EPSILON
+#  undef FLT_MIN
+#  undef DBL_MIN
+#  undef LDBL_MIN
+#  if __STDC_VERSION__ >= 201112L || !defined(__STRICT_ANSI__)
+#    undef FLT_TRUE_MIN
+#    undef DBL_TRUE_MIN
+#    undef LDBL_TRUE_MIN
+#  endif
+#endif
+
+/* Characteristics of floating point types, C99 5.2.4.2.2 */
+
+#define FLT_EVAL_METHOD __FLT_EVAL_METHOD__
+#define FLT_ROUNDS (__builtin_flt_rounds())
+#define FLT_RADIX __FLT_RADIX__
+
+#define FLT_MANT_DIG __FLT_MANT_DIG__
+#define DBL_MANT_DIG __DBL_MANT_DIG__
+#define LDBL_MANT_DIG __LDBL_MANT_DIG__
+
+#define DECIMAL_DIG __DECIMAL_DIG__
+
+#define FLT_DIG __FLT_DIG__
+#define DBL_DIG __DBL_DIG__
+#define LDBL_DIG __LDBL_DIG__
+
+#define FLT_MIN_EXP __FLT_MIN_EXP__
+#define DBL_MIN_EXP __DBL_MIN_EXP__
+#define LDBL_MIN_EXP __LDBL_MIN_EXP__
+
+#define FLT_MIN_10_EXP __FLT_MIN_10_EXP__
+#define DBL_MIN_10_EXP __DBL_MIN_10_EXP__
+#define LDBL_MIN_10_EXP __LDBL_MIN_10_EXP__
+
+#define FLT_MAX_EXP __FLT_MAX_EXP__
+#define DBL_MAX_EXP __DBL_MAX_EXP__
+#define LDBL_MAX_EXP __LDBL_MAX_EXP__
+
+#define FLT_MAX_10_EXP __FLT_MAX_10_EXP__
+#define DBL_MAX_10_EXP __DBL_MAX_10_EXP__
+#define LDBL_MAX_10_EXP __LDBL_MAX_10_EXP__
+
+#define FLT_MAX __FLT_MAX__
+#define DBL_MAX __DBL_MAX__
+#define LDBL_MAX __LDBL_MAX__
+
+#define FLT_EPSILON __FLT_EPSILON__
+#define DBL_EPSILON __DBL_EPSILON__
+#define LDBL_EPSILON __LDBL_EPSILON__
+
+#define FLT_MIN __FLT_MIN__
+#define DBL_MIN __DBL_MIN__
+#define LDBL_MIN __LDBL_MIN__
+
+#if __STDC_VERSION__ >= 201112L || !defined(__STRICT_ANSI__)
+#  define FLT_TRUE_MIN __FLT_DENORM_MIN__
+#  define DBL_TRUE_MIN __DBL_DENORM_MIN__
+#  define LDBL_TRUE_MIN __LDBL_DENORM_MIN__
+#endif
+
+#endif /* __FLOAT_H */
diff --git a/25.0.2/clang-include/fma4intrin.h b/25.0.2/clang-include/fma4intrin.h
new file mode 100644
index 0000000..f117887
--- /dev/null
+++ b/25.0.2/clang-include/fma4intrin.h
@@ -0,0 +1,230 @@
+/*===---- fma4intrin.h - FMA4 intrinsics -----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __X86INTRIN_H
+#error "Never use <fma4intrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __FMA4INTRIN_H
+#define __FMA4INTRIN_H
+
+#include <pmmintrin.h>
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("fma4")))
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_macc_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddps(__A, __B, __C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_macc_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddpd(__A, __B, __C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_macc_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddss(__A, __B, __C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_macc_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddsd(__A, __B, __C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_msub_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmsubps(__A, __B, __C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_msub_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmsubpd(__A, __B, __C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_msub_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmsubss(__A, __B, __C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_msub_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmsubsd(__A, __B, __C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_nmacc_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfnmaddps(__A, __B, __C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_nmacc_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfnmaddpd(__A, __B, __C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_nmacc_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfnmaddss(__A, __B, __C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_nmacc_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfnmaddsd(__A, __B, __C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_nmsub_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfnmsubps(__A, __B, __C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_nmsub_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfnmsubpd(__A, __B, __C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_nmsub_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfnmsubss(__A, __B, __C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_nmsub_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfnmsubsd(__A, __B, __C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maddsub_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddsubps(__A, __B, __C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maddsub_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddsubpd(__A, __B, __C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_msubadd_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmsubaddps(__A, __B, __C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_msubadd_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmsubaddpd(__A, __B, __C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_macc_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmaddps256(__A, __B, __C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_macc_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmaddpd256(__A, __B, __C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_msub_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmsubps256(__A, __B, __C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_msub_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmsubpd256(__A, __B, __C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_nmacc_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfnmaddps256(__A, __B, __C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_nmacc_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfnmaddpd256(__A, __B, __C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_nmsub_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfnmsubps256(__A, __B, __C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_nmsub_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfnmsubpd256(__A, __B, __C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maddsub_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmaddsubps256(__A, __B, __C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maddsub_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmaddsubpd256(__A, __B, __C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_msubadd_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmsubaddps256(__A, __B, __C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_msubadd_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmsubaddpd256(__A, __B, __C);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __FMA4INTRIN_H */
diff --git a/25.0.2/clang-include/fmaintrin.h b/25.0.2/clang-include/fmaintrin.h
new file mode 100644
index 0000000..114a143
--- /dev/null
+++ b/25.0.2/clang-include/fmaintrin.h
@@ -0,0 +1,228 @@
+/*===---- fmaintrin.h - FMA intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <fmaintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __FMAINTRIN_H
+#define __FMAINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("fma")))
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddps(__A, __B, __C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddpd(__A, __B, __C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddss(__A, __B, __C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddsd(__A, __B, __C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmsubps(__A, __B, __C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmsubpd(__A, __B, __C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmsubss(__A, __B, __C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmsubsd(__A, __B, __C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfnmaddps(__A, __B, __C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfnmaddpd(__A, __B, __C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfnmaddss(__A, __B, __C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfnmaddsd(__A, __B, __C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfnmsubps(__A, __B, __C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfnmsubpd(__A, __B, __C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfnmsubss(__A, __B, __C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfnmsubsd(__A, __B, __C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddsubps(__A, __B, __C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddsubpd(__A, __B, __C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmsubaddps(__A, __B, __C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmsubaddpd(__A, __B, __C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmaddps256(__A, __B, __C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmaddpd256(__A, __B, __C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmsubps256(__A, __B, __C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmsubpd256(__A, __B, __C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfnmaddps256(__A, __B, __C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfnmaddpd256(__A, __B, __C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfnmsubps256(__A, __B, __C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfnmsubpd256(__A, __B, __C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmaddsubps256(__A, __B, __C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmaddsubpd256(__A, __B, __C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmsubaddps256(__A, __B, __C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmsubaddpd256(__A, __B, __C);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __FMAINTRIN_H */
diff --git a/25.0.2/clang-include/fxsrintrin.h b/25.0.2/clang-include/fxsrintrin.h
new file mode 100644
index 0000000..ac6026a
--- /dev/null
+++ b/25.0.2/clang-include/fxsrintrin.h
@@ -0,0 +1,55 @@
+/*===---- fxsrintrin.h - FXSR intrinsic ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <fxsrintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __FXSRINTRIN_H
+#define __FXSRINTRIN_H
+
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__,  __target__("fxsr")))
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_fxsave(void *__p) {
+  return __builtin_ia32_fxsave(__p);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_fxsave64(void *__p) {
+  return __builtin_ia32_fxsave64(__p);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_fxrstor(void *__p) {
+  return __builtin_ia32_fxrstor(__p);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_fxrstor64(void *__p) {
+  return __builtin_ia32_fxrstor64(__p);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
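A minimal usage sketch (not part of the prebuilt header): the FXSR intrinsics above operate on the 512-byte FXSAVE area, which must be 16-byte aligned. The buffer and function names below are illustrative, and the file is compiled with -mfxsr.

    #include <immintrin.h>

    /* 512-byte, 16-byte-aligned FXSAVE area */
    static unsigned char fpu_state[512] __attribute__((aligned(16)));

    void checkpoint_fpu(void) { _fxsave(fpu_state); }   /* save x87/MMX/SSE state */
    void restore_fpu(void)    { _fxrstor(fpu_state); }  /* restore it later */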
diff --git a/25.0.2/clang-include/htmintrin.h b/25.0.2/clang-include/htmintrin.h
new file mode 100644
index 0000000..0088c7c
--- /dev/null
+++ b/25.0.2/clang-include/htmintrin.h
@@ -0,0 +1,226 @@
+/*===---- htmintrin.h - Standard header for PowerPC HTM ---------------===*\
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+\*===----------------------------------------------------------------------===*/
+
+#ifndef __HTMINTRIN_H
+#define __HTMINTRIN_H
+
+#ifndef __HTM__
+#error "HTM instruction set not enabled"
+#endif
+
+#ifdef __powerpc__
+
+#include <stdint.h>
+
+typedef uint64_t texasr_t;
+typedef uint32_t texasru_t;
+typedef uint32_t texasrl_t;
+typedef uintptr_t tfiar_t;
+typedef uintptr_t tfhar_t;
+
+#define _HTM_STATE(CR0) ((CR0 >> 1) & 0x3)
+#define _HTM_NONTRANSACTIONAL 0x0
+#define _HTM_SUSPENDED        0x1
+#define _HTM_TRANSACTIONAL    0x2
+
+#define _TEXASR_EXTRACT_BITS(TEXASR,BITNUM,SIZE) \
+  (((TEXASR) >> (63-(BITNUM))) & ((1<<(SIZE))-1))
+#define _TEXASRU_EXTRACT_BITS(TEXASR,BITNUM,SIZE) \
+  (((TEXASR) >> (31-(BITNUM))) & ((1<<(SIZE))-1))
+
+#define _TEXASR_FAILURE_CODE(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 7, 8)
+#define _TEXASRU_FAILURE_CODE(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 7, 8)
+
+#define _TEXASR_FAILURE_PERSISTENT(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 7, 1)
+#define _TEXASRU_FAILURE_PERSISTENT(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 7, 1)
+
+#define _TEXASR_DISALLOWED(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 8, 1)
+#define _TEXASRU_DISALLOWED(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 8, 1)
+
+#define _TEXASR_NESTING_OVERFLOW(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 9, 1)
+#define _TEXASRU_NESTING_OVERFLOW(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 9, 1)
+
+#define _TEXASR_FOOTPRINT_OVERFLOW(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 10, 1)
+#define _TEXASRU_FOOTPRINT_OVERFLOW(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 10, 1)
+
+#define _TEXASR_SELF_INDUCED_CONFLICT(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 11, 1)
+#define _TEXASRU_SELF_INDUCED_CONFLICT(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 11, 1)
+
+#define _TEXASR_NON_TRANSACTIONAL_CONFLICT(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 12, 1)
+#define _TEXASRU_NON_TRANSACTIONAL_CONFLICT(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 12, 1)
+
+#define _TEXASR_TRANSACTION_CONFLICT(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 13, 1)
+#define _TEXASRU_TRANSACTION_CONFLICT(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 13, 1)
+
+#define _TEXASR_TRANSLATION_INVALIDATION_CONFLICT(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 14, 1)
+#define _TEXASRU_TRANSLATION_INVALIDATION_CONFLICT(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 14, 1)
+
+#define _TEXASR_IMPLEMENTAION_SPECIFIC(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 15, 1)
+#define _TEXASRU_IMPLEMENTAION_SPECIFIC(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 15, 1)
+
+#define _TEXASR_INSTRUCTION_FETCH_CONFLICT(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 16, 1)
+#define _TEXASRU_INSTRUCTION_FETCH_CONFLICT(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 16, 1)
+
+#define _TEXASR_ABORT(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 31, 1)
+#define _TEXASRU_ABORT(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 31, 1)
+
+
+#define _TEXASR_SUSPENDED(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 32, 1)
+
+#define _TEXASR_PRIVILEGE(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 35, 2)
+
+#define _TEXASR_FAILURE_SUMMARY(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 36, 1)
+
+#define _TEXASR_TFIAR_EXACT(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 37, 1)
+
+#define _TEXASR_ROT(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 38, 1)
+
+#define _TEXASR_TRANSACTION_LEVEL(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 63, 12)
+
+#endif /* __powerpc */
+
+#ifdef __s390__
+
+/* Condition codes generated by tbegin  */
+#define _HTM_TBEGIN_STARTED       0
+#define _HTM_TBEGIN_INDETERMINATE 1
+#define _HTM_TBEGIN_TRANSIENT     2
+#define _HTM_TBEGIN_PERSISTENT    3
+
+/* The abort codes below this threshold are reserved for machine use.  */
+#define _HTM_FIRST_USER_ABORT_CODE 256
+
+/* The transaction diagnostic block as it is defined in the Principles
+   of Operation, chapter 5-91.  */
+
+struct __htm_tdb {
+  unsigned char format;                /*   0 */
+  unsigned char flags;
+  unsigned char reserved1[4];
+  unsigned short nesting_depth;
+  unsigned long long abort_code;       /*   8 */
+  unsigned long long conflict_token;   /*  16 */
+  unsigned long long atia;             /*  24 */
+  unsigned char eaid;                  /*  32 */
+  unsigned char dxc;
+  unsigned char reserved2[2];
+  unsigned int program_int_id;
+  unsigned long long exception_id;     /*  40 */
+  unsigned long long bea;              /*  48 */
+  unsigned char reserved3[72];         /*  56 */
+  unsigned long long gprs[16];         /* 128 */
+} __attribute__((__packed__, __aligned__ (8)));
+
+
+/* Helper intrinsics to retry tbegin in case of transient failure.  */
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+__builtin_tbegin_retry_null (int retry)
+{
+  int cc, i = 0;
+
+  while ((cc = __builtin_tbegin(0)) == _HTM_TBEGIN_TRANSIENT
+         && i++ < retry)
+    __builtin_tx_assist(i);
+
+  return cc;
+}
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+__builtin_tbegin_retry_tdb (void *tdb, int retry)
+{
+  int cc, i = 0;
+
+  while ((cc = __builtin_tbegin(tdb)) == _HTM_TBEGIN_TRANSIENT
+         && i++ < retry)
+    __builtin_tx_assist(i);
+
+  return cc;
+}
+
+#define __builtin_tbegin_retry(tdb, retry) \
+  (__builtin_constant_p(tdb == 0) && tdb == 0 ? \
+   __builtin_tbegin_retry_null(retry) : \
+   __builtin_tbegin_retry_tdb(tdb, retry))
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+__builtin_tbegin_retry_nofloat_null (int retry)
+{
+  int cc, i = 0;
+
+  while ((cc = __builtin_tbegin_nofloat(0)) == _HTM_TBEGIN_TRANSIENT
+         && i++ < retry)
+    __builtin_tx_assist(i);
+
+  return cc;
+}
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+__builtin_tbegin_retry_nofloat_tdb (void *tdb, int retry)
+{
+  int cc, i = 0;
+
+  while ((cc = __builtin_tbegin_nofloat(tdb)) == _HTM_TBEGIN_TRANSIENT
+         && i++ < retry)
+    __builtin_tx_assist(i);
+
+  return cc;
+}
+
+#define __builtin_tbegin_retry_nofloat(tdb, retry) \
+  (__builtin_constant_p(tdb == 0) && tdb == 0 ? \
+   __builtin_tbegin_retry_nofloat_null(retry) : \
+   __builtin_tbegin_retry_nofloat_tdb(tdb, retry))
+
+#endif /* __s390__ */
+
+#endif /* __HTMINTRIN_H */
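A brief sketch (not part of the header) of how the s390 retry helpers defined above might be used; add_in_transaction is an illustrative name and the code only compiles for z/Architecture with -mhtm.

    #include <htmintrin.h>

    int add_in_transaction(long *counter)
    {
      /* Retry up to 5 times on transient aborts via the helper macro above. */
      int cc = __builtin_tbegin_retry(0, 5);
      if (cc == _HTM_TBEGIN_STARTED) {
        ++*counter;           /* transactional update */
        __builtin_tend();
        return 1;
      }
      return 0;               /* gave up; cc holds the tbegin condition code */
    }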
diff --git a/25.0.2/clang-include/htmxlintrin.h b/25.0.2/clang-include/htmxlintrin.h
new file mode 100644
index 0000000..c7571ec
--- /dev/null
+++ b/25.0.2/clang-include/htmxlintrin.h
@@ -0,0 +1,363 @@
+/*===---- htmxlintrin.h - XL compiler HTM execution intrinsics-------------===*\
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+\*===----------------------------------------------------------------------===*/
+
+#ifndef __HTMXLINTRIN_H
+#define __HTMXLINTRIN_H
+
+#ifndef __HTM__
+#error "HTM instruction set not enabled"
+#endif
+
+#include <htmintrin.h>
+
+#ifdef __powerpc__
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define _TEXASR_PTR(TM_BUF) \
+  ((texasr_t *)((TM_BUF)+0))
+#define _TEXASRU_PTR(TM_BUF) \
+  ((texasru_t *)((TM_BUF)+0))
+#define _TEXASRL_PTR(TM_BUF) \
+  ((texasrl_t *)((TM_BUF)+4))
+#define _TFIAR_PTR(TM_BUF) \
+  ((tfiar_t *)((TM_BUF)+8))
+
+typedef char TM_buff_type[16];
+
+/* This macro can be used to determine whether a transaction was successfully
+   started by the __TM_begin() and __TM_simple_begin() intrinsic functions
+   below.  */
+#define _HTM_TBEGIN_STARTED     1
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_simple_begin (void)
+{
+  if (__builtin_expect (__builtin_tbegin (0), 1))
+    return _HTM_TBEGIN_STARTED;
+  return 0;
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_begin (void* const TM_buff)
+{
+  *_TEXASRL_PTR (TM_buff) = 0;
+  if (__builtin_expect (__builtin_tbegin (0), 1))
+    return _HTM_TBEGIN_STARTED;
+#ifdef __powerpc64__
+  *_TEXASR_PTR (TM_buff) = __builtin_get_texasr ();
+#else
+  *_TEXASRU_PTR (TM_buff) = __builtin_get_texasru ();
+  *_TEXASRL_PTR (TM_buff) = __builtin_get_texasr ();
+#endif
+  *_TFIAR_PTR (TM_buff) = __builtin_get_tfiar ();
+  return 0;
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_end (void)
+{
+  if (__builtin_expect (__builtin_tend (0), 1))
+    return 1;
+  return 0;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_abort (void)
+{
+  __builtin_tabort (0);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_named_abort (unsigned char const code)
+{
+  __builtin_tabort (code);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_resume (void)
+{
+  __builtin_tresume ();
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_suspend (void)
+{
+  __builtin_tsuspend ();
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_is_user_abort (void* const TM_buff)
+{
+  texasru_t texasru = *_TEXASRU_PTR (TM_buff);
+  return _TEXASRU_ABORT (texasru);
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_is_named_user_abort (void* const TM_buff, unsigned char *code)
+{
+  texasru_t texasru = *_TEXASRU_PTR (TM_buff);
+
+  *code = _TEXASRU_FAILURE_CODE (texasru);
+  return _TEXASRU_ABORT (texasru);
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_is_illegal (void* const TM_buff)
+{
+  texasru_t texasru = *_TEXASRU_PTR (TM_buff);
+  return _TEXASRU_DISALLOWED (texasru);
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_is_footprint_exceeded (void* const TM_buff)
+{
+  texasru_t texasru = *_TEXASRU_PTR (TM_buff);
+  return _TEXASRU_FOOTPRINT_OVERFLOW (texasru);
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_nesting_depth (void* const TM_buff)
+{
+  texasrl_t texasrl;
+
+  if (_HTM_STATE (__builtin_ttest ()) == _HTM_NONTRANSACTIONAL)
+    {
+      texasrl = *_TEXASRL_PTR (TM_buff);
+      if (!_TEXASR_FAILURE_SUMMARY (texasrl))
+        texasrl = 0;
+    }
+  else
+    texasrl = (texasrl_t) __builtin_get_texasr ();
+
+  return _TEXASR_TRANSACTION_LEVEL (texasrl);
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_is_nested_too_deep(void* const TM_buff)
+{
+  texasru_t texasru = *_TEXASRU_PTR (TM_buff);
+  return _TEXASRU_NESTING_OVERFLOW (texasru);
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_is_conflict(void* const TM_buff)
+{
+  texasru_t texasru = *_TEXASRU_PTR (TM_buff);
+  /* Return TEXASR bits 11 (Self-Induced Conflict) through
+     14 (Translation Invalidation Conflict).  */
+  return (_TEXASRU_EXTRACT_BITS (texasru, 14, 4)) ? 1 : 0;
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_is_failure_persistent(void* const TM_buff)
+{
+  texasru_t texasru = *_TEXASRU_PTR (TM_buff);
+  return _TEXASRU_FAILURE_PERSISTENT (texasru);
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_failure_address(void* const TM_buff)
+{
+  return *_TFIAR_PTR (TM_buff);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_failure_code(void* const TM_buff)
+{
+  return *_TEXASR_PTR (TM_buff);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __powerpc__ */
+
+#ifdef __s390__
+
+#include <stdint.h>
+
+/* These intrinsics are being made available for compatibility with
+   the IBM XL compiler.  For documentation please see the "z/OS XL
+   C/C++ Programming Guide" publicly available on the web.  */
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_simple_begin ()
+{
+  return __builtin_tbegin_nofloat (0);
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_begin (void* const tdb)
+{
+  return __builtin_tbegin_nofloat (tdb);
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_end ()
+{
+  return __builtin_tend ();
+}
+
+static __inline void __attribute__((__always_inline__))
+__TM_abort ()
+{
+  return __builtin_tabort (_HTM_FIRST_USER_ABORT_CODE);
+}
+
+static __inline void __attribute__((__always_inline__, __nodebug__))
+__TM_named_abort (unsigned char const code)
+{
+  return __builtin_tabort ((int)_HTM_FIRST_USER_ABORT_CODE + code);
+}
+
+static __inline void __attribute__((__always_inline__, __nodebug__))
+__TM_non_transactional_store (void* const addr, long long const value)
+{
+  __builtin_non_tx_store ((uint64_t*)addr, (uint64_t)value);
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_nesting_depth (void* const tdb_ptr)
+{
+  int depth = __builtin_tx_nesting_depth ();
+  struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr;
+
+  if (depth != 0)
+    return depth;
+
+  if (tdb->format != 1)
+    return 0;
+  return tdb->nesting_depth;
+}
+
+/* Transaction failure diagnostics */
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_is_user_abort (void* const tdb_ptr)
+{
+  struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr;
+
+  if (tdb->format != 1)
+    return 0;
+
+  return !!(tdb->abort_code >= _HTM_FIRST_USER_ABORT_CODE);
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_is_named_user_abort (void* const tdb_ptr, unsigned char* code)
+{
+  struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr;
+
+  if (tdb->format != 1)
+    return 0;
+
+  if (tdb->abort_code >= _HTM_FIRST_USER_ABORT_CODE)
+    {
+      *code = tdb->abort_code - _HTM_FIRST_USER_ABORT_CODE;
+      return 1;
+    }
+  return 0;
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_is_illegal (void* const tdb_ptr)
+{
+  struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr;
+
+  return (tdb->format == 1
+	  && (tdb->abort_code == 4 /* unfiltered program interruption */
+	      || tdb->abort_code == 11 /* restricted instruction */));
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_is_footprint_exceeded (void* const tdb_ptr)
+{
+  struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr;
+
+  return (tdb->format == 1
+	  && (tdb->abort_code == 7 /* fetch overflow */
+	      || tdb->abort_code == 8 /* store overflow */));
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_is_nested_too_deep (void* const tdb_ptr)
+{
+  struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr;
+
+  return tdb->format == 1 && tdb->abort_code == 13; /* depth exceeded */
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_is_conflict (void* const tdb_ptr)
+{
+  struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr;
+
+  return (tdb->format == 1
+	  && (tdb->abort_code == 9 /* fetch conflict */
+	      || tdb->abort_code == 10 /* store conflict */));
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_is_failure_persistent (long const result)
+{
+  return result == _HTM_TBEGIN_PERSISTENT;
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_failure_address (void* const tdb_ptr)
+{
+  struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr;
+  return tdb->atia;
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_failure_code (void* const tdb_ptr)
+{
+  struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr;
+
+  return tdb->abort_code;
+}
+
+#endif /* __s390__ */
+
+#endif /* __HTMXLINTRIN_H  */
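A short sketch (not part of the header) of the pattern described by the _HTM_TBEGIN_STARTED comment above, for PowerPC with -mhtm; bump_counter is an illustrative name and a real program would add a proper lock-based fallback.

    #include <htmxlintrin.h>

    long counter;

    void bump_counter(void)
    {
      TM_buff_type tm_buff;

      if (__TM_begin(tm_buff) == _HTM_TBEGIN_STARTED) {
        ++counter;                                /* transactional path */
        __TM_end();
      } else {
        /* In a real program: retry on transient failures, and take a lock when
           __TM_is_failure_persistent(tm_buff) reports a persistent one. */
        ++counter;
      }
    }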
diff --git a/25.0.2/clang-include/ia32intrin.h b/25.0.2/clang-include/ia32intrin.h
new file mode 100644
index 0000000..5adf3f1
--- /dev/null
+++ b/25.0.2/clang-include/ia32intrin.h
@@ -0,0 +1,101 @@
+/* ===-------- ia32intrin.h ---------------------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __X86INTRIN_H
+#error "Never use <ia32intrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __IA32INTRIN_H
+#define __IA32INTRIN_H
+
+#ifdef __x86_64__
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+__readeflags(void)
+{
+  unsigned long long __res = 0;
+  __asm__ __volatile__ ("pushf\n\t"
+                        "popq %0\n"
+                        :"=r"(__res)
+                        :
+                        :
+                       );
+  return __res;
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+__writeeflags(unsigned long long __f)
+{
+  __asm__ __volatile__ ("pushq %0\n\t"
+                        "popf\n"
+                        :
+                        :"r"(__f)
+                        :"flags"
+                       );
+}
+
+#else /* !__x86_64__ */
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+__readeflags(void)
+{
+  unsigned int __res = 0;
+  __asm__ __volatile__ ("pushf\n\t"
+                        "popl %0\n"
+                        :"=r"(__res)
+                        :
+                        :
+                       );
+  return __res;
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+__writeeflags(unsigned int __f)
+{
+  __asm__ __volatile__ ("pushl %0\n\t"
+                        "popf\n"
+                        :
+                        :"r"(__f)
+                        :"flags"
+                       );
+}
+#endif /* !__x86_64__ */
+
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+__rdpmc(int __A) {
+  return __builtin_ia32_rdpmc(__A);
+}
+
+/* __rdtsc */
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+__rdtsc(void) {
+  return __builtin_ia32_rdtsc();
+}
+
+/* __rdtscp */
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+__rdtscp(unsigned int *__A) {
+  return __builtin_ia32_rdtscp(__A);
+}
+
+#define _rdtsc() __rdtsc()
+
+#endif /* __IA32INTRIN_H */
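An illustrative timing helper (not part of the header) built on the TSC intrinsics declared above; cycles_for and work are placeholder names.

    #include <x86intrin.h>

    unsigned long long cycles_for(void (*work)(void))
    {
      unsigned int aux;
      unsigned long long start = __rdtsc();
      work();
      /* RDTSCP waits for earlier instructions to complete before reading the TSC. */
      unsigned long long end = __rdtscp(&aux);
      return end - start;
    }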
diff --git a/25.0.2/clang-include/immintrin.h b/25.0.2/clang-include/immintrin.h
new file mode 100644
index 0000000..f3c6d19
--- /dev/null
+++ b/25.0.2/clang-include/immintrin.h
@@ -0,0 +1,172 @@
+/*===---- immintrin.h - Intel intrinsics -----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#define __IMMINTRIN_H
+
+#include <mmintrin.h>
+
+#include <xmmintrin.h>
+
+#include <emmintrin.h>
+
+#include <pmmintrin.h>
+
+#include <tmmintrin.h>
+
+#include <smmintrin.h>
+
+#include <wmmintrin.h>
+
+#include <avxintrin.h>
+
+#include <avx2intrin.h>
+
+/* The 256-bit versions of functions in f16cintrin.h.
+   Intel documents these as being in immintrin.h, and
+   they depend on typedefs from avxintrin.h. */
+
+#define _mm256_cvtps_ph(a, imm) __extension__ ({ \
+ (__m128i)__builtin_ia32_vcvtps2ph256((__v8sf)(__m256)(a), (imm)); })
+
+static __inline __m256 __attribute__((__always_inline__, __nodebug__, __target__("f16c")))
+_mm256_cvtph_ps(__m128i __a)
+{
+  return (__m256)__builtin_ia32_vcvtph2ps256((__v8hi)__a);
+}
+
+#include <bmiintrin.h>
+
+#include <bmi2intrin.h>
+
+#include <lzcntintrin.h>
+
+#include <fmaintrin.h>
+
+#include <avx512fintrin.h>
+
+#include <avx512vlintrin.h>
+
+#include <avx512bwintrin.h>
+
+#include <avx512cdintrin.h>
+
+#include <avx512dqintrin.h>
+
+#include <avx512vlbwintrin.h>
+
+#include <avx512vldqintrin.h>
+
+#include <avx512erintrin.h>
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
+_rdrand16_step(unsigned short *__p)
+{
+  return __builtin_ia32_rdrand16_step(__p);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
+_rdrand32_step(unsigned int *__p)
+{
+  return __builtin_ia32_rdrand32_step(__p);
+}
+
+#ifdef __x86_64__
+static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
+_rdrand64_step(unsigned long long *__p)
+{
+  return __builtin_ia32_rdrand64_step(__p);
+}
+#endif
+
+#ifdef __x86_64__
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
+_readfsbase_u32(void)
+{
+  return __builtin_ia32_rdfsbase32();
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
+_readfsbase_u64(void)
+{
+  return __builtin_ia32_rdfsbase64();
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
+_readgsbase_u32(void)
+{
+  return __builtin_ia32_rdgsbase32();
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
+_readgsbase_u64(void)
+{
+  return __builtin_ia32_rdgsbase64();
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
+_writefsbase_u32(unsigned int __V)
+{
+  return __builtin_ia32_wrfsbase32(__V);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
+_writefsbase_u64(unsigned long long __V)
+{
+  return __builtin_ia32_wrfsbase64(__V);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
+_writegsbase_u32(unsigned int __V)
+{
+  return __builtin_ia32_wrgsbase32(__V);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
+_writegsbase_u64(unsigned long long __V)
+{
+  return __builtin_ia32_wrgsbase64(__V);
+}
+#endif
+
+#include <rtmintrin.h>
+
+#include <xtestintrin.h>
+
+#include <shaintrin.h>
+
+#include <fxsrintrin.h>
+
+#include <xsaveintrin.h>
+
+#include <xsaveoptintrin.h>
+
+#include <xsavecintrin.h>
+
+#include <xsavesintrin.h>
+
+/* Some intrinsics inside adxintrin.h are available only on processors with ADX,
+ * whereas others are available on all processors. */
+#include <adxintrin.h>
+
+#endif /* __IMMINTRIN_H */
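A small sketch (not part of the header) exercising the 256-bit F16C helpers documented in the comment above; roundtrip_ps is an illustrative name and the file is compiled with -mf16c.

    #include <immintrin.h>

    __m256 roundtrip_ps(__m256 v)
    {
      __m128i half = _mm256_cvtps_ph(v, 0);   /* 0 = round to nearest even */
      return _mm256_cvtph_ps(half);           /* widen back to single precision */
    }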
diff --git a/25.0.2/clang-include/inttypes.h b/25.0.2/clang-include/inttypes.h
new file mode 100644
index 0000000..3d59d14
--- /dev/null
+++ b/25.0.2/clang-include/inttypes.h
@@ -0,0 +1,102 @@
+/*===---- inttypes.h - Standard header for integer printf macros ----------===*\
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+\*===----------------------------------------------------------------------===*/
+
+#ifndef __CLANG_INTTYPES_H
+#define __CLANG_INTTYPES_H
+
+#include_next <inttypes.h>
+
+#if defined(_MSC_VER) && _MSC_VER < 1900
+/* MSVC headers define int32_t as int, but PRIx32 as "lx" instead of "x".
+ * This triggers format warnings, so fix it up here. */
+#undef PRId32
+#undef PRIdLEAST32
+#undef PRIdFAST32
+#undef PRIi32
+#undef PRIiLEAST32
+#undef PRIiFAST32
+#undef PRIo32
+#undef PRIoLEAST32
+#undef PRIoFAST32
+#undef PRIu32
+#undef PRIuLEAST32
+#undef PRIuFAST32
+#undef PRIx32
+#undef PRIxLEAST32
+#undef PRIxFAST32
+#undef PRIX32
+#undef PRIXLEAST32
+#undef PRIXFAST32
+
+#undef SCNd32
+#undef SCNdLEAST32
+#undef SCNdFAST32
+#undef SCNi32
+#undef SCNiLEAST32
+#undef SCNiFAST32
+#undef SCNo32
+#undef SCNoLEAST32
+#undef SCNoFAST32
+#undef SCNu32
+#undef SCNuLEAST32
+#undef SCNuFAST32
+#undef SCNx32
+#undef SCNxLEAST32
+#undef SCNxFAST32
+
+#define PRId32 "d"
+#define PRIdLEAST32 "d"
+#define PRIdFAST32 "d"
+#define PRIi32 "i"
+#define PRIiLEAST32 "i"
+#define PRIiFAST32 "i"
+#define PRIo32 "o"
+#define PRIoLEAST32 "o"
+#define PRIoFAST32 "o"
+#define PRIu32 "u"
+#define PRIuLEAST32 "u"
+#define PRIuFAST32 "u"
+#define PRIx32 "x"
+#define PRIxLEAST32 "x"
+#define PRIxFAST32 "x"
+#define PRIX32 "X"
+#define PRIXLEAST32 "X"
+#define PRIXFAST32 "X"
+
+#define SCNd32 "d"
+#define SCNdLEAST32 "d"
+#define SCNdFAST32 "d"
+#define SCNi32 "i"
+#define SCNiLEAST32 "i"
+#define SCNiFAST32 "i"
+#define SCNo32 "o"
+#define SCNoLEAST32 "o"
+#define SCNoFAST32 "o"
+#define SCNu32 "u"
+#define SCNuLEAST32 "u"
+#define SCNuFAST32 "u"
+#define SCNx32 "x"
+#define SCNxLEAST32 "x"
+#define SCNxFAST32 "x"
+#endif
+
+#endif /* __CLANG_INTTYPES_H */
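An illustrative use (not part of the header) of the fixed-width format macros patched up above; show is a placeholder name.

    #include <inttypes.h>
    #include <stdio.h>

    void show(int32_t v, uint32_t u)
    {
      printf("signed: %" PRId32 "  hex: 0x%" PRIX32 "\n", v, u);
    }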
diff --git a/25.0.2/clang-include/iso646.h b/25.0.2/clang-include/iso646.h
new file mode 100644
index 0000000..dca13c5
--- /dev/null
+++ b/25.0.2/clang-include/iso646.h
@@ -0,0 +1,43 @@
+/*===---- iso646.h - Standard header for alternate spellings of operators---===
+ *
+ * Copyright (c) 2008 Eli Friedman
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __ISO646_H
+#define __ISO646_H
+
+#ifndef __cplusplus
+#define and    &&
+#define and_eq &=
+#define bitand &
+#define bitor  |
+#define compl  ~
+#define not    !
+#define not_eq !=
+#define or     ||
+#define or_eq  |=
+#define xor    ^
+#define xor_eq ^=
+#endif
+
+#endif /* __ISO646_H */
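A trivial illustration (not part of the header) of the alternate operator spellings in C; in C++ they are keywords, so the macros above are not defined there.

    #include <iso646.h>

    int in_range(int x, int lo, int hi)
    {
      return x >= lo and x <= hi;
    }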
diff --git a/25.0.2/clang-include/limits.h b/25.0.2/clang-include/limits.h
new file mode 100644
index 0000000..f04187c
--- /dev/null
+++ b/25.0.2/clang-include/limits.h
@@ -0,0 +1,118 @@
+/*===---- limits.h - Standard header for integer sizes --------------------===*\
+ *
+ * Copyright (c) 2009 Chris Lattner
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+\*===----------------------------------------------------------------------===*/
+
+#ifndef __CLANG_LIMITS_H
+#define __CLANG_LIMITS_H
+
+/* The system's limits.h may, in turn, try to #include_next GCC's limits.h.
+   Avert this #include_next madness. */
+#if defined __GNUC__ && !defined _GCC_LIMITS_H_
+#define _GCC_LIMITS_H_
+#endif
+
+/* System headers include a number of constants from POSIX in <limits.h>.
+   Include it if we're hosted. */
+#if __STDC_HOSTED__ && __has_include_next(<limits.h>)
+#include_next <limits.h>
+#endif
+
+/* Many system headers try to "help us out" by defining these.  No really, we
+   know how big each datatype is. */
+#undef  SCHAR_MIN
+#undef  SCHAR_MAX
+#undef  UCHAR_MAX
+#undef  SHRT_MIN
+#undef  SHRT_MAX
+#undef  USHRT_MAX
+#undef  INT_MIN
+#undef  INT_MAX
+#undef  UINT_MAX
+#undef  LONG_MIN
+#undef  LONG_MAX
+#undef  ULONG_MAX
+
+#undef  CHAR_BIT
+#undef  CHAR_MIN
+#undef  CHAR_MAX
+
+/* C90/99 5.2.4.2.1 */
+#define SCHAR_MAX __SCHAR_MAX__
+#define SHRT_MAX  __SHRT_MAX__
+#define INT_MAX   __INT_MAX__
+#define LONG_MAX  __LONG_MAX__
+
+#define SCHAR_MIN (-__SCHAR_MAX__-1)
+#define SHRT_MIN  (-__SHRT_MAX__ -1)
+#define INT_MIN   (-__INT_MAX__  -1)
+#define LONG_MIN  (-__LONG_MAX__ -1L)
+
+#define UCHAR_MAX (__SCHAR_MAX__*2  +1)
+#define USHRT_MAX (__SHRT_MAX__ *2  +1)
+#define UINT_MAX  (__INT_MAX__  *2U +1U)
+#define ULONG_MAX (__LONG_MAX__ *2UL+1UL)
+
+#ifndef MB_LEN_MAX
+#define MB_LEN_MAX 1
+#endif
+
+#define CHAR_BIT  __CHAR_BIT__
+
+#ifdef __CHAR_UNSIGNED__  /* -funsigned-char */
+#define CHAR_MIN 0
+#define CHAR_MAX UCHAR_MAX
+#else
+#define CHAR_MIN SCHAR_MIN
+#define CHAR_MAX __SCHAR_MAX__
+#endif
+
+/* C99 5.2.4.2.1: Added long long.
+   C++11 18.3.3.2: same contents as the Standard C Library header <limits.h>.
+ */
+#if __STDC_VERSION__ >= 199901L || __cplusplus >= 201103L
+
+#undef  LLONG_MIN
+#undef  LLONG_MAX
+#undef  ULLONG_MAX
+
+#define LLONG_MAX  __LONG_LONG_MAX__
+#define LLONG_MIN  (-__LONG_LONG_MAX__-1LL)
+#define ULLONG_MAX (__LONG_LONG_MAX__*2ULL+1ULL)
+#endif
+
+/* LONG_LONG_MIN/LONG_LONG_MAX/ULONG_LONG_MAX are a GNU extension.  It's too bad
+   that we don't have something like #pragma poison that could be used to
+   deprecate a macro - the code should just use LLONG_MAX and friends.
+ */
+#if defined(__GNU_LIBRARY__) ? defined(__USE_GNU) : !defined(__STRICT_ANSI__)
+
+#undef   LONG_LONG_MIN
+#undef   LONG_LONG_MAX
+#undef   ULONG_LONG_MAX
+
+#define LONG_LONG_MAX  __LONG_LONG_MAX__
+#define LONG_LONG_MIN  (-__LONG_LONG_MAX__-1LL)
+#define ULONG_LONG_MAX (__LONG_LONG_MAX__*2ULL+1ULL)
+#endif
+
+#endif /* __CLANG_LIMITS_H */
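A quick check (not part of the header) printing a few of the limits defined above.

    #include <limits.h>
    #include <stdio.h>

    int main(void)
    {
      printf("CHAR_BIT=%d INT_MAX=%d LONG_MAX=%ld\n", CHAR_BIT, INT_MAX, LONG_MAX);
      return 0;
    }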
diff --git a/25.0.2/clang-include/lzcntintrin.h b/25.0.2/clang-include/lzcntintrin.h
new file mode 100644
index 0000000..4c00e42
--- /dev/null
+++ b/25.0.2/clang-include/lzcntintrin.h
@@ -0,0 +1,68 @@
+/*===---- lzcntintrin.h - LZCNT intrinsics ---------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
+#error "Never use <lzcntintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __LZCNTINTRIN_H
+#define __LZCNTINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("lzcnt")))
+
+static __inline__ unsigned short __DEFAULT_FN_ATTRS
+__lzcnt16(unsigned short __X)
+{
+  return __X ? __builtin_clzs(__X) : 16;
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__lzcnt32(unsigned int __X)
+{
+  return __X ? __builtin_clz(__X) : 32;
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_lzcnt_u32(unsigned int __X)
+{
+  return __X ? __builtin_clz(__X) : 32;
+}
+
+#ifdef __x86_64__
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__lzcnt64(unsigned long long __X)
+{
+  return __X ? __builtin_clzll(__X) : 64;
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_lzcnt_u64(unsigned long long __X)
+{
+  return __X ? __builtin_clzll(__X) : 64;
+}
+#endif
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __LZCNTINTRIN_H */
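An illustrative helper (not part of the header) built on _lzcnt_u32, whose zero-input result of 32 (see above) keeps the expression well defined; bits_needed is a placeholder name and the file is compiled with -mlzcnt.

    #include <x86intrin.h>

    unsigned int bits_needed(unsigned int v)
    {
      return 32 - _lzcnt_u32(v);   /* 0 when v == 0, since _lzcnt_u32(0) == 32 */
    }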
diff --git a/25.0.2/clang-include/mm3dnow.h b/25.0.2/clang-include/mm3dnow.h
new file mode 100644
index 0000000..cb93faf
--- /dev/null
+++ b/25.0.2/clang-include/mm3dnow.h
@@ -0,0 +1,171 @@
+/*===---- mm3dnow.h - 3DNow! intrinsics ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef _MM3DNOW_H_INCLUDED
+#define _MM3DNOW_H_INCLUDED
+
+#include <mmintrin.h>
+#include <prfchwintrin.h>
+
+typedef float __v2sf __attribute__((__vector_size__(8)));
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("3dnow")))
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_m_femms() {
+  __builtin_ia32_femms();
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pavgusb(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pavgusb((__v8qi)__m1, (__v8qi)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pf2id(__m64 __m) {
+  return (__m64)__builtin_ia32_pf2id((__v2sf)__m);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfacc(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfacc((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfadd(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfadd((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfcmpeq(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfcmpeq((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfcmpge(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfcmpge((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfcmpgt(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfcmpgt((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfmax(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfmax((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfmin(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfmin((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfmul(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfmul((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfrcp(__m64 __m) {
+  return (__m64)__builtin_ia32_pfrcp((__v2sf)__m);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfrcpit1(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfrcpit1((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfrcpit2(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfrcpit2((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfrsqrt(__m64 __m) {
+  return (__m64)__builtin_ia32_pfrsqrt((__v2sf)__m);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfrsqrtit1(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfrsqit1((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfsub(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfsub((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfsubr(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfsubr((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pi2fd(__m64 __m) {
+  return (__m64)__builtin_ia32_pi2fd((__v2si)__m);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pmulhrw(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pmulhrw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+/* Handle the 3dnowa instructions here. */
+#undef __DEFAULT_FN_ATTRS
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("3dnowa")))
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pf2iw(__m64 __m) {
+  return (__m64)__builtin_ia32_pf2iw((__v2sf)__m);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfnacc(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfnacc((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfpnacc(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfpnacc((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pi2fw(__m64 __m) {
+  return (__m64)__builtin_ia32_pi2fw((__v2si)__m);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pswapdsf(__m64 __m) {
+  return (__m64)__builtin_ia32_pswapdsf((__v2sf)__m);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pswapdsi(__m64 __m) {
+  return (__m64)__builtin_ia32_pswapdsi((__v2si)__m);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
diff --git a/25.0.2/clang-include/mm_malloc.h b/25.0.2/clang-include/mm_malloc.h
new file mode 100644
index 0000000..305afd3
--- /dev/null
+++ b/25.0.2/clang-include/mm_malloc.h
@@ -0,0 +1,75 @@
+/*===---- mm_malloc.h - Allocating and Freeing Aligned Memory Blocks -------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __MM_MALLOC_H
+#define __MM_MALLOC_H
+
+#include <stdlib.h>
+
+#ifdef _WIN32
+#include <malloc.h>
+#else
+#ifndef __cplusplus
+extern int posix_memalign(void **__memptr, size_t __alignment, size_t __size);
+#else
+// Some systems (e.g. those with GNU libc) declare posix_memalign with an
+// exception specifier. Via an "egregious workaround" in
+// Sema::CheckEquivalentExceptionSpec, Clang accepts the following as a valid
+// redeclaration of glibc's declaration.
+extern "C" int posix_memalign(void **__memptr, size_t __alignment, size_t __size);
+#endif
+#endif
+
+#if !(defined(_WIN32) && defined(_mm_malloc))
+static __inline__ void *__attribute__((__always_inline__, __nodebug__,
+                                       __malloc__))
+_mm_malloc(size_t __size, size_t __align)
+{
+  if (__align == 1) {
+    return malloc(__size);
+  }
+
+  if (!(__align & (__align - 1)) && __align < sizeof(void *))
+    __align = sizeof(void *);
+
+  void *__mallocedMemory;
+#if defined(__MINGW32__)
+  __mallocedMemory = __mingw_aligned_malloc(__size, __align);
+#elif defined(_WIN32)
+  __mallocedMemory = _aligned_malloc(__size, __align);
+#else
+  if (posix_memalign(&__mallocedMemory, __align, __size))
+    return 0;
+#endif
+
+  return __mallocedMemory;
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_free(void *__p)
+{
+  free(__p);
+}
+#endif
+
+#endif /* __MM_MALLOC_H */
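An illustrative allocation helper (not part of the header) using _mm_malloc and _mm_free for a 32-byte-aligned buffer suitable for AVX loads and stores; alloc_avx_buffer is a placeholder name.

    #include <mm_malloc.h>

    float *alloc_avx_buffer(size_t n)
    {
      /* 32-byte alignment matches __m256 loads/stores; release with _mm_free(). */
      return (float *)_mm_malloc(n * sizeof(float), 32);
    }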
diff --git a/25.0.2/clang-include/mmintrin.h b/25.0.2/clang-include/mmintrin.h
new file mode 100644
index 0000000..162cb1a
--- /dev/null
+++ b/25.0.2/clang-include/mmintrin.h
@@ -0,0 +1,503 @@
+/*===---- mmintrin.h - MMX intrinsics --------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __MMINTRIN_H
+#define __MMINTRIN_H
+
+typedef long long __m64 __attribute__((__vector_size__(8)));
+
+typedef int __v2si __attribute__((__vector_size__(8)));
+typedef short __v4hi __attribute__((__vector_size__(8)));
+typedef char __v8qi __attribute__((__vector_size__(8)));
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("mmx")))
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_empty(void)
+{
+    __builtin_ia32_emms();
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cvtsi32_si64(int __i)
+{
+    return (__m64)__builtin_ia32_vec_init_v2si(__i, 0);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_cvtsi64_si32(__m64 __m)
+{
+    return __builtin_ia32_vec_ext_v2si((__v2si)__m, 0);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cvtsi64_m64(long long __i)
+{
+    return (__m64)__i;
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm_cvtm64_si64(__m64 __m)
+{
+    return (long long)__m;
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_packs_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_packsswb((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_packs_pi32(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_packssdw((__v2si)__m1, (__v2si)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_packs_pu16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_packuswb((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_unpackhi_pi8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_punpckhbw((__v8qi)__m1, (__v8qi)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_punpckhwd((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_unpackhi_pi32(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_punpckhdq((__v2si)__m1, (__v2si)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_punpcklbw((__v8qi)__m1, (__v8qi)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_punpcklwd((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_unpacklo_pi32(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_punpckldq((__v2si)__m1, (__v2si)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_add_pi8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_paddb((__v8qi)__m1, (__v8qi)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_add_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_paddw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_add_pi32(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_paddd((__v2si)__m1, (__v2si)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_adds_pi8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_paddsb((__v8qi)__m1, (__v8qi)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_adds_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_paddsw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_adds_pu8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_paddusb((__v8qi)__m1, (__v8qi)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_adds_pu16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_paddusw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_sub_pi8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_psubb((__v8qi)__m1, (__v8qi)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_sub_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_psubw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_sub_pi32(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_psubd((__v2si)__m1, (__v2si)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_subs_pi8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_psubsb((__v8qi)__m1, (__v8qi)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_subs_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_psubsw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_subs_pu8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_psubusb((__v8qi)__m1, (__v8qi)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_subs_pu16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_psubusw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_madd_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_pmaddwd((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_mulhi_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_pmulhw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_mullo_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_pmullw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_sll_pi16(__m64 __m, __m64 __count)
+{
+    return (__m64)__builtin_ia32_psllw((__v4hi)__m, __count);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_slli_pi16(__m64 __m, int __count)
+{
+    return (__m64)__builtin_ia32_psllwi((__v4hi)__m, __count);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_sll_pi32(__m64 __m, __m64 __count)
+{
+    return (__m64)__builtin_ia32_pslld((__v2si)__m, __count);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_slli_pi32(__m64 __m, int __count)
+{
+    return (__m64)__builtin_ia32_pslldi((__v2si)__m, __count);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_sll_si64(__m64 __m, __m64 __count)
+{
+    return (__m64)__builtin_ia32_psllq(__m, __count);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_slli_si64(__m64 __m, int __count)
+{
+    return (__m64)__builtin_ia32_psllqi(__m, __count);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_sra_pi16(__m64 __m, __m64 __count)
+{
+    return (__m64)__builtin_ia32_psraw((__v4hi)__m, __count);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_srai_pi16(__m64 __m, int __count)
+{
+    return (__m64)__builtin_ia32_psrawi((__v4hi)__m, __count);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_sra_pi32(__m64 __m, __m64 __count)
+{
+    return (__m64)__builtin_ia32_psrad((__v2si)__m, __count);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_srai_pi32(__m64 __m, int __count)
+{
+    return (__m64)__builtin_ia32_psradi((__v2si)__m, __count);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_srl_pi16(__m64 __m, __m64 __count)
+{
+    return (__m64)__builtin_ia32_psrlw((__v4hi)__m, __count);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_srli_pi16(__m64 __m, int __count)
+{
+    return (__m64)__builtin_ia32_psrlwi((__v4hi)__m, __count);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_srl_pi32(__m64 __m, __m64 __count)
+{
+    return (__m64)__builtin_ia32_psrld((__v2si)__m, __count);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_srli_pi32(__m64 __m, int __count)
+{
+    return (__m64)__builtin_ia32_psrldi((__v2si)__m, __count);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_srl_si64(__m64 __m, __m64 __count)
+{
+    return (__m64)__builtin_ia32_psrlq(__m, __count);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_srli_si64(__m64 __m, int __count)
+{
+    return (__m64)__builtin_ia32_psrlqi(__m, __count);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_and_si64(__m64 __m1, __m64 __m2)
+{
+    return __builtin_ia32_pand(__m1, __m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_andnot_si64(__m64 __m1, __m64 __m2)
+{
+    return __builtin_ia32_pandn(__m1, __m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_or_si64(__m64 __m1, __m64 __m2)
+{
+    return __builtin_ia32_por(__m1, __m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_xor_si64(__m64 __m1, __m64 __m2)
+{
+    return __builtin_ia32_pxor(__m1, __m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_pcmpeqb((__v8qi)__m1, (__v8qi)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_pcmpeqw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_pcmpeqd((__v2si)__m1, (__v2si)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_pcmpgtb((__v8qi)__m1, (__v8qi)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_pcmpgtw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_pcmpgtd((__v2si)__m1, (__v2si)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_setzero_si64(void)
+{
+    return (__m64){ 0LL };
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_set_pi32(int __i1, int __i0)
+{
+    return (__m64)__builtin_ia32_vec_init_v2si(__i0, __i1);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_set_pi16(short __s3, short __s2, short __s1, short __s0)
+{
+    return (__m64)__builtin_ia32_vec_init_v4hi(__s0, __s1, __s2, __s3);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2,
+            char __b1, char __b0)
+{
+    return (__m64)__builtin_ia32_vec_init_v8qi(__b0, __b1, __b2, __b3,
+                                               __b4, __b5, __b6, __b7);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_set1_pi32(int __i)
+{
+    return _mm_set_pi32(__i, __i);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_set1_pi16(short __w)
+{
+    return _mm_set_pi16(__w, __w, __w, __w);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_set1_pi8(char __b)
+{
+    return _mm_set_pi8(__b, __b, __b, __b, __b, __b, __b, __b);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_setr_pi32(int __i0, int __i1)
+{
+    return _mm_set_pi32(__i1, __i0);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_setr_pi16(short __w0, short __w1, short __w2, short __w3)
+{
+    return _mm_set_pi16(__w3, __w2, __w1, __w0);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5,
+             char __b6, char __b7)
+{
+    return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+/* Aliases for compatibility. */
+#define _m_empty _mm_empty
+#define _m_from_int _mm_cvtsi32_si64
+#define _m_from_int64 _mm_cvtsi64_m64
+#define _m_to_int _mm_cvtsi64_si32
+#define _m_to_int64 _mm_cvtm64_si64
+#define _m_packsswb _mm_packs_pi16
+#define _m_packssdw _mm_packs_pi32
+#define _m_packuswb _mm_packs_pu16
+#define _m_punpckhbw _mm_unpackhi_pi8
+#define _m_punpckhwd _mm_unpackhi_pi16
+#define _m_punpckhdq _mm_unpackhi_pi32
+#define _m_punpcklbw _mm_unpacklo_pi8
+#define _m_punpcklwd _mm_unpacklo_pi16
+#define _m_punpckldq _mm_unpacklo_pi32
+#define _m_paddb _mm_add_pi8
+#define _m_paddw _mm_add_pi16
+#define _m_paddd _mm_add_pi32
+#define _m_paddsb _mm_adds_pi8
+#define _m_paddsw _mm_adds_pi16
+#define _m_paddusb _mm_adds_pu8
+#define _m_paddusw _mm_adds_pu16
+#define _m_psubb _mm_sub_pi8
+#define _m_psubw _mm_sub_pi16
+#define _m_psubd _mm_sub_pi32
+#define _m_psubsb _mm_subs_pi8
+#define _m_psubsw _mm_subs_pi16
+#define _m_psubusb _mm_subs_pu8
+#define _m_psubusw _mm_subs_pu16
+#define _m_pmaddwd _mm_madd_pi16
+#define _m_pmulhw _mm_mulhi_pi16
+#define _m_pmullw _mm_mullo_pi16
+#define _m_psllw _mm_sll_pi16
+#define _m_psllwi _mm_slli_pi16
+#define _m_pslld _mm_sll_pi32
+#define _m_pslldi _mm_slli_pi32
+#define _m_psllq _mm_sll_si64
+#define _m_psllqi _mm_slli_si64
+#define _m_psraw _mm_sra_pi16
+#define _m_psrawi _mm_srai_pi16
+#define _m_psrad _mm_sra_pi32
+#define _m_psradi _mm_srai_pi32
+#define _m_psrlw _mm_srl_pi16
+#define _m_psrlwi _mm_srli_pi16
+#define _m_psrld _mm_srl_pi32
+#define _m_psrldi _mm_srli_pi32
+#define _m_psrlq _mm_srl_si64
+#define _m_psrlqi _mm_srli_si64
+#define _m_pand _mm_and_si64
+#define _m_pandn _mm_andnot_si64
+#define _m_por _mm_or_si64
+#define _m_pxor _mm_xor_si64
+#define _m_pcmpeqb _mm_cmpeq_pi8
+#define _m_pcmpeqw _mm_cmpeq_pi16
+#define _m_pcmpeqd _mm_cmpeq_pi32
+#define _m_pcmpgtb _mm_cmpgt_pi8
+#define _m_pcmpgtw _mm_cmpgt_pi16
+#define _m_pcmpgtd _mm_cmpgt_pi32
+
+#endif /* __MMINTRIN_H */
+
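A minimal usage sketch for the packed MMX intrinsics declared above (hypothetical example; the function name and values are illustrative, and an x86 target with MMX support is assumed):

    #include <mmintrin.h>

    int add_packed_words(void)
    {
        __m64 a = _mm_set_pi16(4, 3, 2, 1);  /* lanes 0..3 hold 1, 2, 3, 4  */
        __m64 b = _mm_set1_pi16(10);         /* broadcast 10 into all lanes */
        __m64 c = _mm_add_pi16(a, b);        /* lane-wise 16-bit addition   */
        int lo = _mm_cvtsi64_si32(c);        /* low 32 bits: lanes 0 and 1  */
        _mm_empty();                         /* clear MMX state (EMMS)      */
        return lo;
    }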
diff --git a/25.0.2/clang-include/module.modulemap b/25.0.2/clang-include/module.modulemap
new file mode 100644
index 0000000..b147e89
--- /dev/null
+++ b/25.0.2/clang-include/module.modulemap
@@ -0,0 +1,171 @@
+module _Builtin_intrinsics [system] [extern_c] {
+  explicit module altivec {
+    requires altivec
+    header "altivec.h"
+  }
+
+  explicit module arm {
+    requires arm
+
+    explicit module acle {
+      header "arm_acle.h"
+      export *
+    }
+
+    explicit module neon {
+      requires neon
+      header "arm_neon.h"
+      export *
+    }
+  }
+
+  explicit module intel {
+    requires x86
+    export *
+
+    header "immintrin.h"
+    header "x86intrin.h"
+
+    explicit module mm_malloc {
+      header "mm_malloc.h"
+      export * // note: for <stdlib.h> dependency
+    }
+
+    explicit module cpuid {
+      header "cpuid.h"
+    }
+
+    explicit module mmx {
+      header "mmintrin.h"
+    }
+
+    explicit module f16c {
+      header "f16cintrin.h"
+    }
+
+    explicit module sse {
+      export mmx
+      export sse2 // note: for hackish <emmintrin.h> dependency
+      header "xmmintrin.h"
+    }
+
+    explicit module sse2 {
+      export sse
+      header "emmintrin.h"
+    }
+
+    explicit module sse3 {
+      export sse2
+      header "pmmintrin.h"
+    }
+
+    explicit module ssse3 {
+      export sse3
+      header "tmmintrin.h"
+    }
+
+    explicit module sse4_1 {
+      export ssse3
+      header "smmintrin.h"
+    }
+
+    explicit module sse4_2 {
+      export sse4_1
+      header "nmmintrin.h"
+    }
+
+    explicit module sse4a {
+      export sse3
+      header "ammintrin.h"
+    }
+
+    explicit module avx {
+      export sse4_2
+      header "avxintrin.h"
+    }
+
+    explicit module avx2 {
+      export avx
+      header "avx2intrin.h"
+    }
+
+    explicit module avx512f {
+      export avx2
+      header "avx512fintrin.h"
+    }
+
+    explicit module avx512er {
+      header "avx512erintrin.h"
+    }
+
+    explicit module bmi {
+      header "bmiintrin.h"
+    }
+
+    explicit module bmi2 {
+      header "bmi2intrin.h"
+    }
+
+    explicit module fma {
+      header "fmaintrin.h"
+    }
+
+    explicit module fma4 {
+      export sse3
+      header "fma4intrin.h"
+    }
+
+    explicit module lzcnt {
+      header "lzcntintrin.h"
+    }
+
+    explicit module popcnt {
+      header "popcntintrin.h"
+    }
+
+    explicit module mm3dnow {
+      header "mm3dnow.h"
+    }
+
+    explicit module xop {
+      export fma4
+      header "xopintrin.h"
+    }
+
+    explicit module aes_pclmul {
+      header "wmmintrin.h"
+      export aes
+      export pclmul
+    }
+
+    explicit module aes {
+      header "__wmmintrin_aes.h"
+    }
+
+    explicit module pclmul {
+      header "__wmmintrin_pclmul.h"
+    }
+  }
+
+  explicit module systemz {
+    requires systemz
+    export *
+
+    header "s390intrin.h"
+
+    explicit module htm {
+      requires htm
+      header "htmintrin.h"
+      header "htmxlintrin.h"
+    }
+
+    explicit module zvector {
+      requires zvector, vx
+      header "vecintrin.h"
+    }
+  }
+}
+
+module _Builtin_stddef_max_align_t [system] [extern_c] {
+  header "__stddef_max_align_t.h"
+}
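With -fmodules enabled, including one of these builtin headers imports the corresponding submodule declared above; a minimal sketch (the file name and build line are hypothetical):

    /* use_sse2.c -- built with: clang -fmodules -msse2 -c use_sse2.c */
    #include <emmintrin.h>   /* maps to _Builtin_intrinsics.intel.sse2 */

    __m128i add_lanes(__m128i a, __m128i b)
    {
        return _mm_add_epi32(a, b);
    }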
diff --git a/25.0.2/clang-include/nmmintrin.h b/25.0.2/clang-include/nmmintrin.h
new file mode 100644
index 0000000..57fec15
--- /dev/null
+++ b/25.0.2/clang-include/nmmintrin.h
@@ -0,0 +1,30 @@
+/*===---- nmmintrin.h - SSE4 intrinsics ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef _NMMINTRIN_H
+#define _NMMINTRIN_H
+
+/* To match the expectations of gcc, the SSE4.2 definitions live in smmintrin.h,
+   so we simply include it here.  */
+#include <smmintrin.h>
+#endif /* _NMMINTRIN_H */
diff --git a/25.0.2/clang-include/pmmintrin.h b/25.0.2/clang-include/pmmintrin.h
new file mode 100644
index 0000000..0ff9409
--- /dev/null
+++ b/25.0.2/clang-include/pmmintrin.h
@@ -0,0 +1,116 @@
+/*===---- pmmintrin.h - SSE3 intrinsics ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __PMMINTRIN_H
+#define __PMMINTRIN_H
+
+#include <emmintrin.h>
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse3")))
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_lddqu_si128(__m128i const *__p)
+{
+  return (__m128i)__builtin_ia32_lddqu((char const *)__p);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_addsub_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_addsubps(__a, __b);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_hadd_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_haddps(__a, __b);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_hsub_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_hsubps(__a, __b);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_movehdup_ps(__m128 __a)
+{
+  return __builtin_shufflevector(__a, __a, 1, 1, 3, 3);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_moveldup_ps(__m128 __a)
+{
+  return __builtin_shufflevector(__a, __a, 0, 0, 2, 2);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_addsub_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_addsubpd(__a, __b);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_hadd_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_haddpd(__a, __b);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_hsub_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_hsubpd(__a, __b);
+}
+
+#define        _mm_loaddup_pd(dp)        _mm_load1_pd(dp)
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_movedup_pd(__m128d __a)
+{
+  return __builtin_shufflevector(__a, __a, 0, 0);
+}
+
+#define _MM_DENORMALS_ZERO_ON   (0x0040)
+#define _MM_DENORMALS_ZERO_OFF  (0x0000)
+
+#define _MM_DENORMALS_ZERO_MASK (0x0040)
+
+#define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)
+#define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_monitor(void const *__p, unsigned __extensions, unsigned __hints)
+{
+  __builtin_ia32_monitor((void *)__p, __extensions, __hints);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mwait(unsigned __extensions, unsigned __hints)
+{
+  __builtin_ia32_mwait(__extensions, __hints);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __PMMINTRIN_H */
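A minimal usage sketch for the SSE3 horizontal-add intrinsics above (hypothetical example; assumes compilation with -msse3):

    #include <pmmintrin.h>

    float sum4(__m128 v)
    {
        __m128 t = _mm_hadd_ps(v, v);  /* {v0+v1, v2+v3, v0+v1, v2+v3}     */
        t = _mm_hadd_ps(t, t);         /* every lane now holds v0+v1+v2+v3 */
        return _mm_cvtss_f32(t);       /* extract lane 0                   */
    }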
diff --git a/25.0.2/clang-include/popcntintrin.h b/25.0.2/clang-include/popcntintrin.h
new file mode 100644
index 0000000..6fcda65
--- /dev/null
+++ b/25.0.2/clang-include/popcntintrin.h
@@ -0,0 +1,58 @@
+/*===---- popcntintrin.h - POPCNT intrinsics -------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef _POPCNTINTRIN_H
+#define _POPCNTINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("popcnt")))
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_popcnt_u32(unsigned int __A)
+{
+  return __builtin_popcount(__A);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_popcnt32(int __A)
+{
+  return __builtin_popcount(__A);
+}
+
+#ifdef __x86_64__
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm_popcnt_u64(unsigned long long __A)
+{
+  return __builtin_popcountll(__A);
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS
+_popcnt64(long long __A)
+{
+  return __builtin_popcountll(__A);
+}
+#endif /* __x86_64__ */
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* _POPCNTINTRIN_H */
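A minimal usage sketch (hypothetical example; assumes compilation with -mpopcnt):

    #include <popcntintrin.h>

    int count_set_bits(unsigned int mask)
    {
        return _mm_popcnt_u32(mask);   /* e.g. 0xF0F0 -> 8 */
    }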
diff --git a/25.0.2/clang-include/prfchwintrin.h b/25.0.2/clang-include/prfchwintrin.h
new file mode 100644
index 0000000..ba02857
--- /dev/null
+++ b/25.0.2/clang-include/prfchwintrin.h
@@ -0,0 +1,45 @@
+/*===---- prfchwintrin.h - PREFETCHW intrinsic -----------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined(__X86INTRIN_H) && !defined(_MM3DNOW_H_INCLUDED)
+#error "Never use <prfchwintrin.h> directly; include <x86intrin.h> or <mm3dnow.h> instead."
+#endif
+
+#ifndef __PRFCHWINTRIN_H
+#define __PRFCHWINTRIN_H
+
+#if defined(__PRFCHW__) || defined(__3dNOW__)
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_m_prefetch(void *__P)
+{
+  __builtin_prefetch (__P, 0, 3 /* _MM_HINT_T0 */);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_m_prefetchw(void *__P)
+{
+  __builtin_prefetch (__P, 1, 3 /* _MM_HINT_T0 */);
+}
+#endif
+
+#endif /* __PRFCHWINTRIN_H */
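A minimal usage sketch for the prefetch intrinsics above (hypothetical example; assumes compilation with -mprfchw so the feature guard above is satisfied):

    #include <x86intrin.h>

    void bump_all(int *a, int n)
    {
        for (int i = 0; i < n; ++i) {
            if (i + 16 < n)
                _m_prefetchw(&a[i + 16]);  /* hint: this element is written soon */
            a[i] += 1;
        }
    }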
diff --git a/25.0.2/clang-include/rdseedintrin.h b/25.0.2/clang-include/rdseedintrin.h
new file mode 100644
index 0000000..421f4ea
--- /dev/null
+++ b/25.0.2/clang-include/rdseedintrin.h
@@ -0,0 +1,56 @@
+/*===---- rdseedintrin.h - RDSEED intrinsics -------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __X86INTRIN_H
+#error "Never use <rdseedintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __RDSEEDINTRIN_H
+#define __RDSEEDINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("rdseed")))
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_rdseed16_step(unsigned short *__p)
+{
+  return __builtin_ia32_rdseed16_step(__p);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_rdseed32_step(unsigned int *__p)
+{
+  return __builtin_ia32_rdseed32_step(__p);
+}
+
+#ifdef __x86_64__
+static __inline__ int __DEFAULT_FN_ATTRS
+_rdseed64_step(unsigned long long *__p)
+{
+  return __builtin_ia32_rdseed64_step(__p);
+}
+#endif
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __RDSEEDINTRIN_H */
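The *_step intrinsics return 0 when no entropy is available, so callers typically retry; a minimal sketch (hypothetical example; assumes compilation with -mrdseed):

    #include <x86intrin.h>

    unsigned int get_seed32(void)
    {
        unsigned int value;
        while (!_rdseed32_step(&value))
            ;                          /* spin until the hardware delivers a seed */
        return value;
    }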
diff --git a/25.0.2/clang-include/rtmintrin.h b/25.0.2/clang-include/rtmintrin.h
new file mode 100644
index 0000000..e6a58d7
--- /dev/null
+++ b/25.0.2/clang-include/rtmintrin.h
@@ -0,0 +1,59 @@
+/*===---- rtmintrin.h - RTM intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <rtmintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __RTMINTRIN_H
+#define __RTMINTRIN_H
+
+#define _XBEGIN_STARTED   (~0u)
+#define _XABORT_EXPLICIT  (1 << 0)
+#define _XABORT_RETRY     (1 << 1)
+#define _XABORT_CONFLICT  (1 << 2)
+#define _XABORT_CAPACITY  (1 << 3)
+#define _XABORT_DEBUG     (1 << 4)
+#define _XABORT_NESTED    (1 << 5)
+#define _XABORT_CODE(x)   (((x) >> 24) & 0xFF)
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("rtm")))
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_xbegin(void)
+{
+  return __builtin_ia32_xbegin();
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_xend(void)
+{
+  __builtin_ia32_xend();
+}
+
+#define _xabort(imm) __builtin_ia32_xabort((imm))
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __RTMINTRIN_H */
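A minimal usage sketch for the RTM intrinsics above (hypothetical example; assumes compilation with -mrtm and that a non-transactional fallback is always provided, since a transaction may abort):

    #include <immintrin.h>

    static int counter;

    void increment(void)
    {
        unsigned int status = _xbegin();
        if (status == _XBEGIN_STARTED) {
            counter++;                           /* runs transactionally */
            _xend();
        } else {
            /* aborted: status carries _XABORT_* bits; take the fallback path */
            __sync_fetch_and_add(&counter, 1);
        }
    }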
diff --git a/25.0.2/clang-include/s390intrin.h b/25.0.2/clang-include/s390intrin.h
new file mode 100644
index 0000000..d51274c
--- /dev/null
+++ b/25.0.2/clang-include/s390intrin.h
@@ -0,0 +1,39 @@
+/*===---- s390intrin.h - SystemZ intrinsics --------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __S390INTRIN_H
+#define __S390INTRIN_H
+
+#ifndef __s390__
+#error "<s390intrin.h> is for s390 only"
+#endif
+
+#ifdef __HTM__
+#include <htmintrin.h>
+#endif
+
+#ifdef __VEC__
+#include <vecintrin.h>
+#endif
+
+#endif /* __S390INTRIN_H*/
diff --git a/25.0.2/clang-include/shaintrin.h b/25.0.2/clang-include/shaintrin.h
new file mode 100644
index 0000000..9b5d218
--- /dev/null
+++ b/25.0.2/clang-include/shaintrin.h
@@ -0,0 +1,75 @@
+/*===---- shaintrin.h - SHA intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <shaintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __SHAINTRIN_H
+#define __SHAINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sha")))
+
+#define _mm_sha1rnds4_epu32(V1, V2, M) __extension__ ({ \
+  __builtin_ia32_sha1rnds4((__v4si)(__m128i)(V1), (__v4si)(__m128i)(V2), (M)); })
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sha1nexte_epu32(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_sha1nexte((__v4si)__X, (__v4si)__Y);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sha1msg1_epu32(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_sha1msg1((__v4si)__X, (__v4si)__Y);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sha1msg2_epu32(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_sha1msg2((__v4si)__X, (__v4si)__Y);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sha256rnds2_epu32(__m128i __X, __m128i __Y, __m128i __Z)
+{
+  return (__m128i)__builtin_ia32_sha256rnds2((__v4si)__X, (__v4si)__Y, (__v4si)__Z);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sha256msg1_epu32(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_sha256msg1((__v4si)__X, (__v4si)__Y);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sha256msg2_epu32(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_sha256msg2((__v4si)__X, (__v4si)__Y);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __SHAINTRIN_H */
diff --git a/25.0.2/clang-include/smmintrin.h b/25.0.2/clang-include/smmintrin.h
new file mode 100644
index 0000000..69ad07f
--- /dev/null
+++ b/25.0.2/clang-include/smmintrin.h
@@ -0,0 +1,508 @@
+/*===---- smmintrin.h - SSE4 intrinsics ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef _SMMINTRIN_H
+#define _SMMINTRIN_H
+
+#include <tmmintrin.h>
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.1")))
+
+/* SSE4 Rounding macros. */
+#define _MM_FROUND_TO_NEAREST_INT    0x00
+#define _MM_FROUND_TO_NEG_INF        0x01
+#define _MM_FROUND_TO_POS_INF        0x02
+#define _MM_FROUND_TO_ZERO           0x03
+#define _MM_FROUND_CUR_DIRECTION     0x04
+
+#define _MM_FROUND_RAISE_EXC         0x00
+#define _MM_FROUND_NO_EXC            0x08
+
+#define _MM_FROUND_NINT      (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT)
+#define _MM_FROUND_FLOOR     (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF)
+#define _MM_FROUND_CEIL      (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF)
+#define _MM_FROUND_TRUNC     (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO)
+#define _MM_FROUND_RINT      (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION)
+#define _MM_FROUND_NEARBYINT (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_ceil_ps(X)       _mm_round_ps((X), _MM_FROUND_CEIL)
+#define _mm_ceil_pd(X)       _mm_round_pd((X), _MM_FROUND_CEIL)
+#define _mm_ceil_ss(X, Y)    _mm_round_ss((X), (Y), _MM_FROUND_CEIL)
+#define _mm_ceil_sd(X, Y)    _mm_round_sd((X), (Y), _MM_FROUND_CEIL)
+
+#define _mm_floor_ps(X)      _mm_round_ps((X), _MM_FROUND_FLOOR)
+#define _mm_floor_pd(X)      _mm_round_pd((X), _MM_FROUND_FLOOR)
+#define _mm_floor_ss(X, Y)   _mm_round_ss((X), (Y), _MM_FROUND_FLOOR)
+#define _mm_floor_sd(X, Y)   _mm_round_sd((X), (Y), _MM_FROUND_FLOOR)
+
+#define _mm_round_ps(X, M) __extension__ ({ \
+  (__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M)); })
+
+#define _mm_round_ss(X, Y, M) __extension__ ({ \
+  (__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), \
+                                 (__v4sf)(__m128)(Y), (M)); })
+
+#define _mm_round_pd(X, M) __extension__ ({ \
+  (__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M)); })
+
+#define _mm_round_sd(X, Y, M) __extension__ ({ \
+  (__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), \
+                                  (__v2df)(__m128d)(Y), (M)); })
+
+/* SSE4 Packed Blending Intrinsics.  */
+#define _mm_blend_pd(V1, V2, M) __extension__ ({ \
+  (__m128d)__builtin_shufflevector((__v2df)(__m128d)(V1), \
+                                   (__v2df)(__m128d)(V2), \
+                                   (((M) & 0x01) ? 2 : 0), \
+                                   (((M) & 0x02) ? 3 : 1)); })
+
+#define _mm_blend_ps(V1, V2, M) __extension__ ({ \
+  (__m128)__builtin_shufflevector((__v4sf)(__m128)(V1), (__v4sf)(__m128)(V2), \
+                                  (((M) & 0x01) ? 4 : 0), \
+                                  (((M) & 0x02) ? 5 : 1), \
+                                  (((M) & 0x04) ? 6 : 2), \
+                                  (((M) & 0x08) ? 7 : 3)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_blendv_pd (__m128d __V1, __m128d __V2, __m128d __M)
+{
+  return (__m128d) __builtin_ia32_blendvpd ((__v2df)__V1, (__v2df)__V2,
+                                            (__v2df)__M);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_blendv_ps (__m128 __V1, __m128 __V2, __m128 __M)
+{
+  return (__m128) __builtin_ia32_blendvps ((__v4sf)__V1, (__v4sf)__V2,
+                                           (__v4sf)__M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M)
+{
+  return (__m128i) __builtin_ia32_pblendvb128 ((__v16qi)__V1, (__v16qi)__V2,
+                                               (__v16qi)__M);
+}
+
+#define _mm_blend_epi16(V1, V2, M) __extension__ ({ \
+  (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(V1), \
+                                   (__v8hi)(__m128i)(V2), \
+                                   (((M) & 0x01) ?  8 : 0), \
+                                   (((M) & 0x02) ?  9 : 1), \
+                                   (((M) & 0x04) ? 10 : 2), \
+                                   (((M) & 0x08) ? 11 : 3), \
+                                   (((M) & 0x10) ? 12 : 4), \
+                                   (((M) & 0x20) ? 13 : 5), \
+                                   (((M) & 0x40) ? 14 : 6), \
+                                   (((M) & 0x80) ? 15 : 7)); })
+
+/* SSE4 Dword Multiply Instructions.  */
+static __inline__  __m128i __DEFAULT_FN_ATTRS
+_mm_mullo_epi32 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) ((__v4si)__V1 * (__v4si)__V2);
+}
+
+static __inline__  __m128i __DEFAULT_FN_ATTRS
+_mm_mul_epi32 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_pmuldq128 ((__v4si)__V1, (__v4si)__V2);
+}
+
+/* SSE4 Floating Point Dot Product Instructions.  */
+#define _mm_dp_ps(X, Y, M) __extension__ ({ \
+  (__m128) __builtin_ia32_dpps((__v4sf)(__m128)(X), \
+                               (__v4sf)(__m128)(Y), (M)); })
+
+#define _mm_dp_pd(X, Y, M) __extension__ ({\
+  (__m128d) __builtin_ia32_dppd((__v2df)(__m128d)(X), \
+                                (__v2df)(__m128d)(Y), (M)); })
+
+/* SSE4 Streaming Load Hint Instruction.  */
+static __inline__  __m128i __DEFAULT_FN_ATTRS
+_mm_stream_load_si128 (__m128i const *__V)
+{
+  return (__m128i) __builtin_ia32_movntdqa ((const __v2di *) __V);
+}
+
+/* SSE4 Packed Integer Min/Max Instructions.  */
+static __inline__  __m128i __DEFAULT_FN_ATTRS
+_mm_min_epi8 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_pminsb128 ((__v16qi) __V1, (__v16qi) __V2);
+}
+
+static __inline__  __m128i __DEFAULT_FN_ATTRS
+_mm_max_epi8 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_pmaxsb128 ((__v16qi) __V1, (__v16qi) __V2);
+}
+
+static __inline__  __m128i __DEFAULT_FN_ATTRS
+_mm_min_epu16 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_pminuw128 ((__v8hi) __V1, (__v8hi) __V2);
+}
+
+static __inline__  __m128i __DEFAULT_FN_ATTRS
+_mm_max_epu16 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_pmaxuw128 ((__v8hi) __V1, (__v8hi) __V2);
+}
+
+static __inline__  __m128i __DEFAULT_FN_ATTRS
+_mm_min_epi32 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_pminsd128 ((__v4si) __V1, (__v4si) __V2);
+}
+
+static __inline__  __m128i __DEFAULT_FN_ATTRS
+_mm_max_epi32 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_pmaxsd128 ((__v4si) __V1, (__v4si) __V2);
+}
+
+static __inline__  __m128i __DEFAULT_FN_ATTRS
+_mm_min_epu32 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_pminud128((__v4si) __V1, (__v4si) __V2);
+}
+
+static __inline__  __m128i __DEFAULT_FN_ATTRS
+_mm_max_epu32 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_pmaxud128((__v4si) __V1, (__v4si) __V2);
+}
+
+/* SSE4 Insertion and Extraction from XMM Register Instructions.  */
+#define _mm_insert_ps(X, Y, N) __builtin_ia32_insertps128((X), (Y), (N))
+#define _mm_extract_ps(X, N) (__extension__                      \
+                              ({ union { int __i; float __f; } __t;  \
+                                 __v4sf __a = (__v4sf)(__m128)(X);       \
+                                 __t.__f = __a[(N) & 3];                 \
+                                 __t.__i;}))
+
+/* Miscellaneous insert and extract macros.  */
+/* Extract a single-precision float from X at index N into D.  */
+#define _MM_EXTRACT_FLOAT(D, X, N) (__extension__ ({ __v4sf __a = (__v4sf)(X); \
+                                                    (D) = __a[N]; }))
+
+/* Or together 2 sets of indexes (X and Y) with the zeroing bits (Z) to create
+   an index suitable for _mm_insert_ps.  */
+#define _MM_MK_INSERTPS_NDX(X, Y, Z) (((X) << 6) | ((Y) << 4) | (Z))
+
+/* Extract a float from X at index N into the first index of the return.  */
+#define _MM_PICK_OUT_PS(X, N) _mm_insert_ps (_mm_setzero_ps(), (X),   \
+                                             _MM_MK_INSERTPS_NDX((N), 0, 0x0e))
+
+/* Insert int into packed integer array at index.  */
+#define _mm_insert_epi8(X, I, N) (__extension__                           \
+                                  ({ __v16qi __a = (__v16qi)(__m128i)(X); \
+                                     __a[(N) & 15] = (I);                 \
+                                     __a;}))
+#define _mm_insert_epi32(X, I, N) (__extension__                         \
+                                   ({ __v4si __a = (__v4si)(__m128i)(X); \
+                                      __a[(N) & 3] = (I);                \
+                                      __a;}))
+#ifdef __x86_64__
+#define _mm_insert_epi64(X, I, N) (__extension__                         \
+                                   ({ __v2di __a = (__v2di)(__m128i)(X); \
+                                      __a[(N) & 1] = (I);                \
+                                      __a;}))
+#endif /* __x86_64__ */
+
+/* Extract int from packed integer array at index.  This returns the element
+ * as a zero extended value, so it is unsigned.
+ */
+#define _mm_extract_epi8(X, N) (__extension__                           \
+                                ({ __v16qi __a = (__v16qi)(__m128i)(X); \
+                                   (int)(unsigned char) __a[(N) & 15];}))
+#define _mm_extract_epi32(X, N) (__extension__                         \
+                                 ({ __v4si __a = (__v4si)(__m128i)(X); \
+                                    (int)__a[(N) & 3];}))
+#ifdef __x86_64__
+#define _mm_extract_epi64(X, N) (__extension__                         \
+                                 ({ __v2di __a = (__v2di)(__m128i)(X); \
+                                    (long long)__a[(N) & 1];}))
+#endif /* __x86_64 */
+
+/* SSE4 128-bit Packed Integer Comparisons.  */
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_testz_si128(__m128i __M, __m128i __V)
+{
+  return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_testc_si128(__m128i __M, __m128i __V)
+{
+  return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_testnzc_si128(__m128i __M, __m128i __V)
+{
+  return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V);
+}
+
+#define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_cmpeq_epi32((V), (V)))
+#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V))
+#define _mm_test_all_zeros(M, V) _mm_testz_si128 ((M), (V))
+
+/* SSE4 64-bit Packed Integer Comparisons.  */
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cmpeq_epi64(__m128i __V1, __m128i __V2)
+{
+  return (__m128i)((__v2di)__V1 == (__v2di)__V2);
+}
+
+/* SSE4 Packed Integer Sign-Extension.  */
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepi8_epi16(__m128i __V)
+{
+  /* This function always performs a signed extension, but __v16qi is a char
+     which may be signed or unsigned, so use __v16qs. */
+  typedef signed char __v16qs __attribute__((__vector_size__(16)));
+  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepi8_epi32(__m128i __V)
+{
+  /* This function always performs a signed extension, but __v16qi is a char
+     which may be signed or unsigned, so use __v16qs. */
+  typedef signed char __v16qs __attribute__((__vector_size__(16)));
+  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4si);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepi8_epi64(__m128i __V)
+{
+  /* This function always performs a signed extension, but __v16qi is a char
+     which may be signed or unsigned, so use __v16qs. */
+  typedef signed char __v16qs __attribute__((__vector_size__(16)));
+  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1), __v2di);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepi16_epi32(__m128i __V)
+{
+  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4si);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepi16_epi64(__m128i __V)
+{
+  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1), __v2di);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepi32_epi64(__m128i __V)
+{
+  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v4si)__V, (__v4si)__V, 0, 1), __v2di);
+}
+
+/* SSE4 Packed Integer Zero-Extension.  */
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepu8_epi16(__m128i __V)
+{
+  return (__m128i) __builtin_ia32_pmovzxbw128((__v16qi) __V);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepu8_epi32(__m128i __V)
+{
+  return (__m128i) __builtin_ia32_pmovzxbd128((__v16qi)__V);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepu8_epi64(__m128i __V)
+{
+  return (__m128i) __builtin_ia32_pmovzxbq128((__v16qi)__V);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepu16_epi32(__m128i __V)
+{
+  return (__m128i) __builtin_ia32_pmovzxwd128((__v8hi)__V);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepu16_epi64(__m128i __V)
+{
+  return (__m128i) __builtin_ia32_pmovzxwq128((__v8hi)__V);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepu32_epi64(__m128i __V)
+{
+  return (__m128i) __builtin_ia32_pmovzxdq128((__v4si)__V);
+}
+
+/* SSE4 Pack with Unsigned Saturation.  */
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_packus_epi32(__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_packusdw128((__v4si)__V1, (__v4si)__V2);
+}
+
+/* SSE4 Multiple Packed Sums of Absolute Difference.  */
+#define _mm_mpsadbw_epu8(X, Y, M) __extension__ ({ \
+  (__m128i) __builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \
+                                      (__v16qi)(__m128i)(Y), (M)); })
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_minpos_epu16(__m128i __V)
+{
+  return (__m128i) __builtin_ia32_phminposuw128((__v8hi)__V);
+}
+
+/* Handle the sse4.2 definitions here. */
+
+/* These definitions are normally in nmmintrin.h, but gcc puts them in here
+   so we'll do the same.  */
+
+#undef __DEFAULT_FN_ATTRS
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.2")))
+
+/* These specify the type of data that we're comparing.  */
+#define _SIDD_UBYTE_OPS                 0x00
+#define _SIDD_UWORD_OPS                 0x01
+#define _SIDD_SBYTE_OPS                 0x02
+#define _SIDD_SWORD_OPS                 0x03
+
+/* These specify the type of comparison operation.  */
+#define _SIDD_CMP_EQUAL_ANY             0x00
+#define _SIDD_CMP_RANGES                0x04
+#define _SIDD_CMP_EQUAL_EACH            0x08
+#define _SIDD_CMP_EQUAL_ORDERED         0x0c
+
+/* These macros specify the polarity of the operation.  */
+#define _SIDD_POSITIVE_POLARITY         0x00
+#define _SIDD_NEGATIVE_POLARITY         0x10
+#define _SIDD_MASKED_POSITIVE_POLARITY  0x20
+#define _SIDD_MASKED_NEGATIVE_POLARITY  0x30
+
+/* These macros are used in _mm_cmpXstri() to specify the return.  */
+#define _SIDD_LEAST_SIGNIFICANT         0x00
+#define _SIDD_MOST_SIGNIFICANT          0x40
+
+/* These macros are used in _mm_cmpXstri() to specify the return.  */
+#define _SIDD_BIT_MASK                  0x00
+#define _SIDD_UNIT_MASK                 0x40
+
+/* SSE4.2 Packed Comparison Intrinsics.  */
+#define _mm_cmpistrm(A, B, M) \
+  (__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A), \
+                                       (__v16qi)(__m128i)(B), (int)(M))
+#define _mm_cmpistri(A, B, M) \
+  (int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A), \
+                                   (__v16qi)(__m128i)(B), (int)(M))
+
+#define _mm_cmpestrm(A, LA, B, LB, M) \
+  (__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(A), (int)(LA), \
+                                       (__v16qi)(__m128i)(B), (int)(LB), \
+                                       (int)(M))
+#define _mm_cmpestri(A, LA, B, LB, M) \
+  (int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(A), (int)(LA), \
+                                   (__v16qi)(__m128i)(B), (int)(LB), \
+                                   (int)(M))
+
+/* SSE4.2 Packed Comparison Intrinsics and EFlag Reading.  */
+#define _mm_cmpistra(A, B, M) \
+  (int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A), \
+                                    (__v16qi)(__m128i)(B), (int)(M))
+#define _mm_cmpistrc(A, B, M) \
+  (int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A), \
+                                    (__v16qi)(__m128i)(B), (int)(M))
+#define _mm_cmpistro(A, B, M) \
+  (int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A), \
+                                    (__v16qi)(__m128i)(B), (int)(M))
+#define _mm_cmpistrs(A, B, M) \
+  (int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A), \
+                                    (__v16qi)(__m128i)(B), (int)(M))
+#define _mm_cmpistrz(A, B, M) \
+  (int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A), \
+                                    (__v16qi)(__m128i)(B), (int)(M))
+
+#define _mm_cmpestra(A, LA, B, LB, M) \
+  (int)__builtin_ia32_pcmpestria128((__v16qi)(__m128i)(A), (int)(LA), \
+                                    (__v16qi)(__m128i)(B), (int)(LB), \
+                                    (int)(M))
+#define _mm_cmpestrc(A, LA, B, LB, M) \
+  (int)__builtin_ia32_pcmpestric128((__v16qi)(__m128i)(A), (int)(LA), \
+                                    (__v16qi)(__m128i)(B), (int)(LB), \
+                                    (int)(M))
+#define _mm_cmpestro(A, LA, B, LB, M) \
+  (int)__builtin_ia32_pcmpestrio128((__v16qi)(__m128i)(A), (int)(LA), \
+                                    (__v16qi)(__m128i)(B), (int)(LB), \
+                                    (int)(M))
+#define _mm_cmpestrs(A, LA, B, LB, M) \
+  (int)__builtin_ia32_pcmpestris128((__v16qi)(__m128i)(A), (int)(LA), \
+                                    (__v16qi)(__m128i)(B), (int)(LB), \
+                                    (int)(M))
+#define _mm_cmpestrz(A, LA, B, LB, M) \
+  (int)__builtin_ia32_pcmpestriz128((__v16qi)(__m128i)(A), (int)(LA), \
+                                    (__v16qi)(__m128i)(B), (int)(LB), \
+                                    (int)(M))
+
+/* SSE4.2 Compare Packed Data -- Greater Than.  */
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cmpgt_epi64(__m128i __V1, __m128i __V2)
+{
+  return (__m128i)((__v2di)__V1 > (__v2di)__V2);
+}
+
+/* SSE4.2 Accumulate CRC32.  */
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_mm_crc32_u8(unsigned int __C, unsigned char __D)
+{
+  return __builtin_ia32_crc32qi(__C, __D);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_mm_crc32_u16(unsigned int __C, unsigned short __D)
+{
+  return __builtin_ia32_crc32hi(__C, __D);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_mm_crc32_u32(unsigned int __C, unsigned int __D)
+{
+  return __builtin_ia32_crc32si(__C, __D);
+}
+
+#ifdef __x86_64__
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_mm_crc32_u64(unsigned long long __C, unsigned long long __D)
+{
+  return __builtin_ia32_crc32di(__C, __D);
+}
+#endif /* __x86_64__ */
+
+#undef __DEFAULT_FN_ATTRS
+
+#ifdef __POPCNT__
+#include <popcntintrin.h>
+#endif
+
+#endif /* _SMMINTRIN_H */
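A minimal usage sketch for the SSE4.1 rounding and PTEST helpers above (hypothetical example; assumes compilation with -msse4.1):

    #include <smmintrin.h>

    __m128 clamp_floor(__m128 v, __m128 lo)
    {
        __m128 f = _mm_floor_ps(v);        /* round toward -inf per lane */
        return _mm_max_ps(f, lo);          /* lane-wise lower bound      */
    }

    int all_zero(__m128i v)
    {
        return _mm_test_all_zeros(v, v);   /* 1 iff every bit of v is 0  */
    }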
diff --git a/25.0.2/clang-include/stdalign.h b/25.0.2/clang-include/stdalign.h
new file mode 100644
index 0000000..3738d12
--- /dev/null
+++ b/25.0.2/clang-include/stdalign.h
@@ -0,0 +1,35 @@
+/*===---- stdalign.h - Standard header for alignment ------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __STDALIGN_H
+#define __STDALIGN_H
+
+#ifndef __cplusplus
+#define alignas _Alignas
+#define alignof _Alignof
+#endif
+
+#define __alignas_is_defined 1
+#define __alignof_is_defined 1
+
+#endif /* __STDALIGN_H */
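A minimal usage sketch for the C11 alignment keywords defined above (hypothetical example; assumes -std=c11):

    #include <stdalign.h>
    #include <stdio.h>

    alignas(16) static char buffer[64];    /* 16-byte aligned storage */

    int main(void)
    {
        printf("%zu\n", alignof(double));  /* alignment of double, typically 8 */
        (void)buffer;
        return 0;
    }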
diff --git a/25.0.2/clang-include/stdarg.h b/25.0.2/clang-include/stdarg.h
new file mode 100644
index 0000000..a57e183
--- /dev/null
+++ b/25.0.2/clang-include/stdarg.h
@@ -0,0 +1,52 @@
+/*===---- stdarg.h - Variable argument handling ----------------------------===
+ *
+ * Copyright (c) 2008 Eli Friedman
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __STDARG_H
+#define __STDARG_H
+
+#ifndef _VA_LIST
+typedef __builtin_va_list va_list;
+#define _VA_LIST
+#endif
+#define va_start(ap, param) __builtin_va_start(ap, param)
+#define va_end(ap)          __builtin_va_end(ap)
+#define va_arg(ap, type)    __builtin_va_arg(ap, type)
+
+/* GCC always defines __va_copy, but does not define va_copy unless in c99 mode
+ * or -ansi is not specified, since it was not part of C90.
+ */
+#define __va_copy(d,s) __builtin_va_copy(d,s)
+
+#if __STDC_VERSION__ >= 199901L || __cplusplus >= 201103L || !defined(__STRICT_ANSI__)
+#define va_copy(dest, src)  __builtin_va_copy(dest, src)
+#endif
+
+/* Hack required to make standard headers work, at least on Ubuntu */
+#ifndef __GNUC_VA_LIST
+#define __GNUC_VA_LIST 1
+#endif
+typedef __builtin_va_list __gnuc_va_list;
+
+#endif /* __STDARG_H */
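A minimal usage sketch for the variadic-argument macros above (hypothetical example):

    #include <stdarg.h>

    int sum_ints(int count, ...)
    {
        va_list ap;
        va_start(ap, count);           /* expands to __builtin_va_start */
        int total = 0;
        for (int i = 0; i < count; ++i)
            total += va_arg(ap, int);  /* fetch the next int argument   */
        va_end(ap);
        return total;
    }

    /* e.g. sum_ints(3, 1, 2, 3) == 6 */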
diff --git a/25.0.2/clang-include/stdatomic.h b/25.0.2/clang-include/stdatomic.h
new file mode 100644
index 0000000..e037987
--- /dev/null
+++ b/25.0.2/clang-include/stdatomic.h
@@ -0,0 +1,190 @@
+/*===---- stdatomic.h - Standard header for atomic types and operations -----===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CLANG_STDATOMIC_H
+#define __CLANG_STDATOMIC_H
+
+/* If we're hosted, fall back to the system's stdatomic.h. FreeBSD, for
+ * example, already has a Clang-compatible stdatomic.h header.
+ */
+#if __STDC_HOSTED__ && __has_include_next(<stdatomic.h>)
+# include_next <stdatomic.h>
+#else
+
+#include <stddef.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* 7.17.1 Introduction */
+
+#define ATOMIC_BOOL_LOCK_FREE       __GCC_ATOMIC_BOOL_LOCK_FREE
+#define ATOMIC_CHAR_LOCK_FREE       __GCC_ATOMIC_CHAR_LOCK_FREE
+#define ATOMIC_CHAR16_T_LOCK_FREE   __GCC_ATOMIC_CHAR16_T_LOCK_FREE
+#define ATOMIC_CHAR32_T_LOCK_FREE   __GCC_ATOMIC_CHAR32_T_LOCK_FREE
+#define ATOMIC_WCHAR_T_LOCK_FREE    __GCC_ATOMIC_WCHAR_T_LOCK_FREE
+#define ATOMIC_SHORT_T_LOCK_FREE    __GCC_ATOMIC_SHORT_T_LOCK_FREE
+#define ATOMIC_INT_T_LOCK_FREE      __GCC_ATOMIC_INT_T_LOCK_FREE
+#define ATOMIC_LONG_T_LOCK_FREE     __GCC_ATOMIC_LONG_T_LOCK_FREE
+#define ATOMIC_LLONG_T_LOCK_FREE    __GCC_ATOMIC_LLONG_T_LOCK_FREE
+#define ATOMIC_POINTER_T_LOCK_FREE  __GCC_ATOMIC_POINTER_T_LOCK_FREE
+
+/* 7.17.2 Initialization */
+
+#define ATOMIC_VAR_INIT(value) (value)
+#define atomic_init __c11_atomic_init
+
+/* 7.17.3 Order and consistency */
+
+typedef enum memory_order {
+  memory_order_relaxed = __ATOMIC_RELAXED,
+  memory_order_consume = __ATOMIC_CONSUME,
+  memory_order_acquire = __ATOMIC_ACQUIRE,
+  memory_order_release = __ATOMIC_RELEASE,
+  memory_order_acq_rel = __ATOMIC_ACQ_REL,
+  memory_order_seq_cst = __ATOMIC_SEQ_CST
+} memory_order;
+
+#define kill_dependency(y) (y)
+
+/* 7.17.4 Fences */
+
+/* These should be provided by the libc implementation. */
+void atomic_thread_fence(memory_order);
+void atomic_signal_fence(memory_order);
+
+#define atomic_thread_fence(order) __c11_atomic_thread_fence(order)
+#define atomic_signal_fence(order) __c11_atomic_signal_fence(order)
+
+/* 7.17.5 Lock-free property */
+
+#define atomic_is_lock_free(obj) __c11_atomic_is_lock_free(sizeof(*(obj)))
+
+/* 7.17.6 Atomic integer types */
+
+#ifdef __cplusplus
+typedef _Atomic(bool)               atomic_bool;
+#else
+typedef _Atomic(_Bool)              atomic_bool;
+#endif
+typedef _Atomic(char)               atomic_char;
+typedef _Atomic(signed char)        atomic_schar;
+typedef _Atomic(unsigned char)      atomic_uchar;
+typedef _Atomic(short)              atomic_short;
+typedef _Atomic(unsigned short)     atomic_ushort;
+typedef _Atomic(int)                atomic_int;
+typedef _Atomic(unsigned int)       atomic_uint;
+typedef _Atomic(long)               atomic_long;
+typedef _Atomic(unsigned long)      atomic_ulong;
+typedef _Atomic(long long)          atomic_llong;
+typedef _Atomic(unsigned long long) atomic_ullong;
+typedef _Atomic(uint_least16_t)     atomic_char16_t;
+typedef _Atomic(uint_least32_t)     atomic_char32_t;
+typedef _Atomic(wchar_t)            atomic_wchar_t;
+typedef _Atomic(int_least8_t)       atomic_int_least8_t;
+typedef _Atomic(uint_least8_t)      atomic_uint_least8_t;
+typedef _Atomic(int_least16_t)      atomic_int_least16_t;
+typedef _Atomic(uint_least16_t)     atomic_uint_least16_t;
+typedef _Atomic(int_least32_t)      atomic_int_least32_t;
+typedef _Atomic(uint_least32_t)     atomic_uint_least32_t;
+typedef _Atomic(int_least64_t)      atomic_int_least64_t;
+typedef _Atomic(uint_least64_t)     atomic_uint_least64_t;
+typedef _Atomic(int_fast8_t)        atomic_int_fast8_t;
+typedef _Atomic(uint_fast8_t)       atomic_uint_fast8_t;
+typedef _Atomic(int_fast16_t)       atomic_int_fast16_t;
+typedef _Atomic(uint_fast16_t)      atomic_uint_fast16_t;
+typedef _Atomic(int_fast32_t)       atomic_int_fast32_t;
+typedef _Atomic(uint_fast32_t)      atomic_uint_fast32_t;
+typedef _Atomic(int_fast64_t)       atomic_int_fast64_t;
+typedef _Atomic(uint_fast64_t)      atomic_uint_fast64_t;
+typedef _Atomic(intptr_t)           atomic_intptr_t;
+typedef _Atomic(uintptr_t)          atomic_uintptr_t;
+typedef _Atomic(size_t)             atomic_size_t;
+typedef _Atomic(ptrdiff_t)          atomic_ptrdiff_t;
+typedef _Atomic(intmax_t)           atomic_intmax_t;
+typedef _Atomic(uintmax_t)          atomic_uintmax_t;
+
+/* 7.17.7 Operations on atomic types */
+
+#define atomic_store(object, desired) __c11_atomic_store(object, desired, __ATOMIC_SEQ_CST)
+#define atomic_store_explicit __c11_atomic_store
+
+#define atomic_load(object) __c11_atomic_load(object, __ATOMIC_SEQ_CST)
+#define atomic_load_explicit __c11_atomic_load
+
+#define atomic_exchange(object, desired) __c11_atomic_exchange(object, desired, __ATOMIC_SEQ_CST)
+#define atomic_exchange_explicit __c11_atomic_exchange
+
+#define atomic_compare_exchange_strong(object, expected, desired) __c11_atomic_compare_exchange_strong(object, expected, desired, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
+#define atomic_compare_exchange_strong_explicit __c11_atomic_compare_exchange_strong
+
+#define atomic_compare_exchange_weak(object, expected, desired) __c11_atomic_compare_exchange_weak(object, expected, desired, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
+#define atomic_compare_exchange_weak_explicit __c11_atomic_compare_exchange_weak
+
+#define atomic_fetch_add(object, operand) __c11_atomic_fetch_add(object, operand, __ATOMIC_SEQ_CST)
+#define atomic_fetch_add_explicit __c11_atomic_fetch_add
+
+#define atomic_fetch_sub(object, operand) __c11_atomic_fetch_sub(object, operand, __ATOMIC_SEQ_CST)
+#define atomic_fetch_sub_explicit __c11_atomic_fetch_sub
+
+#define atomic_fetch_or(object, operand) __c11_atomic_fetch_or(object, operand, __ATOMIC_SEQ_CST)
+#define atomic_fetch_or_explicit __c11_atomic_fetch_or
+
+#define atomic_fetch_xor(object, operand) __c11_atomic_fetch_xor(object, operand, __ATOMIC_SEQ_CST)
+#define atomic_fetch_xor_explicit __c11_atomic_fetch_xor
+
+#define atomic_fetch_and(object, operand) __c11_atomic_fetch_and(object, operand, __ATOMIC_SEQ_CST)
+#define atomic_fetch_and_explicit __c11_atomic_fetch_and
+
+/* 7.17.8 Atomic flag type and operations */
+
+typedef struct atomic_flag { atomic_bool _Value; } atomic_flag;
+
+#define ATOMIC_FLAG_INIT { 0 }
+
+/* These should be provided by the libc implementation. */
+#ifdef __cplusplus
+bool atomic_flag_test_and_set(volatile atomic_flag *);
+bool atomic_flag_test_and_set_explicit(volatile atomic_flag *, memory_order);
+#else
+_Bool atomic_flag_test_and_set(volatile atomic_flag *);
+_Bool atomic_flag_test_and_set_explicit(volatile atomic_flag *, memory_order);
+#endif
+void atomic_flag_clear(volatile atomic_flag *);
+void atomic_flag_clear_explicit(volatile atomic_flag *, memory_order);
+
+#define atomic_flag_test_and_set(object) __c11_atomic_exchange(&(object)->_Value, 1, __ATOMIC_SEQ_CST)
+#define atomic_flag_test_and_set_explicit(object, order) __c11_atomic_exchange(&(object)->_Value, 1, order)
+
+#define atomic_flag_clear(object) __c11_atomic_store(&(object)->_Value, 0, __ATOMIC_SEQ_CST)
+#define atomic_flag_clear_explicit(object, order) __c11_atomic_store(&(object)->_Value, 0, order)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __STDC_HOSTED__ */
+#endif /* __CLANG_STDATOMIC_H */
+
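A minimal sketch of the C11 atomics surface defined above, assuming a hosted C11 compiler; in real use the counter and flag would be shared across threads, which are omitted here to keep the example self-contained.

#include <stdatomic.h>
#include <stdio.h>

static atomic_int  counter = ATOMIC_VAR_INIT(0);
static atomic_flag lock    = ATOMIC_FLAG_INIT;

int main(void)
{
    /* Sequentially consistent add, then a relaxed add via the _explicit form. */
    atomic_fetch_add(&counter, 1);
    atomic_fetch_add_explicit(&counter, 2, memory_order_relaxed);

    /* atomic_flag as a tiny spinlock: loop while the previous value was set. */
    while (atomic_flag_test_and_set(&lock))
        ;
    printf("counter = %d\n", atomic_load(&counter));   /* prints 3 */
    atomic_flag_clear(&lock);
    return 0;
}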
diff --git a/25.0.2/clang-include/stdbool.h b/25.0.2/clang-include/stdbool.h
new file mode 100644
index 0000000..0467893
--- /dev/null
+++ b/25.0.2/clang-include/stdbool.h
@@ -0,0 +1,44 @@
+/*===---- stdbool.h - Standard header for booleans -------------------------===
+ *
+ * Copyright (c) 2008 Eli Friedman
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __STDBOOL_H
+#define __STDBOOL_H
+
+/* Don't define bool, true, and false in C++, except as a GNU extension. */
+#ifndef __cplusplus
+#define bool _Bool
+#define true 1
+#define false 0
+#elif defined(__GNUC__) && !defined(__STRICT_ANSI__)
+/* Define _Bool, bool, false, true as a GNU extension. */
+#define _Bool bool
+#define bool  bool
+#define false false
+#define true  true
+#endif
+
+#define __bool_true_false_are_defined 1
+
+#endif /* __STDBOOL_H */
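A small sketch of the C-mode behavior described above (bool, true, and false expanding to _Bool, 1, and 0), assuming a C compiler rather than C++.

#include <stdbool.h>
#include <stdio.h>

int main(void)
{
#if __bool_true_false_are_defined
    bool ready = true;                 /* bool is _Bool here, true is 1 */
    printf("ready = %d\n", ready);     /* prints 1 */
#endif
    return 0;
}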
diff --git a/25.0.2/clang-include/stddef.h b/25.0.2/clang-include/stddef.h
new file mode 100644
index 0000000..7354996
--- /dev/null
+++ b/25.0.2/clang-include/stddef.h
@@ -0,0 +1,137 @@
+/*===---- stddef.h - Basic type definitions --------------------------------===
+ *
+ * Copyright (c) 2008 Eli Friedman
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined(__STDDEF_H) || defined(__need_ptrdiff_t) ||                       \
+    defined(__need_size_t) || defined(__need_wchar_t) ||                       \
+    defined(__need_NULL) || defined(__need_wint_t)
+
+#if !defined(__need_ptrdiff_t) && !defined(__need_size_t) &&                   \
+    !defined(__need_wchar_t) && !defined(__need_NULL) &&                       \
+    !defined(__need_wint_t)
+/* Always define miscellaneous pieces when modules are available. */
+#if !__has_feature(modules)
+#define __STDDEF_H
+#endif
+#define __need_ptrdiff_t
+#define __need_size_t
+#define __need_wchar_t
+#define __need_NULL
+#define __need_STDDEF_H_misc
+/* __need_wint_t is intentionally not defined here. */
+#endif
+
+#if defined(__need_ptrdiff_t)
+#if !defined(_PTRDIFF_T) || __has_feature(modules)
+/* Always define ptrdiff_t when modules are available. */
+#if !__has_feature(modules)
+#define _PTRDIFF_T
+#endif
+typedef __PTRDIFF_TYPE__ ptrdiff_t;
+#endif
+#undef __need_ptrdiff_t
+#endif /* defined(__need_ptrdiff_t) */
+
+#if defined(__need_size_t)
+#if !defined(_SIZE_T) || __has_feature(modules)
+/* Always define size_t when modules are available. */
+#if !__has_feature(modules)
+#define _SIZE_T
+#endif
+typedef __SIZE_TYPE__ size_t;
+#endif
+#undef __need_size_t
+#endif /*defined(__need_size_t) */
+
+#if defined(__need_STDDEF_H_misc)
+/* ISO9899:2011 7.20 (C11 Annex K): Define rsize_t if __STDC_WANT_LIB_EXT1__ is
+ * enabled. */
+#if (defined(__STDC_WANT_LIB_EXT1__) && __STDC_WANT_LIB_EXT1__ >= 1 && \
+     !defined(_RSIZE_T)) || __has_feature(modules)
+/* Always define rsize_t when modules are available. */
+#if !__has_feature(modules)
+#define _RSIZE_T
+#endif
+typedef __SIZE_TYPE__ rsize_t;
+#endif
+#endif /* defined(__need_STDDEF_H_misc) */
+
+#if defined(__need_wchar_t)
+#ifndef __cplusplus
+/* Always define wchar_t when modules are available. */
+#if !defined(_WCHAR_T) || __has_feature(modules)
+#if !__has_feature(modules)
+#define _WCHAR_T
+#if defined(_MSC_EXTENSIONS)
+#define _WCHAR_T_DEFINED
+#endif
+#endif
+typedef __WCHAR_TYPE__ wchar_t;
+#endif
+#endif
+#undef __need_wchar_t
+#endif /* defined(__need_wchar_t) */
+
+#if defined(__need_NULL)
+#undef NULL
+#ifdef __cplusplus
+#  if !defined(__MINGW32__) && !defined(_MSC_VER)
+#    define NULL __null
+#  else
+#    define NULL 0
+#  endif
+#else
+#  define NULL ((void*)0)
+#endif
+#ifdef __cplusplus
+#if defined(_MSC_EXTENSIONS) && defined(_NATIVE_NULLPTR_SUPPORTED)
+namespace std { typedef decltype(nullptr) nullptr_t; }
+using ::std::nullptr_t;
+#endif
+#endif
+#undef __need_NULL
+#endif /* defined(__need_NULL) */
+
+#if defined(__need_STDDEF_H_misc)
+#if __STDC_VERSION__ >= 201112L || __cplusplus >= 201103L
+#include "__stddef_max_align_t.h"
+#endif
+#define offsetof(t, d) __builtin_offsetof(t, d)
+#undef __need_STDDEF_H_misc
+#endif  /* defined(__need_STDDEF_H_misc) */
+
+/* Some C libraries expect to see a wint_t here. Others (notably MinGW) will use
+__WINT_TYPE__ directly; accommodate both by requiring __need_wint_t */
+#if defined(__need_wint_t)
+/* Always define wint_t when modules are available. */
+#if !defined(_WINT_T) || __has_feature(modules)
+#if !__has_feature(modules)
+#define _WINT_T
+#endif
+typedef __WINT_TYPE__ wint_t;
+#endif
+#undef __need_wint_t
+#endif /* __need_wint_t */
+
+#endif
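A sketch of the __need_* protocol and the offsetof path implemented above, assuming a hosted compiler; struct packet and its fields are illustrative.

/* First inclusion: request only size_t via the __need_* protocol. */
#define __need_size_t
#include <stddef.h>

/* Second, plain inclusion: everything else, including NULL and offsetof. */
#include <stddef.h>
#include <stdio.h>

struct packet {
    char   tag;
    size_t length;
};

int main(void)
{
    struct packet *p = NULL;
    printf("length is at offset %zu\n", offsetof(struct packet, length));
    return p == NULL ? 0 : 1;
}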
diff --git a/25.0.2/clang-include/stdint.h b/25.0.2/clang-include/stdint.h
new file mode 100644
index 0000000..3f2fcbc
--- /dev/null
+++ b/25.0.2/clang-include/stdint.h
@@ -0,0 +1,707 @@
+/*===---- stdint.h - Standard header for sized integer types --------------===*\
+ *
+ * Copyright (c) 2009 Chris Lattner
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+\*===----------------------------------------------------------------------===*/
+
+#ifndef __CLANG_STDINT_H
+#define __CLANG_STDINT_H
+
+/* If we're hosted, fall back to the system's stdint.h, which might have
+ * additional definitions.
+ */
+#if __STDC_HOSTED__ && __has_include_next(<stdint.h>)
+
+// C99 7.18.3 Limits of other integer types
+//
+//  Footnote 219, 220: C++ implementations should define these macros only when
+//  __STDC_LIMIT_MACROS is defined before <stdint.h> is included.
+//
+//  Footnote 222: C++ implementations should define these macros only when
+//  __STDC_CONSTANT_MACROS is defined before <stdint.h> is included.
+//
+// C++11 [cstdint.syn]p2:
+//
+//  The macros defined by <cstdint> are provided unconditionally. In particular,
+//  the symbols __STDC_LIMIT_MACROS and __STDC_CONSTANT_MACROS (mentioned in
+//  footnotes 219, 220, and 222 in the C standard) play no role in C++.
+//
+// C11 removed the problematic footnotes.
+//
+// Work around this inconsistency by always defining those macros in C++ mode,
+// so that a C library implementation which follows the C99 standard can be
+// used in C++.
+# ifdef __cplusplus
+#  if !defined(__STDC_LIMIT_MACROS)
+#   define __STDC_LIMIT_MACROS
+#   define __STDC_LIMIT_MACROS_DEFINED_BY_CLANG
+#  endif
+#  if !defined(__STDC_CONSTANT_MACROS)
+#   define __STDC_CONSTANT_MACROS
+#   define __STDC_CONSTANT_MACROS_DEFINED_BY_CLANG
+#  endif
+# endif
+
+# include_next <stdint.h>
+
+# ifdef __STDC_LIMIT_MACROS_DEFINED_BY_CLANG
+#  undef __STDC_LIMIT_MACROS
+#  undef __STDC_LIMIT_MACROS_DEFINED_BY_CLANG
+# endif
+# ifdef __STDC_CONSTANT_MACROS_DEFINED_BY_CLANG
+#  undef __STDC_CONSTANT_MACROS
+#  undef __STDC_CONSTANT_MACROS_DEFINED_BY_CLANG
+# endif
+
+#else
+
+/* C99 7.18.1.1 Exact-width integer types.
+ * C99 7.18.1.2 Minimum-width integer types.
+ * C99 7.18.1.3 Fastest minimum-width integer types.
+ *
+ * The standard requires that exact-width types be defined for 8-, 16-, 32-, and
+ * 64-bit types if they are implemented. Other exact-width types are optional.
+ * This implementation defines an exact-width type for every integer width
+ * that is represented in the standard integer types.
+ *
+ * The standard also requires minimum-width types be defined for 8-, 16-, 32-,
+ * and 64-bit widths regardless of whether there are corresponding exact-width
+ * types.
+ *
+ * To accommodate targets that are missing types that are exactly 8, 16, 32, or
+ * 64 bits wide, this implementation takes an approach of cascading
+ * redefinitions, redefining __int_leastN_t to successively smaller exact-width
+ * types. It is therefore important that the types are defined in order of
+ * descending widths.
+ *
+ * We currently assume that the minimum-width types and the fastest
+ * minimum-width types are the same. This is allowed by the standard, but is
+ * suboptimal.
+ *
+ * In violation of the standard, some targets do not implement a type that is
+ * wide enough to represent all of the required widths (8-, 16-, 32-, 64-bit).
+ * To accommodate these targets, a required minimum-width type is only
+ * defined if there exists an exact-width type of equal or greater width.
+ */
+
+#ifdef __INT64_TYPE__
+# ifndef __int8_t_defined /* glibc sys/types.h also defines int64_t*/
+typedef __INT64_TYPE__ int64_t;
+# endif /* __int8_t_defined */
+typedef __UINT64_TYPE__ uint64_t;
+# define __int_least64_t int64_t
+# define __uint_least64_t uint64_t
+# define __int_least32_t int64_t
+# define __uint_least32_t uint64_t
+# define __int_least16_t int64_t
+# define __uint_least16_t uint64_t
+# define __int_least8_t int64_t
+# define __uint_least8_t uint64_t
+#endif /* __INT64_TYPE__ */
+
+#ifdef __int_least64_t
+typedef __int_least64_t int_least64_t;
+typedef __uint_least64_t uint_least64_t;
+typedef __int_least64_t int_fast64_t;
+typedef __uint_least64_t uint_fast64_t;
+#endif /* __int_least64_t */
+
+#ifdef __INT56_TYPE__
+typedef __INT56_TYPE__ int56_t;
+typedef __UINT56_TYPE__ uint56_t;
+typedef int56_t int_least56_t;
+typedef uint56_t uint_least56_t;
+typedef int56_t int_fast56_t;
+typedef uint56_t uint_fast56_t;
+# define __int_least32_t int56_t
+# define __uint_least32_t uint56_t
+# define __int_least16_t int56_t
+# define __uint_least16_t uint56_t
+# define __int_least8_t int56_t
+# define __uint_least8_t uint56_t
+#endif /* __INT56_TYPE__ */
+
+
+#ifdef __INT48_TYPE__
+typedef __INT48_TYPE__ int48_t;
+typedef __UINT48_TYPE__ uint48_t;
+typedef int48_t int_least48_t;
+typedef uint48_t uint_least48_t;
+typedef int48_t int_fast48_t;
+typedef uint48_t uint_fast48_t;
+# define __int_least32_t int48_t
+# define __uint_least32_t uint48_t
+# define __int_least16_t int48_t
+# define __uint_least16_t uint48_t
+# define __int_least8_t int48_t
+# define __uint_least8_t uint48_t
+#endif /* __INT48_TYPE__ */
+
+
+#ifdef __INT40_TYPE__
+typedef __INT40_TYPE__ int40_t;
+typedef __UINT40_TYPE__ uint40_t;
+typedef int40_t int_least40_t;
+typedef uint40_t uint_least40_t;
+typedef int40_t int_fast40_t;
+typedef uint40_t uint_fast40_t;
+# define __int_least32_t int40_t
+# define __uint_least32_t uint40_t
+# define __int_least16_t int40_t
+# define __uint_least16_t uint40_t
+# define __int_least8_t int40_t
+# define __uint_least8_t uint40_t
+#endif /* __INT40_TYPE__ */
+
+
+#ifdef __INT32_TYPE__
+
+# ifndef __int8_t_defined /* glibc sys/types.h also defines int32_t*/
+typedef __INT32_TYPE__ int32_t;
+# endif /* __int8_t_defined */
+
+# ifndef __uint32_t_defined  /* more glibc compatibility */
+# define __uint32_t_defined
+typedef __UINT32_TYPE__ uint32_t;
+# endif /* __uint32_t_defined */
+
+# define __int_least32_t int32_t
+# define __uint_least32_t uint32_t
+# define __int_least16_t int32_t
+# define __uint_least16_t uint32_t
+# define __int_least8_t int32_t
+# define __uint_least8_t uint32_t
+#endif /* __INT32_TYPE__ */
+
+#ifdef __int_least32_t
+typedef __int_least32_t int_least32_t;
+typedef __uint_least32_t uint_least32_t;
+typedef __int_least32_t int_fast32_t;
+typedef __uint_least32_t uint_fast32_t;
+#endif /* __int_least32_t */
+
+#ifdef __INT24_TYPE__
+typedef __INT24_TYPE__ int24_t;
+typedef __UINT24_TYPE__ uint24_t;
+typedef int24_t int_least24_t;
+typedef uint24_t uint_least24_t;
+typedef int24_t int_fast24_t;
+typedef uint24_t uint_fast24_t;
+# define __int_least16_t int24_t
+# define __uint_least16_t uint24_t
+# define __int_least8_t int24_t
+# define __uint_least8_t uint24_t
+#endif /* __INT24_TYPE__ */
+
+#ifdef __INT16_TYPE__
+#ifndef __int8_t_defined /* glibc sys/types.h also defines int16_t*/
+typedef __INT16_TYPE__ int16_t;
+#endif /* __int8_t_defined */
+typedef __UINT16_TYPE__ uint16_t;
+# define __int_least16_t int16_t
+# define __uint_least16_t uint16_t
+# define __int_least8_t int16_t
+# define __uint_least8_t uint16_t
+#endif /* __INT16_TYPE__ */
+
+#ifdef __int_least16_t
+typedef __int_least16_t int_least16_t;
+typedef __uint_least16_t uint_least16_t;
+typedef __int_least16_t int_fast16_t;
+typedef __uint_least16_t uint_fast16_t;
+#endif /* __int_least16_t */
+
+
+#ifdef __INT8_TYPE__
+#ifndef __int8_t_defined  /* glibc sys/types.h also defines int8_t*/
+typedef __INT8_TYPE__ int8_t;
+#endif /* __int8_t_defined */
+typedef __UINT8_TYPE__ uint8_t;
+# define __int_least8_t int8_t
+# define __uint_least8_t uint8_t
+#endif /* __INT8_TYPE__ */
+
+#ifdef __int_least8_t
+typedef __int_least8_t int_least8_t;
+typedef __uint_least8_t uint_least8_t;
+typedef __int_least8_t int_fast8_t;
+typedef __uint_least8_t uint_fast8_t;
+#endif /* __int_least8_t */
+
+/* prevent glibc sys/types.h from defining conflicting types */
+#ifndef __int8_t_defined
+# define __int8_t_defined
+#endif /* __int8_t_defined */
+
+/* C99 7.18.1.4 Integer types capable of holding object pointers.
+ */
+#define __stdint_join3(a,b,c) a ## b ## c
+
+#define  __intn_t(n) __stdint_join3( int, n, _t)
+#define __uintn_t(n) __stdint_join3(uint, n, _t)
+
+#ifndef _INTPTR_T
+#ifndef __intptr_t_defined
+typedef  __intn_t(__INTPTR_WIDTH__)  intptr_t;
+#define __intptr_t_defined
+#define _INTPTR_T
+#endif
+#endif
+
+#ifndef _UINTPTR_T
+typedef __uintn_t(__INTPTR_WIDTH__) uintptr_t;
+#define _UINTPTR_T
+#endif
+
+/* C99 7.18.1.5 Greatest-width integer types.
+ */
+typedef __INTMAX_TYPE__  intmax_t;
+typedef __UINTMAX_TYPE__ uintmax_t;
+
+/* C99 7.18.4 Macros for minimum-width integer constants.
+ *
+ * The standard requires that integer constant macros be defined for all the
+ * minimum-width types defined above. As 8-, 16-, 32-, and 64-bit minimum-width
+ * types are required, the corresponding integer constant macros are defined
+ * here. This implementation also defines minimum-width types for every other
+ * integer width that the target implements, so corresponding macros are
+ * defined below, too.
+ *
+ * These macros are defined using the same successive-shrinking approach as
+ * the type definitions above. It is likewise important that macros are defined
+ * in order of descending width.
+ *
+ * Note that C++ should not check __STDC_CONSTANT_MACROS here, contrary to the
+ * claims of the C standard (see C++ 18.3.1p2, [cstdint.syn]).
+ */
+
+#define __int_c_join(a, b) a ## b
+#define __int_c(v, suffix) __int_c_join(v, suffix)
+#define __uint_c(v, suffix) __int_c_join(v##U, suffix)
+
+
+#ifdef __INT64_TYPE__
+# ifdef __INT64_C_SUFFIX__
+#  define __int64_c_suffix __INT64_C_SUFFIX__
+#  define __int32_c_suffix __INT64_C_SUFFIX__
+#  define __int16_c_suffix __INT64_C_SUFFIX__
+#  define  __int8_c_suffix __INT64_C_SUFFIX__
+# else
+#  undef __int64_c_suffix
+#  undef __int32_c_suffix
+#  undef __int16_c_suffix
+#  undef  __int8_c_suffix
+# endif /* __INT64_C_SUFFIX__ */
+#endif /* __INT64_TYPE__ */
+
+#ifdef __int_least64_t
+# ifdef __int64_c_suffix
+#  define INT64_C(v) __int_c(v, __int64_c_suffix)
+#  define UINT64_C(v) __uint_c(v, __int64_c_suffix)
+# else
+#  define INT64_C(v) v
+#  define UINT64_C(v) v ## U
+# endif /* __int64_c_suffix */
+#endif /* __int_least64_t */
+
+
+#ifdef __INT56_TYPE__
+# ifdef __INT56_C_SUFFIX__
+#  define INT56_C(v) __int_c(v, __INT56_C_SUFFIX__)
+#  define UINT56_C(v) __uint_c(v, __INT56_C_SUFFIX__)
+#  define __int32_c_suffix __INT56_C_SUFFIX__
+#  define __int16_c_suffix __INT56_C_SUFFIX__
+#  define __int8_c_suffix  __INT56_C_SUFFIX__
+# else
+#  define INT56_C(v) v
+#  define UINT56_C(v) v ## U
+#  undef __int32_c_suffix
+#  undef __int16_c_suffix
+#  undef  __int8_c_suffix
+# endif /* __INT56_C_SUFFIX__ */
+#endif /* __INT56_TYPE__ */
+
+
+#ifdef __INT48_TYPE__
+# ifdef __INT48_C_SUFFIX__
+#  define INT48_C(v) __int_c(v, __INT48_C_SUFFIX__)
+#  define UINT48_C(v) __uint_c(v, __INT48_C_SUFFIX__)
+#  define __int32_c_suffix __INT48_C_SUFFIX__
+#  define __int16_c_suffix __INT48_C_SUFFIX__
+#  define __int8_c_suffix  __INT48_C_SUFFIX__
+# else
+#  define INT48_C(v) v
+#  define UINT48_C(v) v ## U
+#  undef __int32_c_suffix
+#  undef __int16_c_suffix
+#  undef  __int8_c_suffix
+# endif /* __INT48_C_SUFFIX__ */
+#endif /* __INT48_TYPE__ */
+
+
+#ifdef __INT40_TYPE__
+# ifdef __INT40_C_SUFFIX__
+#  define INT40_C(v) __int_c(v, __INT40_C_SUFFIX__)
+#  define UINT40_C(v) __uint_c(v, __INT40_C_SUFFIX__)
+#  define __int32_c_suffix __INT40_C_SUFFIX__
+#  define __int16_c_suffix __INT40_C_SUFFIX__
+#  define __int8_c_suffix  __INT40_C_SUFFIX__
+# else
+#  define INT40_C(v) v
+#  define UINT40_C(v) v ## U
+#  undef __int32_c_suffix
+#  undef __int16_c_suffix
+#  undef  __int8_c_suffix
+# endif /* __INT40_C_SUFFIX__ */
+#endif /* __INT40_TYPE__ */
+
+
+#ifdef __INT32_TYPE__
+# ifdef __INT32_C_SUFFIX__
+#  define __int32_c_suffix __INT32_C_SUFFIX__
+#  define __int16_c_suffix __INT32_C_SUFFIX__
+#  define __int8_c_suffix  __INT32_C_SUFFIX__
+#else
+#  undef __int32_c_suffix
+#  undef __int16_c_suffix
+#  undef  __int8_c_suffix
+# endif /* __INT32_C_SUFFIX__ */
+#endif /* __INT32_TYPE__ */
+
+#ifdef __int_least32_t
+# ifdef __int32_c_suffix
+#  define INT32_C(v) __int_c(v, __int32_c_suffix)
+#  define UINT32_C(v) __uint_c(v, __int32_c_suffix)
+# else
+#  define INT32_C(v) v
+#  define UINT32_C(v) v ## U
+# endif /* __int32_c_suffix */
+#endif /* __int_least32_t */
+
+
+#ifdef __INT24_TYPE__
+# ifdef __INT24_C_SUFFIX__
+#  define INT24_C(v) __int_c(v, __INT24_C_SUFFIX__)
+#  define UINT24_C(v) __uint_c(v, __INT24_C_SUFFIX__)
+#  define __int16_c_suffix __INT24_C_SUFFIX__
+#  define __int8_c_suffix  __INT24_C_SUFFIX__
+# else
+#  define INT24_C(v) v
+#  define UINT24_C(v) v ## U
+#  undef __int16_c_suffix
+#  undef  __int8_c_suffix
+# endif /* __INT24_C_SUFFIX__ */
+#endif /* __INT24_TYPE__ */
+
+
+#ifdef __INT16_TYPE__
+# ifdef __INT16_C_SUFFIX__
+#  define __int16_c_suffix __INT16_C_SUFFIX__
+#  define __int8_c_suffix  __INT16_C_SUFFIX__
+#else
+#  undef __int16_c_suffix
+#  undef  __int8_c_suffix
+# endif /* __INT16_C_SUFFIX__ */
+#endif /* __INT16_TYPE__ */
+
+#ifdef __int_least16_t
+# ifdef __int16_c_suffix
+#  define INT16_C(v) __int_c(v, __int16_c_suffix)
+#  define UINT16_C(v) __uint_c(v, __int16_c_suffix)
+# else
+#  define INT16_C(v) v
+#  define UINT16_C(v) v ## U
+# endif /* __int16_c_suffix */
+#endif /* __int_least16_t */
+
+
+#ifdef __INT8_TYPE__
+# ifdef __INT8_C_SUFFIX__
+#  define __int8_c_suffix __INT8_C_SUFFIX__
+#else
+#  undef  __int8_c_suffix
+# endif /* __INT8_C_SUFFIX__ */
+#endif /* __INT8_TYPE__ */
+
+#ifdef __int_least8_t
+# ifdef __int8_c_suffix
+#  define INT8_C(v) __int_c(v, __int8_c_suffix)
+#  define UINT8_C(v) __uint_c(v, __int8_c_suffix)
+# else
+#  define INT8_C(v) v
+#  define UINT8_C(v) v ## U
+# endif /* __int8_c_suffix */
+#endif /* __int_least8_t */
+
+
+/* C99 7.18.2.1 Limits of exact-width integer types.
+ * C99 7.18.2.2 Limits of minimum-width integer types.
+ * C99 7.18.2.3 Limits of fastest minimum-width integer types.
+ *
+ * The presence of limit macros is completely optional in C99.  This
+ * implementation defines limits for all of the types (exact- and
+ * minimum-width) that it defines above, using the limits of the minimum-width
+ * type for any types that do not have exact-width representations.
+ *
+ * As in the type definitions, this section takes an approach of
+ * successive-shrinking to determine which limits to use for the standard (8,
+ * 16, 32, 64) bit widths when they don't have exact representations. It is
+ * therefore important that the definitions be kept in order of descending
+ * widths.
+ *
+ * Note that C++ should not check __STDC_LIMIT_MACROS here, contrary to the
+ * claims of the C standard (see C++ 18.3.1p2, [cstdint.syn]).
+ */
+
+#ifdef __INT64_TYPE__
+# define INT64_MAX           INT64_C( 9223372036854775807)
+# define INT64_MIN         (-INT64_C( 9223372036854775807)-1)
+# define UINT64_MAX         UINT64_C(18446744073709551615)
+# define __INT_LEAST64_MIN   INT64_MIN
+# define __INT_LEAST64_MAX   INT64_MAX
+# define __UINT_LEAST64_MAX UINT64_MAX
+# define __INT_LEAST32_MIN   INT64_MIN
+# define __INT_LEAST32_MAX   INT64_MAX
+# define __UINT_LEAST32_MAX UINT64_MAX
+# define __INT_LEAST16_MIN   INT64_MIN
+# define __INT_LEAST16_MAX   INT64_MAX
+# define __UINT_LEAST16_MAX UINT64_MAX
+# define __INT_LEAST8_MIN    INT64_MIN
+# define __INT_LEAST8_MAX    INT64_MAX
+# define __UINT_LEAST8_MAX  UINT64_MAX
+#endif /* __INT64_TYPE__ */
+
+#ifdef __INT_LEAST64_MIN
+# define INT_LEAST64_MIN   __INT_LEAST64_MIN
+# define INT_LEAST64_MAX   __INT_LEAST64_MAX
+# define UINT_LEAST64_MAX __UINT_LEAST64_MAX
+# define INT_FAST64_MIN    __INT_LEAST64_MIN
+# define INT_FAST64_MAX    __INT_LEAST64_MAX
+# define UINT_FAST64_MAX  __UINT_LEAST64_MAX
+#endif /* __INT_LEAST64_MIN */
+
+
+#ifdef __INT56_TYPE__
+# define INT56_MAX           INT56_C(36028797018963967)
+# define INT56_MIN         (-INT56_C(36028797018963967)-1)
+# define UINT56_MAX         UINT56_C(72057594037927935)
+# define INT_LEAST56_MIN     INT56_MIN
+# define INT_LEAST56_MAX     INT56_MAX
+# define UINT_LEAST56_MAX   UINT56_MAX
+# define INT_FAST56_MIN      INT56_MIN
+# define INT_FAST56_MAX      INT56_MAX
+# define UINT_FAST56_MAX    UINT56_MAX
+# define __INT_LEAST32_MIN   INT56_MIN
+# define __INT_LEAST32_MAX   INT56_MAX
+# define __UINT_LEAST32_MAX UINT56_MAX
+# define __INT_LEAST16_MIN   INT56_MIN
+# define __INT_LEAST16_MAX   INT56_MAX
+# define __UINT_LEAST16_MAX UINT56_MAX
+# define __INT_LEAST8_MIN    INT56_MIN
+# define __INT_LEAST8_MAX    INT56_MAX
+# define __UINT_LEAST8_MAX  UINT56_MAX
+#endif /* __INT56_TYPE__ */
+
+
+#ifdef __INT48_TYPE__
+# define INT48_MAX           INT48_C(140737488355327)
+# define INT48_MIN         (-INT48_C(140737488355327)-1)
+# define UINT48_MAX         UINT48_C(281474976710655)
+# define INT_LEAST48_MIN     INT48_MIN
+# define INT_LEAST48_MAX     INT48_MAX
+# define UINT_LEAST48_MAX   UINT48_MAX
+# define INT_FAST48_MIN      INT48_MIN
+# define INT_FAST48_MAX      INT48_MAX
+# define UINT_FAST48_MAX    UINT48_MAX
+# define __INT_LEAST32_MIN   INT48_MIN
+# define __INT_LEAST32_MAX   INT48_MAX
+# define __UINT_LEAST32_MAX UINT48_MAX
+# define __INT_LEAST16_MIN   INT48_MIN
+# define __INT_LEAST16_MAX   INT48_MAX
+# define __UINT_LEAST16_MAX UINT48_MAX
+# define __INT_LEAST8_MIN    INT48_MIN
+# define __INT_LEAST8_MAX    INT48_MAX
+# define __UINT_LEAST8_MAX  UINT48_MAX
+#endif /* __INT48_TYPE__ */
+
+
+#ifdef __INT40_TYPE__
+# define INT40_MAX           INT40_C(549755813887)
+# define INT40_MIN         (-INT40_C(549755813887)-1)
+# define UINT40_MAX         UINT40_C(1099511627775)
+# define INT_LEAST40_MIN     INT40_MIN
+# define INT_LEAST40_MAX     INT40_MAX
+# define UINT_LEAST40_MAX   UINT40_MAX
+# define INT_FAST40_MIN      INT40_MIN
+# define INT_FAST40_MAX      INT40_MAX
+# define UINT_FAST40_MAX    UINT40_MAX
+# define __INT_LEAST32_MIN   INT40_MIN
+# define __INT_LEAST32_MAX   INT40_MAX
+# define __UINT_LEAST32_MAX UINT40_MAX
+# define __INT_LEAST16_MIN   INT40_MIN
+# define __INT_LEAST16_MAX   INT40_MAX
+# define __UINT_LEAST16_MAX UINT40_MAX
+# define __INT_LEAST8_MIN    INT40_MIN
+# define __INT_LEAST8_MAX    INT40_MAX
+# define __UINT_LEAST8_MAX  UINT40_MAX
+#endif /* __INT40_TYPE__ */
+
+
+#ifdef __INT32_TYPE__
+# define INT32_MAX           INT32_C(2147483647)
+# define INT32_MIN         (-INT32_C(2147483647)-1)
+# define UINT32_MAX         UINT32_C(4294967295)
+# define __INT_LEAST32_MIN   INT32_MIN
+# define __INT_LEAST32_MAX   INT32_MAX
+# define __UINT_LEAST32_MAX UINT32_MAX
+# define __INT_LEAST16_MIN   INT32_MIN
+# define __INT_LEAST16_MAX   INT32_MAX
+# define __UINT_LEAST16_MAX UINT32_MAX
+# define __INT_LEAST8_MIN    INT32_MIN
+# define __INT_LEAST8_MAX    INT32_MAX
+# define __UINT_LEAST8_MAX  UINT32_MAX
+#endif /* __INT32_TYPE__ */
+
+#ifdef __INT_LEAST32_MIN
+# define INT_LEAST32_MIN   __INT_LEAST32_MIN
+# define INT_LEAST32_MAX   __INT_LEAST32_MAX
+# define UINT_LEAST32_MAX __UINT_LEAST32_MAX
+# define INT_FAST32_MIN    __INT_LEAST32_MIN
+# define INT_FAST32_MAX    __INT_LEAST32_MAX
+# define UINT_FAST32_MAX  __UINT_LEAST32_MAX
+#endif /* __INT_LEAST32_MIN */
+
+
+#ifdef __INT24_TYPE__
+# define INT24_MAX           INT24_C(8388607)
+# define INT24_MIN         (-INT24_C(8388607)-1)
+# define UINT24_MAX         UINT24_C(16777215)
+# define INT_LEAST24_MIN     INT24_MIN
+# define INT_LEAST24_MAX     INT24_MAX
+# define UINT_LEAST24_MAX   UINT24_MAX
+# define INT_FAST24_MIN      INT24_MIN
+# define INT_FAST24_MAX      INT24_MAX
+# define UINT_FAST24_MAX    UINT24_MAX
+# define __INT_LEAST16_MIN   INT24_MIN
+# define __INT_LEAST16_MAX   INT24_MAX
+# define __UINT_LEAST16_MAX UINT24_MAX
+# define __INT_LEAST8_MIN    INT24_MIN
+# define __INT_LEAST8_MAX    INT24_MAX
+# define __UINT_LEAST8_MAX  UINT24_MAX
+#endif /* __INT24_TYPE__ */
+
+
+#ifdef __INT16_TYPE__
+#define INT16_MAX            INT16_C(32767)
+#define INT16_MIN          (-INT16_C(32767)-1)
+#define UINT16_MAX          UINT16_C(65535)
+# define __INT_LEAST16_MIN   INT16_MIN
+# define __INT_LEAST16_MAX   INT16_MAX
+# define __UINT_LEAST16_MAX UINT16_MAX
+# define __INT_LEAST8_MIN    INT16_MIN
+# define __INT_LEAST8_MAX    INT16_MAX
+# define __UINT_LEAST8_MAX  UINT16_MAX
+#endif /* __INT16_TYPE__ */
+
+#ifdef __INT_LEAST16_MIN
+# define INT_LEAST16_MIN   __INT_LEAST16_MIN
+# define INT_LEAST16_MAX   __INT_LEAST16_MAX
+# define UINT_LEAST16_MAX __UINT_LEAST16_MAX
+# define INT_FAST16_MIN    __INT_LEAST16_MIN
+# define INT_FAST16_MAX    __INT_LEAST16_MAX
+# define UINT_FAST16_MAX  __UINT_LEAST16_MAX
+#endif /* __INT_LEAST16_MIN */
+
+
+#ifdef __INT8_TYPE__
+# define INT8_MAX            INT8_C(127)
+# define INT8_MIN          (-INT8_C(127)-1)
+# define UINT8_MAX          UINT8_C(255)
+# define __INT_LEAST8_MIN    INT8_MIN
+# define __INT_LEAST8_MAX    INT8_MAX
+# define __UINT_LEAST8_MAX  UINT8_MAX
+#endif /* __INT8_TYPE__ */
+
+#ifdef __INT_LEAST8_MIN
+# define INT_LEAST8_MIN   __INT_LEAST8_MIN
+# define INT_LEAST8_MAX   __INT_LEAST8_MAX
+# define UINT_LEAST8_MAX __UINT_LEAST8_MAX
+# define INT_FAST8_MIN    __INT_LEAST8_MIN
+# define INT_FAST8_MAX    __INT_LEAST8_MAX
+# define UINT_FAST8_MAX  __UINT_LEAST8_MAX
+#endif /* __INT_LEAST8_MIN */
+
+/* Some utility macros */
+#define  __INTN_MIN(n)  __stdint_join3( INT, n, _MIN)
+#define  __INTN_MAX(n)  __stdint_join3( INT, n, _MAX)
+#define __UINTN_MAX(n)  __stdint_join3(UINT, n, _MAX)
+#define  __INTN_C(n, v) __stdint_join3( INT, n, _C(v))
+#define __UINTN_C(n, v) __stdint_join3(UINT, n, _C(v))
+
+/* C99 7.18.2.4 Limits of integer types capable of holding object pointers. */
+/* C99 7.18.3 Limits of other integer types. */
+
+#define  INTPTR_MIN  __INTN_MIN(__INTPTR_WIDTH__)
+#define  INTPTR_MAX  __INTN_MAX(__INTPTR_WIDTH__)
+#define UINTPTR_MAX __UINTN_MAX(__INTPTR_WIDTH__)
+#define PTRDIFF_MIN  __INTN_MIN(__PTRDIFF_WIDTH__)
+#define PTRDIFF_MAX  __INTN_MAX(__PTRDIFF_WIDTH__)
+#define    SIZE_MAX __UINTN_MAX(__SIZE_WIDTH__)
+
+/* ISO9899:2011 7.20 (C11 Annex K): Define RSIZE_MAX if __STDC_WANT_LIB_EXT1__
+ * is enabled. */
+#if defined(__STDC_WANT_LIB_EXT1__) && __STDC_WANT_LIB_EXT1__ >= 1
+#define   RSIZE_MAX            (SIZE_MAX >> 1)
+#endif
+
+/* C99 7.18.2.5 Limits of greatest-width integer types. */
+#define INTMAX_MIN   __INTN_MIN(__INTMAX_WIDTH__)
+#define INTMAX_MAX   __INTN_MAX(__INTMAX_WIDTH__)
+#define UINTMAX_MAX __UINTN_MAX(__INTMAX_WIDTH__)
+
+/* C99 7.18.3 Limits of other integer types. */
+#define SIG_ATOMIC_MIN __INTN_MIN(__SIG_ATOMIC_WIDTH__)
+#define SIG_ATOMIC_MAX __INTN_MAX(__SIG_ATOMIC_WIDTH__)
+#ifdef __WINT_UNSIGNED__
+# define WINT_MIN       __UINTN_C(__WINT_WIDTH__, 0)
+# define WINT_MAX       __UINTN_MAX(__WINT_WIDTH__)
+#else
+# define WINT_MIN       __INTN_MIN(__WINT_WIDTH__)
+# define WINT_MAX       __INTN_MAX(__WINT_WIDTH__)
+#endif
+
+#ifndef WCHAR_MAX
+# define WCHAR_MAX __WCHAR_MAX__
+#endif
+#ifndef WCHAR_MIN
+# if __WCHAR_MAX__ == __INTN_MAX(__WCHAR_WIDTH__)
+#  define WCHAR_MIN __INTN_MIN(__WCHAR_WIDTH__)
+# else
+#  define WCHAR_MIN __UINTN_C(__WCHAR_WIDTH__, 0)
+# endif
+#endif
+
+/* 7.18.4.2 Macros for greatest-width integer constants. */
+#define INTMAX_C(v)   __INTN_C(__INTMAX_WIDTH__, v)
+#define UINTMAX_C(v) __UINTN_C(__INTMAX_WIDTH__, v)
+
+#endif /* __STDC_HOSTED__ */
+#endif /* __CLANG_STDINT_H */
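A sketch exercising the exact-width types, constant macros, and limits defined above, assuming a hosted toolchain (inttypes.h is pulled in only for the print format macros).

#include <stdint.h>
#include <inttypes.h>
#include <stdio.h>

int main(void)
{
    uint32_t mask = UINT32_C(0xFFFF0000);   /* constant macro from above   */
    int64_t  big  = INT64_C(9000000000);    /* needs a 64-bit constant     */
    int      x    = 42;
    intptr_t addr = (intptr_t)&x;           /* object pointer round-trip   */

    printf("mask = 0x%" PRIX32 "\n", mask);
    printf("big  = %" PRId64 " (INT64_MAX = %" PRId64 ")\n", big, INT64_MAX);
    printf("x    = %d\n", *(int *)(void *)addr);
    return 0;
}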
diff --git a/25.0.2/clang-include/stdnoreturn.h b/25.0.2/clang-include/stdnoreturn.h
new file mode 100644
index 0000000..a7a301d
--- /dev/null
+++ b/25.0.2/clang-include/stdnoreturn.h
@@ -0,0 +1,30 @@
+/*===---- stdnoreturn.h - Standard header for noreturn macro ---------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __STDNORETURN_H
+#define __STDNORETURN_H
+
+#define noreturn _Noreturn
+#define __noreturn_is_defined 1
+
+#endif /* __STDNORETURN_H */
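A sketch of the noreturn macro defined above, assuming a C11 compiler; die is an illustrative name.

#include <stdnoreturn.h>
#include <stdio.h>
#include <stdlib.h>

/* noreturn expands to the C11 _Noreturn function specifier. */
static noreturn void die(const char *msg)
{
    fprintf(stderr, "fatal: %s\n", msg);
    exit(EXIT_FAILURE);
}

int main(void)
{
    die("always exits through here");
}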
diff --git a/25.0.2/clang-include/tbmintrin.h b/25.0.2/clang-include/tbmintrin.h
new file mode 100644
index 0000000..785961c
--- /dev/null
+++ b/25.0.2/clang-include/tbmintrin.h
@@ -0,0 +1,154 @@
+/*===---- tbmintrin.h - TBM intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __X86INTRIN_H
+#error "Never use <tbmintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __TBMINTRIN_H
+#define __TBMINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("tbm")))
+
+#define __bextri_u32(a, b) \
+  ((unsigned int)__builtin_ia32_bextri_u32((unsigned int)(a), \
+                                           (unsigned int)(b)))
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__blcfill_u32(unsigned int a)
+{
+  return a & (a + 1);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__blci_u32(unsigned int a)
+{
+  return a | ~(a + 1);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__blcic_u32(unsigned int a)
+{
+  return ~a & (a + 1);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__blcmsk_u32(unsigned int a)
+{
+  return a ^ (a + 1);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__blcs_u32(unsigned int a)
+{
+  return a | (a + 1);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__blsfill_u32(unsigned int a)
+{
+  return a | (a - 1);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__blsic_u32(unsigned int a)
+{
+  return ~a | (a - 1);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__t1mskc_u32(unsigned int a)
+{
+  return ~a | (a + 1);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__tzmsk_u32(unsigned int a)
+{
+  return ~a & (a - 1);
+}
+
+#ifdef __x86_64__
+#define __bextri_u64(a, b) \
+  ((unsigned long long)__builtin_ia32_bextri_u64((unsigned long long)(a), \
+                                                 (unsigned long long)(b)))
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__blcfill_u64(unsigned long long a)
+{
+  return a & (a + 1);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__blci_u64(unsigned long long a)
+{
+  return a | ~(a + 1);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__blcic_u64(unsigned long long a)
+{
+  return ~a & (a + 1);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__blcmsk_u64(unsigned long long a)
+{
+  return a ^ (a + 1);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__blcs_u64(unsigned long long a)
+{
+  return a | (a + 1);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__blsfill_u64(unsigned long long a)
+{
+  return a | (a - 1);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__blsic_u64(unsigned long long a)
+{
+  return ~a | (a - 1);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__t1mskc_u64(unsigned long long a)
+{
+  return ~a | (a + 1);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__tzmsk_u64(unsigned long long a)
+{
+  return ~a & (a - 1);
+}
+#endif
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __TBMINTRIN_H */
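A sketch of the TBM intrinsics above; per the #error guard they are reached through <x86intrin.h>, and the snippet assumes an x86 target compiled with -mtbm (which defines __TBM__).

#include <stdio.h>
#include <x86intrin.h>      /* tbmintrin.h must not be included directly */

int main(void)
{
#ifdef __TBM__
    unsigned int v = 0x58u;                             /* binary 0101 1000 */
    printf("blcic  = 0x%x\n", __blcic_u32(v));          /* lowest clear bit: 0x1  */
    printf("tzmsk  = 0x%x\n", __tzmsk_u32(v));          /* trailing-zero mask: 0x7 */
    printf("bextri = 0x%x\n", __bextri_u32(v, 0x0403)); /* 4 bits from bit 3: 0xb */
#else
    puts("TBM not available; build with -mtbm on a supporting compiler");
#endif
    return 0;
}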
diff --git a/25.0.2/clang-include/tgmath.h b/25.0.2/clang-include/tgmath.h
new file mode 100644
index 0000000..318e118
--- /dev/null
+++ b/25.0.2/clang-include/tgmath.h
@@ -0,0 +1,1374 @@
+/*===---- tgmath.h - Standard header for type generic math ----------------===*\
+ *
+ * Copyright (c) 2009 Howard Hinnant
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+\*===----------------------------------------------------------------------===*/
+
+#ifndef __TGMATH_H
+#define __TGMATH_H
+
+/* C99 7.22 Type-generic math <tgmath.h>. */
+#include <math.h>
+
+/* C++ handles type genericity with overloading in math.h. */
+#ifndef __cplusplus
+#include <complex.h>
+
+#define _TG_ATTRSp __attribute__((__overloadable__))
+#define _TG_ATTRS __attribute__((__overloadable__, __always_inline__))
+
+// promotion
+
+typedef void _Argument_type_is_not_arithmetic;
+static _Argument_type_is_not_arithmetic __tg_promote(...)
+  __attribute__((__unavailable__,__overloadable__));
+static double               _TG_ATTRSp __tg_promote(int);
+static double               _TG_ATTRSp __tg_promote(unsigned int);
+static double               _TG_ATTRSp __tg_promote(long);
+static double               _TG_ATTRSp __tg_promote(unsigned long);
+static double               _TG_ATTRSp __tg_promote(long long);
+static double               _TG_ATTRSp __tg_promote(unsigned long long);
+static float                _TG_ATTRSp __tg_promote(float);
+static double               _TG_ATTRSp __tg_promote(double);
+static long double          _TG_ATTRSp __tg_promote(long double);
+static float _Complex       _TG_ATTRSp __tg_promote(float _Complex);
+static double _Complex      _TG_ATTRSp __tg_promote(double _Complex);
+static long double _Complex _TG_ATTRSp __tg_promote(long double _Complex);
+
+#define __tg_promote1(__x)           (__typeof__(__tg_promote(__x)))
+#define __tg_promote2(__x, __y)      (__typeof__(__tg_promote(__x) + \
+                                                 __tg_promote(__y)))
+#define __tg_promote3(__x, __y, __z) (__typeof__(__tg_promote(__x) + \
+                                                 __tg_promote(__y) + \
+                                                 __tg_promote(__z)))
+
+// acos
+
+static float
+    _TG_ATTRS
+    __tg_acos(float __x) {return acosf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_acos(double __x) {return acos(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_acos(long double __x) {return acosl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_acos(float _Complex __x) {return cacosf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_acos(double _Complex __x) {return cacos(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_acos(long double _Complex __x) {return cacosl(__x);}
+
+#undef acos
+#define acos(__x) __tg_acos(__tg_promote1((__x))(__x))
+
+// asin
+
+static float
+    _TG_ATTRS
+    __tg_asin(float __x) {return asinf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_asin(double __x) {return asin(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_asin(long double __x) {return asinl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_asin(float _Complex __x) {return casinf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_asin(double _Complex __x) {return casin(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_asin(long double _Complex __x) {return casinl(__x);}
+
+#undef asin
+#define asin(__x) __tg_asin(__tg_promote1((__x))(__x))
+
+// atan
+
+static float
+    _TG_ATTRS
+    __tg_atan(float __x) {return atanf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_atan(double __x) {return atan(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_atan(long double __x) {return atanl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_atan(float _Complex __x) {return catanf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_atan(double _Complex __x) {return catan(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_atan(long double _Complex __x) {return catanl(__x);}
+
+#undef atan
+#define atan(__x) __tg_atan(__tg_promote1((__x))(__x))
+
+// acosh
+
+static float
+    _TG_ATTRS
+    __tg_acosh(float __x) {return acoshf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_acosh(double __x) {return acosh(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_acosh(long double __x) {return acoshl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_acosh(float _Complex __x) {return cacoshf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_acosh(double _Complex __x) {return cacosh(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_acosh(long double _Complex __x) {return cacoshl(__x);}
+
+#undef acosh
+#define acosh(__x) __tg_acosh(__tg_promote1((__x))(__x))
+
+// asinh
+
+static float
+    _TG_ATTRS
+    __tg_asinh(float __x) {return asinhf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_asinh(double __x) {return asinh(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_asinh(long double __x) {return asinhl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_asinh(float _Complex __x) {return casinhf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_asinh(double _Complex __x) {return casinh(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_asinh(long double _Complex __x) {return casinhl(__x);}
+
+#undef asinh
+#define asinh(__x) __tg_asinh(__tg_promote1((__x))(__x))
+
+// atanh
+
+static float
+    _TG_ATTRS
+    __tg_atanh(float __x) {return atanhf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_atanh(double __x) {return atanh(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_atanh(long double __x) {return atanhl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_atanh(float _Complex __x) {return catanhf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_atanh(double _Complex __x) {return catanh(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_atanh(long double _Complex __x) {return catanhl(__x);}
+
+#undef atanh
+#define atanh(__x) __tg_atanh(__tg_promote1((__x))(__x))
+
+// cos
+
+static float
+    _TG_ATTRS
+    __tg_cos(float __x) {return cosf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_cos(double __x) {return cos(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_cos(long double __x) {return cosl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_cos(float _Complex __x) {return ccosf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_cos(double _Complex __x) {return ccos(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_cos(long double _Complex __x) {return ccosl(__x);}
+
+#undef cos
+#define cos(__x) __tg_cos(__tg_promote1((__x))(__x))
+
+// sin
+
+static float
+    _TG_ATTRS
+    __tg_sin(float __x) {return sinf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_sin(double __x) {return sin(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_sin(long double __x) {return sinl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_sin(float _Complex __x) {return csinf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_sin(double _Complex __x) {return csin(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_sin(long double _Complex __x) {return csinl(__x);}
+
+#undef sin
+#define sin(__x) __tg_sin(__tg_promote1((__x))(__x))
+
+// tan
+
+static float
+    _TG_ATTRS
+    __tg_tan(float __x) {return tanf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_tan(double __x) {return tan(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_tan(long double __x) {return tanl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_tan(float _Complex __x) {return ctanf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_tan(double _Complex __x) {return ctan(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_tan(long double _Complex __x) {return ctanl(__x);}
+
+#undef tan
+#define tan(__x) __tg_tan(__tg_promote1((__x))(__x))
+
+// cosh
+
+static float
+    _TG_ATTRS
+    __tg_cosh(float __x) {return coshf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_cosh(double __x) {return cosh(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_cosh(long double __x) {return coshl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_cosh(float _Complex __x) {return ccoshf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_cosh(double _Complex __x) {return ccosh(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_cosh(long double _Complex __x) {return ccoshl(__x);}
+
+#undef cosh
+#define cosh(__x) __tg_cosh(__tg_promote1((__x))(__x))
+
+// sinh
+
+static float
+    _TG_ATTRS
+    __tg_sinh(float __x) {return sinhf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_sinh(double __x) {return sinh(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_sinh(long double __x) {return sinhl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_sinh(float _Complex __x) {return csinhf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_sinh(double _Complex __x) {return csinh(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_sinh(long double _Complex __x) {return csinhl(__x);}
+
+#undef sinh
+#define sinh(__x) __tg_sinh(__tg_promote1((__x))(__x))
+
+// tanh
+
+static float
+    _TG_ATTRS
+    __tg_tanh(float __x) {return tanhf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_tanh(double __x) {return tanh(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_tanh(long double __x) {return tanhl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_tanh(float _Complex __x) {return ctanhf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_tanh(double _Complex __x) {return ctanh(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_tanh(long double _Complex __x) {return ctanhl(__x);}
+
+#undef tanh
+#define tanh(__x) __tg_tanh(__tg_promote1((__x))(__x))
+
+// exp
+
+static float
+    _TG_ATTRS
+    __tg_exp(float __x) {return expf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_exp(double __x) {return exp(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_exp(long double __x) {return expl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_exp(float _Complex __x) {return cexpf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_exp(double _Complex __x) {return cexp(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_exp(long double _Complex __x) {return cexpl(__x);}
+
+#undef exp
+#define exp(__x) __tg_exp(__tg_promote1((__x))(__x))
+
+// log
+
+static float
+    _TG_ATTRS
+    __tg_log(float __x) {return logf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_log(double __x) {return log(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_log(long double __x) {return logl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_log(float _Complex __x) {return clogf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_log(double _Complex __x) {return clog(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_log(long double _Complex __x) {return clogl(__x);}
+
+#undef log
+#define log(__x) __tg_log(__tg_promote1((__x))(__x))
+
+// pow
+
+static float
+    _TG_ATTRS
+    __tg_pow(float __x, float __y) {return powf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_pow(double __x, double __y) {return pow(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_pow(long double __x, long double __y) {return powl(__x, __y);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_pow(float _Complex __x, float _Complex __y) {return cpowf(__x, __y);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_pow(double _Complex __x, double _Complex __y) {return cpow(__x, __y);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_pow(long double _Complex __x, long double _Complex __y)
+    {return cpowl(__x, __y);}
+
+#undef pow
+#define pow(__x, __y) __tg_pow(__tg_promote2((__x), (__y))(__x), \
+                               __tg_promote2((__x), (__y))(__y))
+
+// sqrt
+
+static float
+    _TG_ATTRS
+    __tg_sqrt(float __x) {return sqrtf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_sqrt(double __x) {return sqrt(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_sqrt(long double __x) {return sqrtl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_sqrt(float _Complex __x) {return csqrtf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_sqrt(double _Complex __x) {return csqrt(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_sqrt(long double _Complex __x) {return csqrtl(__x);}
+
+#undef sqrt
+#define sqrt(__x) __tg_sqrt(__tg_promote1((__x))(__x))
+
+// fabs
+
+static float
+    _TG_ATTRS
+    __tg_fabs(float __x) {return fabsf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_fabs(double __x) {return fabs(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_fabs(long double __x) {return fabsl(__x);}
+
+static float
+    _TG_ATTRS
+    __tg_fabs(float _Complex __x) {return cabsf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_fabs(double _Complex __x) {return cabs(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_fabs(long double _Complex __x) {return cabsl(__x);}
+
+#undef fabs
+#define fabs(__x) __tg_fabs(__tg_promote1((__x))(__x))
+
+// atan2
+
+static float
+    _TG_ATTRS
+    __tg_atan2(float __x, float __y) {return atan2f(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_atan2(double __x, double __y) {return atan2(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_atan2(long double __x, long double __y) {return atan2l(__x, __y);}
+
+#undef atan2
+#define atan2(__x, __y) __tg_atan2(__tg_promote2((__x), (__y))(__x), \
+                                   __tg_promote2((__x), (__y))(__y))
+
+// cbrt
+
+static float
+    _TG_ATTRS
+    __tg_cbrt(float __x) {return cbrtf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_cbrt(double __x) {return cbrt(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_cbrt(long double __x) {return cbrtl(__x);}
+
+#undef cbrt
+#define cbrt(__x) __tg_cbrt(__tg_promote1((__x))(__x))
+
+// ceil
+
+static float
+    _TG_ATTRS
+    __tg_ceil(float __x) {return ceilf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_ceil(double __x) {return ceil(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_ceil(long double __x) {return ceill(__x);}
+
+#undef ceil
+#define ceil(__x) __tg_ceil(__tg_promote1((__x))(__x))
+
+// copysign
+
+static float
+    _TG_ATTRS
+    __tg_copysign(float __x, float __y) {return copysignf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_copysign(double __x, double __y) {return copysign(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_copysign(long double __x, long double __y) {return copysignl(__x, __y);}
+
+#undef copysign
+#define copysign(__x, __y) __tg_copysign(__tg_promote2((__x), (__y))(__x), \
+                                         __tg_promote2((__x), (__y))(__y))
+
+// erf
+
+static float
+    _TG_ATTRS
+    __tg_erf(float __x) {return erff(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_erf(double __x) {return erf(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_erf(long double __x) {return erfl(__x);}
+
+#undef erf
+#define erf(__x) __tg_erf(__tg_promote1((__x))(__x))
+
+// erfc
+
+static float
+    _TG_ATTRS
+    __tg_erfc(float __x) {return erfcf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_erfc(double __x) {return erfc(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_erfc(long double __x) {return erfcl(__x);}
+
+#undef erfc
+#define erfc(__x) __tg_erfc(__tg_promote1((__x))(__x))
+
+// exp2
+
+static float
+    _TG_ATTRS
+    __tg_exp2(float __x) {return exp2f(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_exp2(double __x) {return exp2(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_exp2(long double __x) {return exp2l(__x);}
+
+#undef exp2
+#define exp2(__x) __tg_exp2(__tg_promote1((__x))(__x))
+
+// expm1
+
+static float
+    _TG_ATTRS
+    __tg_expm1(float __x) {return expm1f(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_expm1(double __x) {return expm1(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_expm1(long double __x) {return expm1l(__x);}
+
+#undef expm1
+#define expm1(__x) __tg_expm1(__tg_promote1((__x))(__x))
+
+// fdim
+
+static float
+    _TG_ATTRS
+    __tg_fdim(float __x, float __y) {return fdimf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_fdim(double __x, double __y) {return fdim(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_fdim(long double __x, long double __y) {return fdiml(__x, __y);}
+
+#undef fdim
+#define fdim(__x, __y) __tg_fdim(__tg_promote2((__x), (__y))(__x), \
+                                 __tg_promote2((__x), (__y))(__y))
+
+// floor
+
+static float
+    _TG_ATTRS
+    __tg_floor(float __x) {return floorf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_floor(double __x) {return floor(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_floor(long double __x) {return floorl(__x);}
+
+#undef floor
+#define floor(__x) __tg_floor(__tg_promote1((__x))(__x))
+
+// fma
+
+static float
+    _TG_ATTRS
+    __tg_fma(float __x, float __y, float __z)
+    {return fmaf(__x, __y, __z);}
+
+static double
+    _TG_ATTRS
+    __tg_fma(double __x, double __y, double __z)
+    {return fma(__x, __y, __z);}
+
+static long double
+    _TG_ATTRS
+    __tg_fma(long double __x,long double __y, long double __z)
+    {return fmal(__x, __y, __z);}
+
+#undef fma
+#define fma(__x, __y, __z)                                \
+        __tg_fma(__tg_promote3((__x), (__y), (__z))(__x), \
+                 __tg_promote3((__x), (__y), (__z))(__y), \
+                 __tg_promote3((__x), (__y), (__z))(__z))
+
+// fmax
+
+static float
+    _TG_ATTRS
+    __tg_fmax(float __x, float __y) {return fmaxf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_fmax(double __x, double __y) {return fmax(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_fmax(long double __x, long double __y) {return fmaxl(__x, __y);}
+
+#undef fmax
+#define fmax(__x, __y) __tg_fmax(__tg_promote2((__x), (__y))(__x), \
+                                 __tg_promote2((__x), (__y))(__y))
+
+// fmin
+
+static float
+    _TG_ATTRS
+    __tg_fmin(float __x, float __y) {return fminf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_fmin(double __x, double __y) {return fmin(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_fmin(long double __x, long double __y) {return fminl(__x, __y);}
+
+#undef fmin
+#define fmin(__x, __y) __tg_fmin(__tg_promote2((__x), (__y))(__x), \
+                                 __tg_promote2((__x), (__y))(__y))
+
+// fmod
+
+static float
+    _TG_ATTRS
+    __tg_fmod(float __x, float __y) {return fmodf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_fmod(double __x, double __y) {return fmod(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_fmod(long double __x, long double __y) {return fmodl(__x, __y);}
+
+#undef fmod
+#define fmod(__x, __y) __tg_fmod(__tg_promote2((__x), (__y))(__x), \
+                                 __tg_promote2((__x), (__y))(__y))
+
+// frexp
+
+static float
+    _TG_ATTRS
+    __tg_frexp(float __x, int* __y) {return frexpf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_frexp(double __x, int* __y) {return frexp(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_frexp(long double __x, int* __y) {return frexpl(__x, __y);}
+
+#undef frexp
+#define frexp(__x, __y) __tg_frexp(__tg_promote1((__x))(__x), __y)
+
+// hypot
+
+static float
+    _TG_ATTRS
+    __tg_hypot(float __x, float __y) {return hypotf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_hypot(double __x, double __y) {return hypot(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_hypot(long double __x, long double __y) {return hypotl(__x, __y);}
+
+#undef hypot
+#define hypot(__x, __y) __tg_hypot(__tg_promote2((__x), (__y))(__x), \
+                                   __tg_promote2((__x), (__y))(__y))
+
+// ilogb
+
+static int
+    _TG_ATTRS
+    __tg_ilogb(float __x) {return ilogbf(__x);}
+
+static int
+    _TG_ATTRS
+    __tg_ilogb(double __x) {return ilogb(__x);}
+
+static int
+    _TG_ATTRS
+    __tg_ilogb(long double __x) {return ilogbl(__x);}
+
+#undef ilogb
+#define ilogb(__x) __tg_ilogb(__tg_promote1((__x))(__x))
+
+// ldexp
+
+static float
+    _TG_ATTRS
+    __tg_ldexp(float __x, int __y) {return ldexpf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_ldexp(double __x, int __y) {return ldexp(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_ldexp(long double __x, int __y) {return ldexpl(__x, __y);}
+
+#undef ldexp
+#define ldexp(__x, __y) __tg_ldexp(__tg_promote1((__x))(__x), __y)
+
+// lgamma
+
+static float
+    _TG_ATTRS
+    __tg_lgamma(float __x) {return lgammaf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_lgamma(double __x) {return lgamma(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_lgamma(long double __x) {return lgammal(__x);}
+
+#undef lgamma
+#define lgamma(__x) __tg_lgamma(__tg_promote1((__x))(__x))
+
+// llrint
+
+static long long
+    _TG_ATTRS
+    __tg_llrint(float __x) {return llrintf(__x);}
+
+static long long
+    _TG_ATTRS
+    __tg_llrint(double __x) {return llrint(__x);}
+
+static long long
+    _TG_ATTRS
+    __tg_llrint(long double __x) {return llrintl(__x);}
+
+#undef llrint
+#define llrint(__x) __tg_llrint(__tg_promote1((__x))(__x))
+
+// llround
+
+static long long
+    _TG_ATTRS
+    __tg_llround(float __x) {return llroundf(__x);}
+
+static long long
+    _TG_ATTRS
+    __tg_llround(double __x) {return llround(__x);}
+
+static long long
+    _TG_ATTRS
+    __tg_llround(long double __x) {return llroundl(__x);}
+
+#undef llround
+#define llround(__x) __tg_llround(__tg_promote1((__x))(__x))
+
+// log10
+
+static float
+    _TG_ATTRS
+    __tg_log10(float __x) {return log10f(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_log10(double __x) {return log10(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_log10(long double __x) {return log10l(__x);}
+
+#undef log10
+#define log10(__x) __tg_log10(__tg_promote1((__x))(__x))
+
+// log1p
+
+static float
+    _TG_ATTRS
+    __tg_log1p(float __x) {return log1pf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_log1p(double __x) {return log1p(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_log1p(long double __x) {return log1pl(__x);}
+
+#undef log1p
+#define log1p(__x) __tg_log1p(__tg_promote1((__x))(__x))
+
+// log2
+
+static float
+    _TG_ATTRS
+    __tg_log2(float __x) {return log2f(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_log2(double __x) {return log2(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_log2(long double __x) {return log2l(__x);}
+
+#undef log2
+#define log2(__x) __tg_log2(__tg_promote1((__x))(__x))
+
+// logb
+
+static float
+    _TG_ATTRS
+    __tg_logb(float __x) {return logbf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_logb(double __x) {return logb(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_logb(long double __x) {return logbl(__x);}
+
+#undef logb
+#define logb(__x) __tg_logb(__tg_promote1((__x))(__x))
+
+// lrint
+
+static long
+    _TG_ATTRS
+    __tg_lrint(float __x) {return lrintf(__x);}
+
+static long
+    _TG_ATTRS
+    __tg_lrint(double __x) {return lrint(__x);}
+
+static long
+    _TG_ATTRS
+    __tg_lrint(long double __x) {return lrintl(__x);}
+
+#undef lrint
+#define lrint(__x) __tg_lrint(__tg_promote1((__x))(__x))
+
+// lround
+
+static long
+    _TG_ATTRS
+    __tg_lround(float __x) {return lroundf(__x);}
+
+static long
+    _TG_ATTRS
+    __tg_lround(double __x) {return lround(__x);}
+
+static long
+    _TG_ATTRS
+    __tg_lround(long double __x) {return lroundl(__x);}
+
+#undef lround
+#define lround(__x) __tg_lround(__tg_promote1((__x))(__x))
+
+// nearbyint
+
+static float
+    _TG_ATTRS
+    __tg_nearbyint(float __x) {return nearbyintf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_nearbyint(double __x) {return nearbyint(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_nearbyint(long double __x) {return nearbyintl(__x);}
+
+#undef nearbyint
+#define nearbyint(__x) __tg_nearbyint(__tg_promote1((__x))(__x))
+
+// nextafter
+
+static float
+    _TG_ATTRS
+    __tg_nextafter(float __x, float __y) {return nextafterf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_nextafter(double __x, double __y) {return nextafter(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_nextafter(long double __x, long double __y) {return nextafterl(__x, __y);}
+
+#undef nextafter
+#define nextafter(__x, __y) __tg_nextafter(__tg_promote2((__x), (__y))(__x), \
+                                           __tg_promote2((__x), (__y))(__y))
+
+// nexttoward
+
+static float
+    _TG_ATTRS
+    __tg_nexttoward(float __x, long double __y) {return nexttowardf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_nexttoward(double __x, long double __y) {return nexttoward(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_nexttoward(long double __x, long double __y) {return nexttowardl(__x, __y);}
+
+#undef nexttoward
+#define nexttoward(__x, __y) __tg_nexttoward(__tg_promote1((__x))(__x), (__y))
+
+// remainder
+
+static float
+    _TG_ATTRS
+    __tg_remainder(float __x, float __y) {return remainderf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_remainder(double __x, double __y) {return remainder(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_remainder(long double __x, long double __y) {return remainderl(__x, __y);}
+
+#undef remainder
+#define remainder(__x, __y) __tg_remainder(__tg_promote2((__x), (__y))(__x), \
+                                           __tg_promote2((__x), (__y))(__y))
+
+// remquo
+
+static float
+    _TG_ATTRS
+    __tg_remquo(float __x, float __y, int* __z)
+    {return remquof(__x, __y, __z);}
+
+static double
+    _TG_ATTRS
+    __tg_remquo(double __x, double __y, int* __z)
+    {return remquo(__x, __y, __z);}
+
+static long double
+    _TG_ATTRS
+    __tg_remquo(long double __x,long double __y, int* __z)
+    {return remquol(__x, __y, __z);}
+
+#undef remquo
+#define remquo(__x, __y, __z)                         \
+        __tg_remquo(__tg_promote2((__x), (__y))(__x), \
+                    __tg_promote2((__x), (__y))(__y), \
+                    (__z))
+
+// rint
+
+static float
+    _TG_ATTRS
+    __tg_rint(float __x) {return rintf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_rint(double __x) {return rint(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_rint(long double __x) {return rintl(__x);}
+
+#undef rint
+#define rint(__x) __tg_rint(__tg_promote1((__x))(__x))
+
+// round
+
+static float
+    _TG_ATTRS
+    __tg_round(float __x) {return roundf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_round(double __x) {return round(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_round(long double __x) {return roundl(__x);}
+
+#undef round
+#define round(__x) __tg_round(__tg_promote1((__x))(__x))
+
+// scalbn
+
+static float
+    _TG_ATTRS
+    __tg_scalbn(float __x, int __y) {return scalbnf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_scalbn(double __x, int __y) {return scalbn(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_scalbn(long double __x, int __y) {return scalbnl(__x, __y);}
+
+#undef scalbn
+#define scalbn(__x, __y) __tg_scalbn(__tg_promote1((__x))(__x), __y)
+
+// scalbln
+
+static float
+    _TG_ATTRS
+    __tg_scalbln(float __x, long __y) {return scalblnf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_scalbln(double __x, long __y) {return scalbln(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_scalbln(long double __x, long __y) {return scalblnl(__x, __y);}
+
+#undef scalbln
+#define scalbln(__x, __y) __tg_scalbln(__tg_promote1((__x))(__x), __y)
+
+// tgamma
+
+static float
+    _TG_ATTRS
+    __tg_tgamma(float __x) {return tgammaf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_tgamma(double __x) {return tgamma(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_tgamma(long double __x) {return tgammal(__x);}
+
+#undef tgamma
+#define tgamma(__x) __tg_tgamma(__tg_promote1((__x))(__x))
+
+// trunc
+
+static float
+    _TG_ATTRS
+    __tg_trunc(float __x) {return truncf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_trunc(double __x) {return trunc(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_trunc(long double __x) {return truncl(__x);}
+
+#undef trunc
+#define trunc(__x) __tg_trunc(__tg_promote1((__x))(__x))
+
+// carg
+
+static float
+    _TG_ATTRS
+    __tg_carg(float __x) {return atan2f(0.F, __x);}
+
+static double
+    _TG_ATTRS
+    __tg_carg(double __x) {return atan2(0., __x);}
+
+static long double
+    _TG_ATTRS
+    __tg_carg(long double __x) {return atan2l(0.L, __x);}
+
+static float
+    _TG_ATTRS
+    __tg_carg(float _Complex __x) {return cargf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_carg(double _Complex __x) {return carg(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_carg(long double _Complex __x) {return cargl(__x);}
+
+#undef carg
+#define carg(__x) __tg_carg(__tg_promote1((__x))(__x))
+
+// cimag
+
+static float
+    _TG_ATTRS
+    __tg_cimag(float __x) {return 0;}
+
+static double
+    _TG_ATTRS
+    __tg_cimag(double __x) {return 0;}
+
+static long double
+    _TG_ATTRS
+    __tg_cimag(long double __x) {return 0;}
+
+static float
+    _TG_ATTRS
+    __tg_cimag(float _Complex __x) {return cimagf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_cimag(double _Complex __x) {return cimag(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_cimag(long double _Complex __x) {return cimagl(__x);}
+
+#undef cimag
+#define cimag(__x) __tg_cimag(__tg_promote1((__x))(__x))
+
+// conj
+
+static float _Complex
+    _TG_ATTRS
+    __tg_conj(float __x) {return __x;}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_conj(double __x) {return __x;}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_conj(long double __x) {return __x;}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_conj(float _Complex __x) {return conjf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_conj(double _Complex __x) {return conj(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_conj(long double _Complex __x) {return conjl(__x);}
+
+#undef conj
+#define conj(__x) __tg_conj(__tg_promote1((__x))(__x))
+
+// cproj
+
+static float _Complex
+    _TG_ATTRS
+    __tg_cproj(float __x) {return cprojf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_cproj(double __x) {return cproj(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_cproj(long double __x) {return cprojl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_cproj(float _Complex __x) {return cprojf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_cproj(double _Complex __x) {return cproj(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_cproj(long double _Complex __x) {return cprojl(__x);}
+
+#undef cproj
+#define cproj(__x) __tg_cproj(__tg_promote1((__x))(__x))
+
+// creal
+
+static float
+    _TG_ATTRS
+    __tg_creal(float __x) {return __x;}
+
+static double
+    _TG_ATTRS
+    __tg_creal(double __x) {return __x;}
+
+static long double
+    _TG_ATTRS
+    __tg_creal(long double __x) {return __x;}
+
+static float
+    _TG_ATTRS
+    __tg_creal(float _Complex __x) {return crealf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_creal(double _Complex __x) {return creal(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_creal(long double _Complex __x) {return creall(__x);}
+
+#undef creal
+#define creal(__x) __tg_creal(__tg_promote1((__x))(__x))
+
+#undef _TG_ATTRSp
+#undef _TG_ATTRS
+
+#endif /* __cplusplus */
+#endif /* __TGMATH_H */
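Illustrative sketch, not part of the diff above: with this tgmath.h, one call name dispatches on the argument type through the overloadable __tg_* helpers, so sqrt resolves to sqrtf for a float and to csqrt for a double _Complex. A minimal C usage example, assuming a C (not C++) translation unit:

/* Illustrative only: type-generic dispatch via <tgmath.h>. */
#include <stdio.h>
#include <tgmath.h>          /* pulls in <math.h> and <complex.h> */

int main(void) {
    float f = 2.0f;
    double _Complex z = -1.0;          /* treated as -1.0 + 0.0i */
    float rf = sqrt(f);                /* macro dispatches to sqrtf */
    double _Complex rz = sqrt(z);      /* macro dispatches to csqrt */
    printf("sqrtf(2.0f) = %f\n", rf);
    printf("csqrt(-1.0) = %f%+fi\n", creal(rz), cimag(rz));
    return 0;
}

The same promotion machinery drives every macro defined above, including the two- and three-argument forms such as pow, copysign, and fma.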
diff --git a/25.0.2/clang-include/tmmintrin.h b/25.0.2/clang-include/tmmintrin.h
new file mode 100644
index 0000000..0002890
--- /dev/null
+++ b/25.0.2/clang-include/tmmintrin.h
@@ -0,0 +1,221 @@
+/*===---- tmmintrin.h - SSSE3 intrinsics -----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __TMMINTRIN_H
+#define __TMMINTRIN_H
+
+#include <pmmintrin.h>
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("ssse3")))
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_abs_pi8(__m64 __a)
+{
+    return (__m64)__builtin_ia32_pabsb((__v8qi)__a);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_abs_epi8(__m128i __a)
+{
+    return (__m128i)__builtin_ia32_pabsb128((__v16qi)__a);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_abs_pi16(__m64 __a)
+{
+    return (__m64)__builtin_ia32_pabsw((__v4hi)__a);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_abs_epi16(__m128i __a)
+{
+    return (__m128i)__builtin_ia32_pabsw128((__v8hi)__a);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_abs_pi32(__m64 __a)
+{
+    return (__m64)__builtin_ia32_pabsd((__v2si)__a);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_abs_epi32(__m128i __a)
+{
+    return (__m128i)__builtin_ia32_pabsd128((__v4si)__a);
+}
+
+#define _mm_alignr_epi8(a, b, n) __extension__ ({ \
+  (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(a), \
+                                     (__v16qi)(__m128i)(b), (n)); })
+
+#define _mm_alignr_pi8(a, b, n) __extension__ ({ \
+  (__m64)__builtin_ia32_palignr((__v8qi)(__m64)(a), (__v8qi)(__m64)(b), (n)); })
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_hadd_epi16(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_phaddw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_hadd_epi32(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_phaddd128((__v4si)__a, (__v4si)__b);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_hadd_pi16(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_phaddw((__v4hi)__a, (__v4hi)__b);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_hadd_pi32(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_phaddd((__v2si)__a, (__v2si)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_hadds_epi16(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_phaddsw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_hadds_pi16(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_phaddsw((__v4hi)__a, (__v4hi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_hsub_epi16(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_phsubw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_hsub_epi32(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_phsubd128((__v4si)__a, (__v4si)__b);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_hsub_pi16(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_phsubw((__v4hi)__a, (__v4hi)__b);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_hsub_pi32(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_phsubd((__v2si)__a, (__v2si)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_hsubs_epi16(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_phsubsw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_hsubs_pi16(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_phsubsw((__v4hi)__a, (__v4hi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maddubs_epi16(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_maddubs_pi16(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_pmaddubsw((__v8qi)__a, (__v8qi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mulhrs_epi16(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_mulhrs_pi16(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_pmulhrsw((__v4hi)__a, (__v4hi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_shuffle_epi8(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_pshufb128((__v16qi)__a, (__v16qi)__b);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_shuffle_pi8(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_pshufb((__v8qi)__a, (__v8qi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sign_epi8(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_psignb128((__v16qi)__a, (__v16qi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sign_epi16(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_psignw128((__v8hi)__a, (__v8hi)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sign_epi32(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_psignd128((__v4si)__a, (__v4si)__b);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_sign_pi8(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_psignb((__v8qi)__a, (__v8qi)__b);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_sign_pi16(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_psignw((__v4hi)__a, (__v4hi)__b);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_sign_pi32(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_psignd((__v2si)__a, (__v2si)__b);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __TMMINTRIN_H */
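Illustrative sketch, not part of the diff above: the intrinsics declared in this header require an SSSE3-capable target, which the __target__("ssse3") attribute enforces. A minimal usage example, assuming an x86 build with -mssse3:

/* Illustrative only: per-byte absolute value via the SSSE3 PABSB intrinsic.
 * Build with: clang -mssse3 example.c */
#include <stdio.h>
#include <stdint.h>
#include <tmmintrin.h>       /* transitively includes pmmintrin.h/emmintrin.h */

int main(void) {
    __m128i v = _mm_setr_epi8(-1, 2, -3, 4, -5, 6, -7, 8,
                              -9, 10, -11, 12, -13, 14, -15, 16);
    __m128i a = _mm_abs_epi8(v);                 /* |x| for each of 16 bytes */
    int8_t out[16];
    _mm_storeu_si128((__m128i *)out, a);
    for (int i = 0; i < 16; ++i)
        printf("%d ", out[i]);
    printf("\n");
    return 0;
}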
diff --git a/25.0.2/clang-include/unwind.h b/25.0.2/clang-include/unwind.h
new file mode 100644
index 0000000..303d792
--- /dev/null
+++ b/25.0.2/clang-include/unwind.h
@@ -0,0 +1,282 @@
+/*===---- unwind.h - Stack unwinding ----------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/* See "Data Definitions for libgcc_s" in the Linux Standard Base.*/
+
+#ifndef __CLANG_UNWIND_H
+#define __CLANG_UNWIND_H
+
+#if defined(__APPLE__) && __has_include_next(<unwind.h>)
+/* Darwin (from 11.x on) provides an unwind.h. If that's available,
+ * use it. libunwind wraps some of its definitions in #ifdef _GNU_SOURCE,
+ * so define that around the include. */
+# ifndef _GNU_SOURCE
+#  define _SHOULD_UNDEFINE_GNU_SOURCE
+#  define _GNU_SOURCE
+# endif
+// libunwind's unwind.h reflects the current visibility.  However, Mozilla
+// builds with -fvisibility=hidden and relies on gcc's unwind.h to reset the
+// visibility to default and export its contents.  gcc also allows users to
+// override its override by #defining HIDE_EXPORTS (but note, this only obeys
+// the user's -fvisibility setting; it doesn't hide any exports on its own).  We
+// imitate gcc's header here:
+# ifdef HIDE_EXPORTS
+#  include_next <unwind.h>
+# else
+#  pragma GCC visibility push(default)
+#  include_next <unwind.h>
+#  pragma GCC visibility pop
+# endif
+# ifdef _SHOULD_UNDEFINE_GNU_SOURCE
+#  undef _GNU_SOURCE
+#  undef _SHOULD_UNDEFINE_GNU_SOURCE
+# endif
+#else
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* It is a bit strange for a header to play with the visibility of the
+   symbols it declares, but this matches gcc's behavior and some programs
+   depend on it */
+#ifndef HIDE_EXPORTS
+#pragma GCC visibility push(default)
+#endif
+
+typedef uintptr_t _Unwind_Word;
+typedef intptr_t _Unwind_Sword;
+typedef uintptr_t _Unwind_Ptr;
+typedef uintptr_t _Unwind_Internal_Ptr;
+typedef uint64_t _Unwind_Exception_Class;
+
+typedef intptr_t _sleb128_t;
+typedef uintptr_t _uleb128_t;
+
+struct _Unwind_Context;
+struct _Unwind_Exception;
+typedef enum {
+  _URC_NO_REASON = 0,
+  _URC_FOREIGN_EXCEPTION_CAUGHT = 1,
+
+  _URC_FATAL_PHASE2_ERROR = 2,
+  _URC_FATAL_PHASE1_ERROR = 3,
+  _URC_NORMAL_STOP = 4,
+
+  _URC_END_OF_STACK = 5,
+  _URC_HANDLER_FOUND = 6,
+  _URC_INSTALL_CONTEXT = 7,
+  _URC_CONTINUE_UNWIND = 8
+} _Unwind_Reason_Code;
+
+typedef enum {
+  _UA_SEARCH_PHASE = 1,
+  _UA_CLEANUP_PHASE = 2,
+
+  _UA_HANDLER_FRAME = 4,
+  _UA_FORCE_UNWIND = 8,
+  _UA_END_OF_STACK = 16 /* gcc extension to C++ ABI */
+} _Unwind_Action;
+
+typedef void (*_Unwind_Exception_Cleanup_Fn)(_Unwind_Reason_Code,
+                                             struct _Unwind_Exception *);
+
+struct _Unwind_Exception {
+  _Unwind_Exception_Class exception_class;
+  _Unwind_Exception_Cleanup_Fn exception_cleanup;
+  _Unwind_Word private_1;
+  _Unwind_Word private_2;
+  /* The Itanium ABI requires that _Unwind_Exception objects are "double-word
+   * aligned".  GCC has interpreted this to mean "use the maximum useful
+   * alignment for the target"; so do we. */
+} __attribute__((__aligned__));
+
+typedef _Unwind_Reason_Code (*_Unwind_Stop_Fn)(int, _Unwind_Action,
+                                               _Unwind_Exception_Class,
+                                               struct _Unwind_Exception *,
+                                               struct _Unwind_Context *,
+                                               void *);
+
+typedef _Unwind_Reason_Code (*_Unwind_Personality_Fn)(
+    int, _Unwind_Action, _Unwind_Exception_Class, struct _Unwind_Exception *,
+    struct _Unwind_Context *);
+typedef _Unwind_Personality_Fn __personality_routine;
+
+typedef _Unwind_Reason_Code (*_Unwind_Trace_Fn)(struct _Unwind_Context *,
+                                                void *);
+
+#if defined(__arm__) && !defined(__APPLE__)
+
+typedef enum {
+  _UVRSC_CORE = 0,        /* integer register */
+  _UVRSC_VFP = 1,         /* vfp */
+  _UVRSC_WMMXD = 3,       /* Intel WMMX data register */
+  _UVRSC_WMMXC = 4        /* Intel WMMX control register */
+} _Unwind_VRS_RegClass;
+
+typedef enum {
+  _UVRSD_UINT32 = 0,
+  _UVRSD_VFPX = 1,
+  _UVRSD_UINT64 = 3,
+  _UVRSD_FLOAT = 4,
+  _UVRSD_DOUBLE = 5
+} _Unwind_VRS_DataRepresentation;
+
+typedef enum {
+  _UVRSR_OK = 0,
+  _UVRSR_NOT_IMPLEMENTED = 1,
+  _UVRSR_FAILED = 2
+} _Unwind_VRS_Result;
+
+_Unwind_VRS_Result _Unwind_VRS_Get(struct _Unwind_Context *__context,
+  _Unwind_VRS_RegClass __regclass,
+  uint32_t __regno,
+  _Unwind_VRS_DataRepresentation __representation,
+  void *__valuep);
+
+_Unwind_VRS_Result _Unwind_VRS_Set(struct _Unwind_Context *__context,
+  _Unwind_VRS_RegClass __regclass,
+  uint32_t __regno,
+  _Unwind_VRS_DataRepresentation __representation,
+  void *__valuep);
+
+static __inline__
+_Unwind_Word _Unwind_GetGR(struct _Unwind_Context *__context, int __index) {
+  _Unwind_Word __value;
+  _Unwind_VRS_Get(__context, _UVRSC_CORE, __index, _UVRSD_UINT32, &__value);
+  return __value;
+}
+
+static __inline__
+void _Unwind_SetGR(struct _Unwind_Context *__context, int __index,
+                   _Unwind_Word __value) {
+  _Unwind_VRS_Set(__context, _UVRSC_CORE, __index, _UVRSD_UINT32, &__value);
+}
+
+static __inline__
+_Unwind_Word _Unwind_GetIP(struct _Unwind_Context *__context) {
+  _Unwind_Word __ip = _Unwind_GetGR(__context, 15);
+  return __ip & ~(_Unwind_Word)(0x1); /* Remove thumb mode bit. */
+}
+
+static __inline__
+void _Unwind_SetIP(struct _Unwind_Context *__context, _Unwind_Word __value) {
+  _Unwind_Word __thumb_mode_bit = _Unwind_GetGR(__context, 15) & 0x1;
+  _Unwind_SetGR(__context, 15, __value | __thumb_mode_bit);
+}
+#else
+_Unwind_Word _Unwind_GetGR(struct _Unwind_Context *, int);
+void _Unwind_SetGR(struct _Unwind_Context *, int, _Unwind_Word);
+
+_Unwind_Word _Unwind_GetIP(struct _Unwind_Context *);
+void _Unwind_SetIP(struct _Unwind_Context *, _Unwind_Word);
+#endif
+
+
+_Unwind_Word _Unwind_GetIPInfo(struct _Unwind_Context *, int *);
+
+_Unwind_Word _Unwind_GetCFA(struct _Unwind_Context *);
+
+_Unwind_Word _Unwind_GetBSP(struct _Unwind_Context *);
+
+void *_Unwind_GetLanguageSpecificData(struct _Unwind_Context *);
+
+_Unwind_Ptr _Unwind_GetRegionStart(struct _Unwind_Context *);
+
+/* DWARF EH functions; currently not available on Darwin/ARM */
+#if !defined(__APPLE__) || !defined(__arm__)
+
+_Unwind_Reason_Code _Unwind_RaiseException(struct _Unwind_Exception *);
+_Unwind_Reason_Code _Unwind_ForcedUnwind(struct _Unwind_Exception *,
+                                         _Unwind_Stop_Fn, void *);
+void _Unwind_DeleteException(struct _Unwind_Exception *);
+void _Unwind_Resume(struct _Unwind_Exception *);
+_Unwind_Reason_Code _Unwind_Resume_or_Rethrow(struct _Unwind_Exception *);
+
+#endif
+
+_Unwind_Reason_Code _Unwind_Backtrace(_Unwind_Trace_Fn, void *);
+
+/* setjmp(3)/longjmp(3) stuff */
+typedef struct SjLj_Function_Context *_Unwind_FunctionContext_t;
+
+void _Unwind_SjLj_Register(_Unwind_FunctionContext_t);
+void _Unwind_SjLj_Unregister(_Unwind_FunctionContext_t);
+_Unwind_Reason_Code _Unwind_SjLj_RaiseException(struct _Unwind_Exception *);
+_Unwind_Reason_Code _Unwind_SjLj_ForcedUnwind(struct _Unwind_Exception *,
+                                              _Unwind_Stop_Fn, void *);
+void _Unwind_SjLj_Resume(struct _Unwind_Exception *);
+_Unwind_Reason_Code _Unwind_SjLj_Resume_or_Rethrow(struct _Unwind_Exception *);
+
+void *_Unwind_FindEnclosingFunction(void *);
+
+#ifdef __APPLE__
+
+_Unwind_Ptr _Unwind_GetDataRelBase(struct _Unwind_Context *)
+    __attribute__((__unavailable__));
+_Unwind_Ptr _Unwind_GetTextRelBase(struct _Unwind_Context *)
+    __attribute__((__unavailable__));
+
+/* Darwin-specific functions */
+void __register_frame(const void *);
+void __deregister_frame(const void *);
+
+struct dwarf_eh_bases {
+  uintptr_t tbase;
+  uintptr_t dbase;
+  uintptr_t func;
+};
+void *_Unwind_Find_FDE(const void *, struct dwarf_eh_bases *);
+
+void __register_frame_info_bases(const void *, void *, void *, void *)
+  __attribute__((__unavailable__));
+void __register_frame_info(const void *, void *) __attribute__((__unavailable__));
+void __register_frame_info_table_bases(const void *, void*, void *, void *)
+  __attribute__((__unavailable__));
+void __register_frame_info_table(const void *, void *)
+  __attribute__((__unavailable__));
+void __register_frame_table(const void *) __attribute__((__unavailable__));
+void __deregister_frame_info(const void *) __attribute__((__unavailable__));
+void __deregister_frame_info_bases(const void *)__attribute__((__unavailable__));
+
+#else
+
+_Unwind_Ptr _Unwind_GetDataRelBase(struct _Unwind_Context *);
+_Unwind_Ptr _Unwind_GetTextRelBase(struct _Unwind_Context *);
+
+#endif
+
+
+#ifndef HIDE_EXPORTS
+#pragma GCC visibility pop
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
+#endif /* __CLANG_UNWIND_H */
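Illustrative sketch, not part of the diff above: the generic unwinder entry points declared here are what language runtimes and backtrace utilities build on. A minimal example that walks the current stack with _Unwind_Backtrace, assuming the platform unwinder (e.g. libgcc_s or libunwind) is linked in:

/* Illustrative only: print the program counter of each frame on the
 * current stack. */
#include <stdio.h>
#include <unwind.h>

static _Unwind_Reason_Code print_frame(struct _Unwind_Context *ctx, void *arg) {
    int *depth = (int *)arg;
    printf("frame %d: pc=%p\n", (*depth)++, (void *)_Unwind_GetIP(ctx));
    return _URC_NO_REASON;               /* keep unwinding */
}

int main(void) {
    int depth = 0;
    _Unwind_Backtrace(print_frame, &depth);
    return 0;
}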
diff --git a/25.0.2/clang-include/vadefs.h b/25.0.2/clang-include/vadefs.h
new file mode 100644
index 0000000..7fe9a74
--- /dev/null
+++ b/25.0.2/clang-include/vadefs.h
@@ -0,0 +1,65 @@
+/* ===-------- vadefs.h ---------------------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/* Only include this if we are aiming for MSVC compatibility. */
+#ifndef _MSC_VER
+#include_next <vadefs.h>
+#else
+
+#ifndef __clang_vadefs_h
+#define __clang_vadefs_h
+
+#include_next <vadefs.h>
+
+/* Override macros from vadefs.h with definitions that work with Clang. */
+#ifdef _crt_va_start
+#undef _crt_va_start
+#define _crt_va_start(ap, param) __builtin_va_start(ap, param)
+#endif
+#ifdef _crt_va_end
+#undef _crt_va_end
+#define _crt_va_end(ap)          __builtin_va_end(ap)
+#endif
+#ifdef _crt_va_arg
+#undef _crt_va_arg
+#define _crt_va_arg(ap, type)    __builtin_va_arg(ap, type)
+#endif
+
+/* VS 2015 switched to double underscore names, which is an improvement, but now
+ * we have to intercept those names too.
+ */
+#ifdef __crt_va_start
+#undef __crt_va_start
+#define __crt_va_start(ap, param) __builtin_va_start(ap, param)
+#endif
+#ifdef __crt_va_end
+#undef __crt_va_end
+#define __crt_va_end(ap)          __builtin_va_end(ap)
+#endif
+#ifdef __crt_va_arg
+#undef __crt_va_arg
+#define __crt_va_arg(ap, type)    __builtin_va_arg(ap, type)
+#endif
+
+#endif
+#endif
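Illustrative sketch, not part of the diff above: these overrides only take effect when targeting the MSVC CRT, whose <stdarg.h> builds va_start/va_arg/va_end on the _crt_va_ and __crt_va_ macros redefined here, so ordinary portable varargs code needs no changes:

/* Illustrative only: plain <stdarg.h> varargs; under clang-cl these
 * ultimately expand to the __builtin_va_* forms defined above. */
#include <stdarg.h>
#include <stdio.h>

static int sum_ints(int count, ...) {
    va_list ap;
    va_start(ap, count);
    int total = 0;
    for (int i = 0; i < count; ++i)
        total += va_arg(ap, int);
    va_end(ap);
    return total;
}

int main(void) {
    printf("%d\n", sum_ints(3, 1, 2, 3));   /* prints 6 */
    return 0;
}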
diff --git a/25.0.2/clang-include/varargs.h b/25.0.2/clang-include/varargs.h
new file mode 100644
index 0000000..b5477d0
--- /dev/null
+++ b/25.0.2/clang-include/varargs.h
@@ -0,0 +1,26 @@
+/*===---- varargs.h - Variable argument handling -------------------------------------===
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+* THE SOFTWARE.
+*
+*===-----------------------------------------------------------------------===
+*/
+#ifndef __VARARGS_H
+#define __VARARGS_H
+  #error "Please use <stdarg.h> instead of <varargs.h>"
+#endif
diff --git a/25.0.2/clang-include/vecintrin.h b/25.0.2/clang-include/vecintrin.h
new file mode 100644
index 0000000..ca7acb4
--- /dev/null
+++ b/25.0.2/clang-include/vecintrin.h
@@ -0,0 +1,8946 @@
+/*===---- vecintrin.h - Vector intrinsics ----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if defined(__s390x__) && defined(__VEC__)
+
+#define __ATTRS_ai __attribute__((__always_inline__))
+#define __ATTRS_o __attribute__((__overloadable__))
+#define __ATTRS_o_ai __attribute__((__overloadable__, __always_inline__))
+
+#define __constant(PARM) \
+  __attribute__((__enable_if__ ((PARM) == (PARM), \
+     "argument must be a constant integer")))
+#define __constant_range(PARM, LOW, HIGH) \
+  __attribute__((__enable_if__ ((PARM) >= (LOW) && (PARM) <= (HIGH), \
+     "argument must be a constant integer from " #LOW " to " #HIGH)))
+#define __constant_pow2_range(PARM, LOW, HIGH) \
+  __attribute__((__enable_if__ ((PARM) >= (LOW) && (PARM) <= (HIGH) && \
+                                ((PARM) & ((PARM) - 1)) == 0, \
+     "argument must be a constant power of 2 from " #LOW " to " #HIGH)))
+
+/*-- __lcbb -----------------------------------------------------------------*/
+
+extern __ATTRS_o unsigned int
+__lcbb(const void *__ptr, unsigned short __len)
+  __constant_pow2_range(__len, 64, 4096);
+
+#define __lcbb(X, Y) ((__typeof__((__lcbb)((X), (Y)))) \
+  __builtin_s390_lcbb((X), __builtin_constant_p((Y))? \
+                           ((Y) == 64 ? 0 : \
+                            (Y) == 128 ? 1 : \
+                            (Y) == 256 ? 2 : \
+                            (Y) == 512 ? 3 : \
+                            (Y) == 1024 ? 4 : \
+                            (Y) == 2048 ? 5 : \
+                            (Y) == 4096 ? 6 : 0) : 0))
+
+/*-- vec_extract ------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai signed char
+vec_extract(vector signed char __vec, int __index) {
+  return __vec[__index & 15];
+}
+
+static inline __ATTRS_o_ai unsigned char
+vec_extract(vector bool char __vec, int __index) {
+  return __vec[__index & 15];
+}
+
+static inline __ATTRS_o_ai unsigned char
+vec_extract(vector unsigned char __vec, int __index) {
+  return __vec[__index & 15];
+}
+
+static inline __ATTRS_o_ai signed short
+vec_extract(vector signed short __vec, int __index) {
+  return __vec[__index & 7];
+}
+
+static inline __ATTRS_o_ai unsigned short
+vec_extract(vector bool short __vec, int __index) {
+  return __vec[__index & 7];
+}
+
+static inline __ATTRS_o_ai unsigned short
+vec_extract(vector unsigned short __vec, int __index) {
+  return __vec[__index & 7];
+}
+
+static inline __ATTRS_o_ai signed int
+vec_extract(vector signed int __vec, int __index) {
+  return __vec[__index & 3];
+}
+
+static inline __ATTRS_o_ai unsigned int
+vec_extract(vector bool int __vec, int __index) {
+  return __vec[__index & 3];
+}
+
+static inline __ATTRS_o_ai unsigned int
+vec_extract(vector unsigned int __vec, int __index) {
+  return __vec[__index & 3];
+}
+
+static inline __ATTRS_o_ai signed long long
+vec_extract(vector signed long long __vec, int __index) {
+  return __vec[__index & 1];
+}
+
+static inline __ATTRS_o_ai unsigned long long
+vec_extract(vector bool long long __vec, int __index) {
+  return __vec[__index & 1];
+}
+
+static inline __ATTRS_o_ai unsigned long long
+vec_extract(vector unsigned long long __vec, int __index) {
+  return __vec[__index & 1];
+}
+
+static inline __ATTRS_o_ai double
+vec_extract(vector double __vec, int __index) {
+  return __vec[__index & 1];
+}
+
+/*-- vec_insert -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_insert(signed char __scalar, vector signed char __vec, int __index) {
+  __vec[__index & 15] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_insert(unsigned char __scalar, vector bool char __vec, int __index) {
+  vector unsigned char __newvec = (vector unsigned char)__vec;
+  __newvec[__index & 15] = (unsigned char)__scalar;
+  return __newvec;
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_insert(unsigned char __scalar, vector unsigned char __vec, int __index) {
+  __vec[__index & 15] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_insert(signed short __scalar, vector signed short __vec, int __index) {
+  __vec[__index & 7] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_insert(unsigned short __scalar, vector bool short __vec, int __index) {
+  vector unsigned short __newvec = (vector unsigned short)__vec;
+  __newvec[__index & 7] = (unsigned short)__scalar;
+  return __newvec;
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_insert(unsigned short __scalar, vector unsigned short __vec, int __index) {
+  __vec[__index & 7] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_insert(signed int __scalar, vector signed int __vec, int __index) {
+  __vec[__index & 3] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_insert(unsigned int __scalar, vector bool int __vec, int __index) {
+  vector unsigned int __newvec = (vector unsigned int)__vec;
+  __newvec[__index & 3] = __scalar;
+  return __newvec;
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_insert(unsigned int __scalar, vector unsigned int __vec, int __index) {
+  __vec[__index & 3] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_insert(signed long long __scalar, vector signed long long __vec,
+           int __index) {
+  __vec[__index & 1] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_insert(unsigned long long __scalar, vector bool long long __vec,
+           int __index) {
+  vector unsigned long long __newvec = (vector unsigned long long)__vec;
+  __newvec[__index & 1] = __scalar;
+  return __newvec;
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_insert(unsigned long long __scalar, vector unsigned long long __vec,
+           int __index) {
+  __vec[__index & 1] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector double
+vec_insert(double __scalar, vector double __vec, int __index) {
+  __vec[__index & 1] = __scalar;
+  return __vec;
+}
+
+/*-- vec_promote ------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_promote(signed char __scalar, int __index) {
+  const vector signed char __zero = (vector signed char)0;
+  vector signed char __vec = __builtin_shufflevector(__zero, __zero,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
+  __vec[__index & 15] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_promote(unsigned char __scalar, int __index) {
+  const vector unsigned char __zero = (vector unsigned char)0;
+  vector unsigned char __vec = __builtin_shufflevector(__zero, __zero,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
+  __vec[__index & 15] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_promote(signed short __scalar, int __index) {
+  const vector signed short __zero = (vector signed short)0;
+  vector signed short __vec = __builtin_shufflevector(__zero, __zero,
+                                -1, -1, -1, -1, -1, -1, -1, -1);
+  __vec[__index & 7] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_promote(unsigned short __scalar, int __index) {
+  const vector unsigned short __zero = (vector unsigned short)0;
+  vector unsigned short __vec = __builtin_shufflevector(__zero, __zero,
+                                  -1, -1, -1, -1, -1, -1, -1, -1);
+  __vec[__index & 7] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_promote(signed int __scalar, int __index) {
+  const vector signed int __zero = (vector signed int)0;
+  vector signed int __vec = __builtin_shufflevector(__zero, __zero,
+                                                    -1, -1, -1, -1);
+  __vec[__index & 3] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_promote(unsigned int __scalar, int __index) {
+  const vector unsigned int __zero = (vector unsigned int)0;
+  vector unsigned int __vec = __builtin_shufflevector(__zero, __zero,
+                                                      -1, -1, -1, -1);
+  __vec[__index & 3] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_promote(signed long long __scalar, int __index) {
+  const vector signed long long __zero = (vector signed long long)0;
+  vector signed long long __vec = __builtin_shufflevector(__zero, __zero,
+                                                          -1, -1);
+  __vec[__index & 1] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_promote(unsigned long long __scalar, int __index) {
+  const vector unsigned long long __zero = (vector unsigned long long)0;
+  vector unsigned long long __vec = __builtin_shufflevector(__zero, __zero,
+                                                            -1, -1);
+  __vec[__index & 1] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector double
+vec_promote(double __scalar, int __index) {
+  const vector double __zero = (vector double)0;
+  vector double __vec = __builtin_shufflevector(__zero, __zero, -1, -1);
+  __vec[__index & 1] = __scalar;
+  return __vec;
+}
+
+/*-- vec_insert_and_zero ----------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_insert_and_zero(const signed char *__ptr) {
+  vector signed char __vec = (vector signed char)0;
+  __vec[7] = *__ptr;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_insert_and_zero(const unsigned char *__ptr) {
+  vector unsigned char __vec = (vector unsigned char)0;
+  __vec[7] = *__ptr;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_insert_and_zero(const signed short *__ptr) {
+  vector signed short __vec = (vector signed short)0;
+  __vec[3] = *__ptr;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_insert_and_zero(const unsigned short *__ptr) {
+  vector unsigned short __vec = (vector unsigned short)0;
+  __vec[3] = *__ptr;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_insert_and_zero(const signed int *__ptr) {
+  vector signed int __vec = (vector signed int)0;
+  __vec[1] = *__ptr;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_insert_and_zero(const unsigned int *__ptr) {
+  vector unsigned int __vec = (vector unsigned int)0;
+  __vec[1] = *__ptr;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_insert_and_zero(const signed long long *__ptr) {
+  vector signed long long __vec = (vector signed long long)0;
+  __vec[0] = *__ptr;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_insert_and_zero(const unsigned long long *__ptr) {
+  vector unsigned long long __vec = (vector unsigned long long)0;
+  __vec[0] = *__ptr;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector double
+vec_insert_and_zero(const double *__ptr) {
+  vector double __vec = (vector double)0;
+  __vec[0] = *__ptr;
+  return __vec;
+}
+
+/*-- vec_perm ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_perm(vector signed char __a, vector signed char __b,
+         vector unsigned char __c) {
+  return (vector signed char)__builtin_s390_vperm(
+           (vector unsigned char)__a, (vector unsigned char)__b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_perm(vector unsigned char __a, vector unsigned char __b,
+         vector unsigned char __c) {
+  return (vector unsigned char)__builtin_s390_vperm(
+           (vector unsigned char)__a, (vector unsigned char)__b, __c);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_perm(vector bool char __a, vector bool char __b,
+         vector unsigned char __c) {
+  return (vector bool char)__builtin_s390_vperm(
+           (vector unsigned char)__a, (vector unsigned char)__b, __c);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_perm(vector signed short __a, vector signed short __b,
+         vector unsigned char __c) {
+  return (vector signed short)__builtin_s390_vperm(
+           (vector unsigned char)__a, (vector unsigned char)__b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_perm(vector unsigned short __a, vector unsigned short __b,
+         vector unsigned char __c) {
+  return (vector unsigned short)__builtin_s390_vperm(
+           (vector unsigned char)__a, (vector unsigned char)__b, __c);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_perm(vector bool short __a, vector bool short __b,
+         vector unsigned char __c) {
+  return (vector bool short)__builtin_s390_vperm(
+           (vector unsigned char)__a, (vector unsigned char)__b, __c);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_perm(vector signed int __a, vector signed int __b,
+         vector unsigned char __c) {
+  return (vector signed int)__builtin_s390_vperm(
+           (vector unsigned char)__a, (vector unsigned char)__b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_perm(vector unsigned int __a, vector unsigned int __b,
+         vector unsigned char __c) {
+  return (vector unsigned int)__builtin_s390_vperm(
+           (vector unsigned char)__a, (vector unsigned char)__b, __c);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_perm(vector bool int __a, vector bool int __b,
+         vector unsigned char __c) {
+  return (vector bool int)__builtin_s390_vperm(
+           (vector unsigned char)__a, (vector unsigned char)__b, __c);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_perm(vector signed long long __a, vector signed long long __b,
+         vector unsigned char __c) {
+  return (vector signed long long)__builtin_s390_vperm(
+           (vector unsigned char)__a, (vector unsigned char)__b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_perm(vector unsigned long long __a, vector unsigned long long __b,
+         vector unsigned char __c) {
+  return (vector unsigned long long)__builtin_s390_vperm(
+           (vector unsigned char)__a, (vector unsigned char)__b, __c);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_perm(vector bool long long __a, vector bool long long __b,
+         vector unsigned char __c) {
+  return (vector bool long long)__builtin_s390_vperm(
+           (vector unsigned char)__a, (vector unsigned char)__b, __c);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_perm(vector double __a, vector double __b,
+         vector unsigned char __c) {
+  return (vector double)__builtin_s390_vperm(
+           (vector unsigned char)__a, (vector unsigned char)__b, __c);
+}
+
+/*-- vec_permi --------------------------------------------------------------*/
+
+extern __ATTRS_o vector signed long long
+vec_permi(vector signed long long __a, vector signed long long __b, int __c)
+  __constant_range(__c, 0, 3);
+
+extern __ATTRS_o vector unsigned long long
+vec_permi(vector unsigned long long __a, vector unsigned long long __b, int __c)
+  __constant_range(__c, 0, 3);
+
+extern __ATTRS_o vector bool long long
+vec_permi(vector bool long long __a, vector bool long long __b, int __c)
+  __constant_range(__c, 0, 3);
+
+extern __ATTRS_o vector double
+vec_permi(vector double __a, vector double __b, int __c)
+  __constant_range(__c, 0, 3);
+
+#define vec_permi(X, Y, Z) ((__typeof__((vec_permi)((X), (Y), (Z)))) \
+  __builtin_s390_vpdi((vector unsigned long long)(X), \
+                      (vector unsigned long long)(Y), \
+                      (((Z) & 2) << 1) | ((Z) & 1)))
+
+/*-- vec_sel ----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_sel(vector signed char __a, vector signed char __b,
+        vector unsigned char __c) {
+  return ((vector signed char)__c & __b) | (~(vector signed char)__c & __a);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_sel(vector signed char __a, vector signed char __b, vector bool char __c) {
+  return ((vector signed char)__c & __b) | (~(vector signed char)__c & __a);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_sel(vector bool char __a, vector bool char __b, vector unsigned char __c) {
+  return ((vector bool char)__c & __b) | (~(vector bool char)__c & __a);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_sel(vector bool char __a, vector bool char __b, vector bool char __c) {
+  return (__c & __b) | (~__c & __a);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_sel(vector unsigned char __a, vector unsigned char __b,
+        vector unsigned char __c) {
+  return (__c & __b) | (~__c & __a);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_sel(vector unsigned char __a, vector unsigned char __b,
+        vector bool char __c) {
+  return ((vector unsigned char)__c & __b) | (~(vector unsigned char)__c & __a);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_sel(vector signed short __a, vector signed short __b,
+        vector unsigned short __c) {
+  return ((vector signed short)__c & __b) | (~(vector signed short)__c & __a);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_sel(vector signed short __a, vector signed short __b,
+        vector bool short __c) {
+  return ((vector signed short)__c & __b) | (~(vector signed short)__c & __a);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_sel(vector bool short __a, vector bool short __b,
+        vector unsigned short __c) {
+  return ((vector bool short)__c & __b) | (~(vector bool short)__c & __a);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_sel(vector bool short __a, vector bool short __b, vector bool short __c) {
+  return (__c & __b) | (~__c & __a);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_sel(vector unsigned short __a, vector unsigned short __b,
+        vector unsigned short __c) {
+  return (__c & __b) | (~__c & __a);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_sel(vector unsigned short __a, vector unsigned short __b,
+        vector bool short __c) {
+  return (((vector unsigned short)__c & __b) |
+          (~(vector unsigned short)__c & __a));
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_sel(vector signed int __a, vector signed int __b,
+        vector unsigned int __c) {
+  return ((vector signed int)__c & __b) | (~(vector signed int)__c & __a);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_sel(vector signed int __a, vector signed int __b, vector bool int __c) {
+  return ((vector signed int)__c & __b) | (~(vector signed int)__c & __a);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_sel(vector bool int __a, vector bool int __b, vector unsigned int __c) {
+  return ((vector bool int)__c & __b) | (~(vector bool int)__c & __a);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_sel(vector bool int __a, vector bool int __b, vector bool int __c) {
+  return (__c & __b) | (~__c & __a);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_sel(vector unsigned int __a, vector unsigned int __b,
+        vector unsigned int __c) {
+  return (__c & __b) | (~__c & __a);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_sel(vector unsigned int __a, vector unsigned int __b, vector bool int __c) {
+  return ((vector unsigned int)__c & __b) | (~(vector unsigned int)__c & __a);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_sel(vector signed long long __a, vector signed long long __b,
+        vector unsigned long long __c) {
+  return (((vector signed long long)__c & __b) |
+          (~(vector signed long long)__c & __a));
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_sel(vector signed long long __a, vector signed long long __b,
+        vector bool long long __c) {
+  return (((vector signed long long)__c & __b) |
+          (~(vector signed long long)__c & __a));
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_sel(vector bool long long __a, vector bool long long __b,
+        vector unsigned long long __c) {
+  return (((vector bool long long)__c & __b) |
+          (~(vector bool long long)__c & __a));
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_sel(vector bool long long __a, vector bool long long __b,
+        vector bool long long __c) {
+  return (__c & __b) | (~__c & __a);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_sel(vector unsigned long long __a, vector unsigned long long __b,
+        vector unsigned long long __c) {
+  return (__c & __b) | (~__c & __a);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_sel(vector unsigned long long __a, vector unsigned long long __b,
+        vector bool long long __c) {
+  return (((vector unsigned long long)__c & __b) |
+          (~(vector unsigned long long)__c & __a));
+}
+
+static inline __ATTRS_o_ai vector double
+vec_sel(vector double __a, vector double __b, vector unsigned long long __c) {
+  return (vector double)((__c & (vector unsigned long long)__b) |
+                         (~__c & (vector unsigned long long)__a));
+}
+
+static inline __ATTRS_o_ai vector double
+vec_sel(vector double __a, vector double __b, vector bool long long __c) {
+  vector unsigned long long __ac = (vector unsigned long long)__a;
+  vector unsigned long long __bc = (vector unsigned long long)__b;
+  vector unsigned long long __cc = (vector unsigned long long)__c;
+  return (vector double)((__cc & __bc) | (~__cc & __ac));
+}
+
+/*-- vec_gather_element -----------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed int
+vec_gather_element(vector signed int __vec, vector unsigned int __offset,
+                   const signed int *__ptr, int __index)
+  __constant_range(__index, 0, 3) {
+  __vec[__index] = *(const signed int *)(
+    (__INTPTR_TYPE__)__ptr + (__INTPTR_TYPE__)__offset[__index]);
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_gather_element(vector bool int __vec, vector unsigned int __offset,
+                   const unsigned int *__ptr, int __index)
+  __constant_range(__index, 0, 3) {
+  __vec[__index] = *(const unsigned int *)(
+    (__INTPTR_TYPE__)__ptr + (__INTPTR_TYPE__)__offset[__index]);
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_gather_element(vector unsigned int __vec, vector unsigned int __offset,
+                   const unsigned int *__ptr, int __index)
+  __constant_range(__index, 0, 3) {
+  __vec[__index] = *(const unsigned int *)(
+    (__INTPTR_TYPE__)__ptr + (__INTPTR_TYPE__)__offset[__index]);
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_gather_element(vector signed long long __vec,
+                   vector unsigned long long __offset,
+                   const signed long long *__ptr, int __index)
+  __constant_range(__index, 0, 1) {
+  __vec[__index] = *(const signed long long *)(
+    (__INTPTR_TYPE__)__ptr + (__INTPTR_TYPE__)__offset[__index]);
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_gather_element(vector bool long long __vec,
+                   vector unsigned long long __offset,
+                   const unsigned long long *__ptr, int __index)
+  __constant_range(__index, 0, 1) {
+  __vec[__index] = *(const unsigned long long *)(
+    (__INTPTR_TYPE__)__ptr + (__INTPTR_TYPE__)__offset[__index]);
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_gather_element(vector unsigned long long __vec,
+                   vector unsigned long long __offset,
+                   const unsigned long long *__ptr, int __index)
+  __constant_range(__index, 0, 1) {
+  __vec[__index] = *(const unsigned long long *)(
+    (__INTPTR_TYPE__)__ptr + (__INTPTR_TYPE__)__offset[__index]);
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector double
+vec_gather_element(vector double __vec, vector unsigned long long __offset,
+                   const double *__ptr, int __index)
+  __constant_range(__index, 0, 1) {
+  __vec[__index] = *(const double *)(
+    (__INTPTR_TYPE__)__ptr + (__INTPTR_TYPE__)__offset[__index]);
+  return __vec;
+}
+
+/*-- vec_scatter_element ----------------------------------------------------*/
+
+static inline __ATTRS_o_ai void
+vec_scatter_element(vector signed int __vec, vector unsigned int __offset,
+                    signed int *__ptr, int __index)
+  __constant_range(__index, 0, 3) {
+  *(signed int *)((__INTPTR_TYPE__)__ptr + __offset[__index]) =
+    __vec[__index];
+}
+
+static inline __ATTRS_o_ai void
+vec_scatter_element(vector bool int __vec, vector unsigned int __offset,
+                    unsigned int *__ptr, int __index)
+  __constant_range(__index, 0, 3) {
+  *(unsigned int *)((__INTPTR_TYPE__)__ptr + __offset[__index]) =
+    __vec[__index];
+}
+
+static inline __ATTRS_o_ai void
+vec_scatter_element(vector unsigned int __vec, vector unsigned int __offset,
+                    unsigned int *__ptr, int __index)
+  __constant_range(__index, 0, 3) {
+  *(unsigned int *)((__INTPTR_TYPE__)__ptr + __offset[__index]) =
+    __vec[__index];
+}
+
+static inline __ATTRS_o_ai void
+vec_scatter_element(vector signed long long __vec,
+                    vector unsigned long long __offset,
+                    signed long long *__ptr, int __index)
+  __constant_range(__index, 0, 1) {
+  *(signed long long *)((__INTPTR_TYPE__)__ptr + __offset[__index]) =
+    __vec[__index];
+}
+
+static inline __ATTRS_o_ai void
+vec_scatter_element(vector bool long long __vec,
+                    vector unsigned long long __offset,
+                    unsigned long long *__ptr, int __index)
+  __constant_range(__index, 0, 1) {
+  *(unsigned long long *)((__INTPTR_TYPE__)__ptr + __offset[__index]) =
+    __vec[__index];
+}
+
+static inline __ATTRS_o_ai void
+vec_scatter_element(vector unsigned long long __vec,
+                    vector unsigned long long __offset,
+                    unsigned long long *__ptr, int __index)
+  __constant_range(__index, 0, 1) {
+  *(unsigned long long *)((__INTPTR_TYPE__)__ptr + __offset[__index]) =
+    __vec[__index];
+}
+
+static inline __ATTRS_o_ai void
+vec_scatter_element(vector double __vec, vector unsigned long long __offset,
+                    double *__ptr, int __index)
+  __constant_range(__index, 0, 1) {
+  *(double *)((__INTPTR_TYPE__)__ptr + __offset[__index]) =
+    __vec[__index];
+}
+
+/*-- vec_xld2 ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_xld2(long __offset, const signed char *__ptr) {
+  return *(const vector signed char *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_xld2(long __offset, const unsigned char *__ptr) {
+  return *(const vector unsigned char *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_xld2(long __offset, const signed short *__ptr) {
+  return *(const vector signed short *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_xld2(long __offset, const unsigned short *__ptr) {
+  return *(const vector unsigned short *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_xld2(long __offset, const signed int *__ptr) {
+  return *(const vector signed int *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_xld2(long __offset, const unsigned int *__ptr) {
+  return *(const vector unsigned int *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_xld2(long __offset, const signed long long *__ptr) {
+  return *(const vector signed long long *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_xld2(long __offset, const unsigned long long *__ptr) {
+  return *(const vector unsigned long long *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_xld2(long __offset, const double *__ptr) {
+  return *(const vector double *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+/*-- vec_xlw4 ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_xlw4(long __offset, const signed char *__ptr) {
+  return *(const vector signed char *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_xlw4(long __offset, const unsigned char *__ptr) {
+  return *(const vector unsigned char *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_xlw4(long __offset, const signed short *__ptr) {
+  return *(const vector signed short *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_xlw4(long __offset, const unsigned short *__ptr) {
+  return *(const vector unsigned short *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_xlw4(long __offset, const signed int *__ptr) {
+  return *(const vector signed int *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_xlw4(long __offset, const unsigned int *__ptr) {
+  return *(const vector unsigned int *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+/*-- vec_xstd2 --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai void
+vec_xstd2(vector signed char __vec, long __offset, signed char *__ptr) {
+  *(vector signed char *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xstd2(vector unsigned char __vec, long __offset, unsigned char *__ptr) {
+  *(vector unsigned char *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xstd2(vector signed short __vec, long __offset, signed short *__ptr) {
+  *(vector signed short *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xstd2(vector unsigned short __vec, long __offset, unsigned short *__ptr) {
+  *(vector unsigned short *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xstd2(vector signed int __vec, long __offset, signed int *__ptr) {
+  *(vector signed int *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xstd2(vector unsigned int __vec, long __offset, unsigned int *__ptr) {
+  *(vector unsigned int *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xstd2(vector signed long long __vec, long __offset,
+          signed long long *__ptr) {
+  *(vector signed long long *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xstd2(vector unsigned long long __vec, long __offset,
+          unsigned long long *__ptr) {
+  *(vector unsigned long long *)((__INTPTR_TYPE__)__ptr + __offset) =
+    __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xstd2(vector double __vec, long __offset, double *__ptr) {
+  *(vector double *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+/*-- vec_xstw4 --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai void
+vec_xstw4(vector signed char __vec, long __offset, signed char *__ptr) {
+  *(vector signed char *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xstw4(vector unsigned char __vec, long __offset, unsigned char *__ptr) {
+  *(vector unsigned char *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xstw4(vector signed short __vec, long __offset, signed short *__ptr) {
+  *(vector signed short *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xstw4(vector unsigned short __vec, long __offset, unsigned short *__ptr) {
+  *(vector unsigned short *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xstw4(vector signed int __vec, long __offset, signed int *__ptr) {
+  *(vector signed int *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xstw4(vector unsigned int __vec, long __offset, unsigned int *__ptr) {
+  *(vector unsigned int *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+/*-- vec_load_bndry ---------------------------------------------------------*/
+
+extern __ATTRS_o vector signed char
+vec_load_bndry(const signed char *__ptr, unsigned short __len)
+  __constant_pow2_range(__len, 64, 4096);
+
+extern __ATTRS_o vector unsigned char
+vec_load_bndry(const unsigned char *__ptr, unsigned short __len)
+  __constant_pow2_range(__len, 64, 4096);
+
+extern __ATTRS_o vector signed short
+vec_load_bndry(const signed short *__ptr, unsigned short __len)
+  __constant_pow2_range(__len, 64, 4096);
+
+extern __ATTRS_o vector unsigned short
+vec_load_bndry(const unsigned short *__ptr, unsigned short __len)
+  __constant_pow2_range(__len, 64, 4096);
+
+extern __ATTRS_o vector signed int
+vec_load_bndry(const signed int *__ptr, unsigned short __len)
+  __constant_pow2_range(__len, 64, 4096);
+
+extern __ATTRS_o vector unsigned int
+vec_load_bndry(const unsigned int *__ptr, unsigned short __len)
+  __constant_pow2_range(__len, 64, 4096);
+
+extern __ATTRS_o vector signed long long
+vec_load_bndry(const signed long long *__ptr, unsigned short __len)
+  __constant_pow2_range(__len, 64, 4096);
+
+extern __ATTRS_o vector unsigned long long
+vec_load_bndry(const unsigned long long *__ptr, unsigned short __len)
+  __constant_pow2_range(__len, 64, 4096);
+
+extern __ATTRS_o vector double
+vec_load_bndry(const double *__ptr, unsigned short __len)
+  __constant_pow2_range(__len, 64, 4096);
+
+#define vec_load_bndry(X, Y) ((__typeof__((vec_load_bndry)((X), (Y)))) \
+  __builtin_s390_vlbb((X), ((Y) == 64 ? 0 : \
+                            (Y) == 128 ? 1 : \
+                            (Y) == 256 ? 2 : \
+                            (Y) == 512 ? 3 : \
+                            (Y) == 1024 ? 4 : \
+                            (Y) == 2048 ? 5 : \
+                            (Y) == 4096 ? 6 : -1)))
+
+/*-- vec_load_len -----------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_load_len(const signed char *__ptr, unsigned int __len) {
+  return (vector signed char)__builtin_s390_vll(__len, __ptr);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_load_len(const unsigned char *__ptr, unsigned int __len) {
+  return (vector unsigned char)__builtin_s390_vll(__len, __ptr);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_load_len(const signed short *__ptr, unsigned int __len) {
+  return (vector signed short)__builtin_s390_vll(__len, __ptr);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_load_len(const unsigned short *__ptr, unsigned int __len) {
+  return (vector unsigned short)__builtin_s390_vll(__len, __ptr);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_load_len(const signed int *__ptr, unsigned int __len) {
+  return (vector signed int)__builtin_s390_vll(__len, __ptr);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_load_len(const unsigned int *__ptr, unsigned int __len) {
+  return (vector unsigned int)__builtin_s390_vll(__len, __ptr);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_load_len(const signed long long *__ptr, unsigned int __len) {
+  return (vector signed long long)__builtin_s390_vll(__len, __ptr);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_load_len(const unsigned long long *__ptr, unsigned int __len) {
+  return (vector unsigned long long)__builtin_s390_vll(__len, __ptr);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_load_len(const double *__ptr, unsigned int __len) {
+  return (vector double)__builtin_s390_vll(__len, __ptr);
+}
+
+/*-- vec_store_len ----------------------------------------------------------*/
+
+static inline __ATTRS_o_ai void
+vec_store_len(vector signed char __vec, signed char *__ptr,
+              unsigned int __len) {
+  __builtin_s390_vstl((vector signed char)__vec, __len, __ptr);
+}
+
+static inline __ATTRS_o_ai void
+vec_store_len(vector unsigned char __vec, unsigned char *__ptr,
+              unsigned int __len) {
+  __builtin_s390_vstl((vector signed char)__vec, __len, __ptr);
+}
+
+static inline __ATTRS_o_ai void
+vec_store_len(vector signed short __vec, signed short *__ptr,
+              unsigned int __len) {
+  __builtin_s390_vstl((vector signed char)__vec, __len, __ptr);
+}
+
+static inline __ATTRS_o_ai void
+vec_store_len(vector unsigned short __vec, unsigned short *__ptr,
+              unsigned int __len) {
+  __builtin_s390_vstl((vector signed char)__vec, __len, __ptr);
+}
+
+static inline __ATTRS_o_ai void
+vec_store_len(vector signed int __vec, signed int *__ptr,
+              unsigned int __len) {
+  __builtin_s390_vstl((vector signed char)__vec, __len, __ptr);
+}
+
+static inline __ATTRS_o_ai void
+vec_store_len(vector unsigned int __vec, unsigned int *__ptr,
+              unsigned int __len) {
+  __builtin_s390_vstl((vector signed char)__vec, __len, __ptr);
+}
+
+static inline __ATTRS_o_ai void
+vec_store_len(vector signed long long __vec, signed long long *__ptr,
+              unsigned int __len) {
+  __builtin_s390_vstl((vector signed char)__vec, __len, __ptr);
+}
+
+static inline __ATTRS_o_ai void
+vec_store_len(vector unsigned long long __vec, unsigned long long *__ptr,
+              unsigned int __len) {
+  __builtin_s390_vstl((vector signed char)__vec, __len, __ptr);
+}
+
+static inline __ATTRS_o_ai void
+vec_store_len(vector double __vec, double *__ptr,
+              unsigned int __len) {
+  __builtin_s390_vstl((vector signed char)__vec, __len, __ptr);
+}
+
+/*-- vec_load_pair ----------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed long long
+vec_load_pair(signed long long __a, signed long long __b) {
+  return (vector signed long long)(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_load_pair(unsigned long long __a, unsigned long long __b) {
+  return (vector unsigned long long)(__a, __b);
+}
+
+/*-- vec_genmask ------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_genmask(unsigned short __mask)
+  __constant(__mask) {
+  return (vector unsigned char)(
+    __mask & 0x8000 ? 0xff : 0,
+    __mask & 0x4000 ? 0xff : 0,
+    __mask & 0x2000 ? 0xff : 0,
+    __mask & 0x1000 ? 0xff : 0,
+    __mask & 0x0800 ? 0xff : 0,
+    __mask & 0x0400 ? 0xff : 0,
+    __mask & 0x0200 ? 0xff : 0,
+    __mask & 0x0100 ? 0xff : 0,
+    __mask & 0x0080 ? 0xff : 0,
+    __mask & 0x0040 ? 0xff : 0,
+    __mask & 0x0020 ? 0xff : 0,
+    __mask & 0x0010 ? 0xff : 0,
+    __mask & 0x0008 ? 0xff : 0,
+    __mask & 0x0004 ? 0xff : 0,
+    __mask & 0x0002 ? 0xff : 0,
+    __mask & 0x0001 ? 0xff : 0);
+}
+
+/*-- vec_genmasks_* ---------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_genmasks_8(unsigned char __first, unsigned char __last)
+  __constant(__first) __constant(__last) {
+  unsigned char __bit1 = __first & 7;
+  unsigned char __bit2 = __last & 7;
+  unsigned char __mask1 = (unsigned char)(1U << (7 - __bit1) << 1) - 1;
+  unsigned char __mask2 = (unsigned char)(1U << (7 - __bit2)) - 1;
+  unsigned char __value = (__bit1 <= __bit2 ?
+                           __mask1 & ~__mask2 :
+                           __mask1 | ~__mask2);
+  return (vector unsigned char)__value;
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_genmasks_16(unsigned char __first, unsigned char __last)
+  __constant(__first) __constant(__last) {
+  unsigned char __bit1 = __first & 15;
+  unsigned char __bit2 = __last & 15;
+  unsigned short __mask1 = (unsigned short)(1U << (15 - __bit1) << 1) - 1;
+  unsigned short __mask2 = (unsigned short)(1U << (15 - __bit2)) - 1;
+  unsigned short __value = (__bit1 <= __bit2 ?
+                            __mask1 & ~__mask2 :
+                            __mask1 | ~__mask2);
+  return (vector unsigned short)__value;
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_genmasks_32(unsigned char __first, unsigned char __last)
+  __constant(__first) __constant(__last) {
+  unsigned char __bit1 = __first & 31;
+  unsigned char __bit2 = __last & 31;
+  unsigned int __mask1 = (1U << (31 - __bit1) << 1) - 1;
+  unsigned int __mask2 = (1U << (31 - __bit2)) - 1;
+  unsigned int __value = (__bit1 <= __bit2 ?
+                          __mask1 & ~__mask2 :
+                          __mask1 | ~__mask2);
+  return (vector unsigned int)__value;
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_genmasks_64(unsigned char __first, unsigned char __last)
+  __constant(__first) __constant(__last) {
+  unsigned char __bit1 = __first & 63;
+  unsigned char __bit2 = __last & 63;
+  unsigned long long __mask1 = (1ULL << (63 - __bit1) << 1) - 1;
+  unsigned long long __mask2 = (1ULL << (63 - __bit2)) - 1;
+  unsigned long long __value = (__bit1 <= __bit2 ?
+                                __mask1 & ~__mask2 :
+                                __mask1 | ~__mask2);
+  return (vector unsigned long long)__value;
+}
+
+/*-- vec_splat --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_splat(vector signed char __vec, int __index)
+  __constant_range(__index, 0, 15) {
+  return (vector signed char)__vec[__index];
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_splat(vector bool char __vec, int __index)
+  __constant_range(__index, 0, 15) {
+  return (vector bool char)(vector unsigned char)__vec[__index];
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_splat(vector unsigned char __vec, int __index)
+  __constant_range(__index, 0, 15) {
+  return (vector unsigned char)__vec[__index];
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_splat(vector signed short __vec, int __index)
+  __constant_range(__index, 0, 7) {
+  return (vector signed short)__vec[__index];
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_splat(vector bool short __vec, int __index)
+  __constant_range(__index, 0, 7) {
+  return (vector bool short)(vector unsigned short)__vec[__index];
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_splat(vector unsigned short __vec, int __index)
+  __constant_range(__index, 0, 7) {
+  return (vector unsigned short)__vec[__index];
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_splat(vector signed int __vec, int __index)
+  __constant_range(__index, 0, 3) {
+  return (vector signed int)__vec[__index];
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_splat(vector bool int __vec, int __index)
+  __constant_range(__index, 0, 3) {
+  return (vector bool int)(vector unsigned int)__vec[__index];
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_splat(vector unsigned int __vec, int __index)
+  __constant_range(__index, 0, 3) {
+  return (vector unsigned int)__vec[__index];
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_splat(vector signed long long __vec, int __index)
+  __constant_range(__index, 0, 1) {
+  return (vector signed long long)__vec[__index];
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_splat(vector bool long long __vec, int __index)
+  __constant_range(__index, 0, 1) {
+  return (vector bool long long)(vector unsigned long long)__vec[__index];
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_splat(vector unsigned long long __vec, int __index)
+  __constant_range(__index, 0, 1) {
+  return (vector unsigned long long)__vec[__index];
+}
+
+static inline __ATTRS_o_ai vector double
+vec_splat(vector double __vec, int __index)
+  __constant_range(__index, 0, 1) {
+  return (vector double)__vec[__index];
+}
+
+/*-- vec_splat_s* -----------------------------------------------------------*/
+
+static inline __ATTRS_ai vector signed char
+vec_splat_s8(signed char __scalar)
+  __constant(__scalar) {
+  return (vector signed char)__scalar;
+}
+
+static inline __ATTRS_ai vector signed short
+vec_splat_s16(signed short __scalar)
+  __constant(__scalar) {
+  return (vector signed short)__scalar;
+}
+
+static inline __ATTRS_ai vector signed int
+vec_splat_s32(signed short __scalar)
+  __constant(__scalar) {
+  return (vector signed int)(signed int)__scalar;
+}
+
+static inline __ATTRS_ai vector signed long long
+vec_splat_s64(signed short __scalar)
+  __constant(__scalar) {
+  return (vector signed long long)(signed long)__scalar;
+}
+
+/*-- vec_splat_u* -----------------------------------------------------------*/
+
+static inline __ATTRS_ai vector unsigned char
+vec_splat_u8(unsigned char __scalar)
+  __constant(__scalar) {
+  return (vector unsigned char)__scalar;
+}
+
+static inline __ATTRS_ai vector unsigned short
+vec_splat_u16(unsigned short __scalar)
+  __constant(__scalar) {
+  return (vector unsigned short)__scalar;
+}
+
+static inline __ATTRS_ai vector unsigned int
+vec_splat_u32(signed short __scalar)
+  __constant(__scalar) {
+  return (vector unsigned int)(signed int)__scalar;
+}
+
+static inline __ATTRS_ai vector unsigned long long
+vec_splat_u64(signed short __scalar)
+  __constant(__scalar) {
+  return (vector unsigned long long)(signed long long)__scalar;
+}
+
+/*-- vec_splats -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_splats(signed char __scalar) {
+  return (vector signed char)__scalar;
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_splats(unsigned char __scalar) {
+  return (vector unsigned char)__scalar;
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_splats(signed short __scalar) {
+  return (vector signed short)__scalar;
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_splats(unsigned short __scalar) {
+  return (vector unsigned short)__scalar;
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_splats(signed int __scalar) {
+  return (vector signed int)__scalar;
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_splats(unsigned int __scalar) {
+  return (vector unsigned int)__scalar;
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_splats(signed long long __scalar) {
+  return (vector signed long long)__scalar;
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_splats(unsigned long long __scalar) {
+  return (vector unsigned long long)__scalar;
+}
+
+static inline __ATTRS_o_ai vector double
+vec_splats(double __scalar) {
+  return (vector double)__scalar;
+}
+
+/*-- vec_extend_s64 ---------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed long long
+vec_extend_s64(vector signed char __a) {
+  return (vector signed long long)(__a[7], __a[15]);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_extend_s64(vector signed short __a) {
+  return (vector signed long long)(__a[3], __a[7]);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_extend_s64(vector signed int __a) {
+  return (vector signed long long)(__a[1], __a[3]);
+}
+
+/*-- vec_mergeh -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_mergeh(vector signed char __a, vector signed char __b) {
+  return (vector signed char)(
+    __a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3], __b[3],
+    __a[4], __b[4], __a[5], __b[5], __a[6], __b[6], __a[7], __b[7]);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_mergeh(vector bool char __a, vector bool char __b) {
+  return (vector bool char)(
+    __a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3], __b[3],
+    __a[4], __b[4], __a[5], __b[5], __a[6], __b[6], __a[7], __b[7]);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_mergeh(vector unsigned char __a, vector unsigned char __b) {
+  return (vector unsigned char)(
+    __a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3], __b[3],
+    __a[4], __b[4], __a[5], __b[5], __a[6], __b[6], __a[7], __b[7]);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_mergeh(vector signed short __a, vector signed short __b) {
+  return (vector signed short)(
+    __a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3], __b[3]);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_mergeh(vector bool short __a, vector bool short __b) {
+  return (vector bool short)(
+    __a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3], __b[3]);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_mergeh(vector unsigned short __a, vector unsigned short __b) {
+  return (vector unsigned short)(
+    __a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3], __b[3]);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_mergeh(vector signed int __a, vector signed int __b) {
+  return (vector signed int)(__a[0], __b[0], __a[1], __b[1]);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_mergeh(vector bool int __a, vector bool int __b) {
+  return (vector bool int)(__a[0], __b[0], __a[1], __b[1]);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_mergeh(vector unsigned int __a, vector unsigned int __b) {
+  return (vector unsigned int)(__a[0], __b[0], __a[1], __b[1]);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_mergeh(vector signed long long __a, vector signed long long __b) {
+  return (vector signed long long)(__a[0], __b[0]);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_mergeh(vector bool long long __a, vector bool long long __b) {
+  return (vector bool long long)(__a[0], __b[0]);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_mergeh(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector unsigned long long)(__a[0], __b[0]);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_mergeh(vector double __a, vector double __b) {
+  return (vector double)(__a[0], __b[0]);
+}
+
+/*-- vec_mergel -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_mergel(vector signed char __a, vector signed char __b) {
+  return (vector signed char)(
+    __a[8], __b[8], __a[9], __b[9], __a[10], __b[10], __a[11], __b[11],
+    __a[12], __b[12], __a[13], __b[13], __a[14], __b[14], __a[15], __b[15]);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_mergel(vector bool char __a, vector bool char __b) {
+  return (vector bool char)(
+    __a[8], __b[8], __a[9], __b[9], __a[10], __b[10], __a[11], __b[11],
+    __a[12], __b[12], __a[13], __b[13], __a[14], __b[14], __a[15], __b[15]);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_mergel(vector unsigned char __a, vector unsigned char __b) {
+  return (vector unsigned char)(
+    __a[8], __b[8], __a[9], __b[9], __a[10], __b[10], __a[11], __b[11],
+    __a[12], __b[12], __a[13], __b[13], __a[14], __b[14], __a[15], __b[15]);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_mergel(vector signed short __a, vector signed short __b) {
+  return (vector signed short)(
+    __a[4], __b[4], __a[5], __b[5], __a[6], __b[6], __a[7], __b[7]);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_mergel(vector bool short __a, vector bool short __b) {
+  return (vector bool short)(
+    __a[4], __b[4], __a[5], __b[5], __a[6], __b[6], __a[7], __b[7]);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_mergel(vector unsigned short __a, vector unsigned short __b) {
+  return (vector unsigned short)(
+    __a[4], __b[4], __a[5], __b[5], __a[6], __b[6], __a[7], __b[7]);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_mergel(vector signed int __a, vector signed int __b) {
+  return (vector signed int)(__a[2], __b[2], __a[3], __b[3]);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_mergel(vector bool int __a, vector bool int __b) {
+  return (vector bool int)(__a[2], __b[2], __a[3], __b[3]);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_mergel(vector unsigned int __a, vector unsigned int __b) {
+  return (vector unsigned int)(__a[2], __b[2], __a[3], __b[3]);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_mergel(vector signed long long __a, vector signed long long __b) {
+  return (vector signed long long)(__a[1], __b[1]);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_mergel(vector bool long long __a, vector bool long long __b) {
+  return (vector bool long long)(__a[1], __b[1]);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_mergel(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector unsigned long long)(__a[1], __b[1]);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_mergel(vector double __a, vector double __b) {
+  return (vector double)(__a[1], __b[1]);
+}
+
+/*-- vec_pack ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_pack(vector signed short __a, vector signed short __b) {
+  vector signed char __ac = (vector signed char)__a;
+  vector signed char __bc = (vector signed char)__b;
+  return (vector signed char)(
+    __ac[1], __ac[3], __ac[5], __ac[7], __ac[9], __ac[11], __ac[13], __ac[15],
+    __bc[1], __bc[3], __bc[5], __bc[7], __bc[9], __bc[11], __bc[13], __bc[15]);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_pack(vector bool short __a, vector bool short __b) {
+  vector bool char __ac = (vector bool char)__a;
+  vector bool char __bc = (vector bool char)__b;
+  return (vector bool char)(
+    __ac[1], __ac[3], __ac[5], __ac[7], __ac[9], __ac[11], __ac[13], __ac[15],
+    __bc[1], __bc[3], __bc[5], __bc[7], __bc[9], __bc[11], __bc[13], __bc[15]);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_pack(vector unsigned short __a, vector unsigned short __b) {
+  vector unsigned char __ac = (vector unsigned char)__a;
+  vector unsigned char __bc = (vector unsigned char)__b;
+  return (vector unsigned char)(
+    __ac[1], __ac[3], __ac[5], __ac[7], __ac[9], __ac[11], __ac[13], __ac[15],
+    __bc[1], __bc[3], __bc[5], __bc[7], __bc[9], __bc[11], __bc[13], __bc[15]);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_pack(vector signed int __a, vector signed int __b) {
+  vector signed short __ac = (vector signed short)__a;
+  vector signed short __bc = (vector signed short)__b;
+  return (vector signed short)(
+    __ac[1], __ac[3], __ac[5], __ac[7],
+    __bc[1], __bc[3], __bc[5], __bc[7]);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_pack(vector bool int __a, vector bool int __b) {
+  vector bool short __ac = (vector bool short)__a;
+  vector bool short __bc = (vector bool short)__b;
+  return (vector bool short)(
+    __ac[1], __ac[3], __ac[5], __ac[7],
+    __bc[1], __bc[3], __bc[5], __bc[7]);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_pack(vector unsigned int __a, vector unsigned int __b) {
+  vector unsigned short __ac = (vector unsigned short)__a;
+  vector unsigned short __bc = (vector unsigned short)__b;
+  return (vector unsigned short)(
+    __ac[1], __ac[3], __ac[5], __ac[7],
+    __bc[1], __bc[3], __bc[5], __bc[7]);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_pack(vector signed long long __a, vector signed long long __b) {
+  vector signed int __ac = (vector signed int)__a;
+  vector signed int __bc = (vector signed int)__b;
+  return (vector signed int)(__ac[1], __ac[3], __bc[1], __bc[3]);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_pack(vector bool long long __a, vector bool long long __b) {
+  vector bool int __ac = (vector bool int)__a;
+  vector bool int __bc = (vector bool int)__b;
+  return (vector bool int)(__ac[1], __ac[3], __bc[1], __bc[3]);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_pack(vector unsigned long long __a, vector unsigned long long __b) {
+  vector unsigned int __ac = (vector unsigned int)__a;
+  vector unsigned int __bc = (vector unsigned int)__b;
+  return (vector unsigned int)(__ac[1], __ac[3], __bc[1], __bc[3]);
+}
+
+/*-- vec_packs --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_packs(vector signed short __a, vector signed short __b) {
+  return __builtin_s390_vpksh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_packs(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vpklsh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_packs(vector signed int __a, vector signed int __b) {
+  return __builtin_s390_vpksf(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_packs(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vpklsf(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_packs(vector signed long long __a, vector signed long long __b) {
+  return __builtin_s390_vpksg(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_packs(vector unsigned long long __a, vector unsigned long long __b) {
+  return __builtin_s390_vpklsg(__a, __b);
+}
+
+/*-- vec_packs_cc -----------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_packs_cc(vector signed short __a, vector signed short __b, int *__cc) {
+  return __builtin_s390_vpkshs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_packs_cc(vector unsigned short __a, vector unsigned short __b, int *__cc) {
+  return __builtin_s390_vpklshs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_packs_cc(vector signed int __a, vector signed int __b, int *__cc) {
+  return __builtin_s390_vpksfs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_packs_cc(vector unsigned int __a, vector unsigned int __b, int *__cc) {
+  return __builtin_s390_vpklsfs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_packs_cc(vector signed long long __a, vector signed long long __b,
+             int *__cc) {
+  return __builtin_s390_vpksgs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_packs_cc(vector unsigned long long __a, vector unsigned long long __b,
+             int *__cc) {
+  return __builtin_s390_vpklsgs(__a, __b, __cc);
+}
+
+/*-- vec_packsu -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_packsu(vector signed short __a, vector signed short __b) {
+  const vector signed short __zero = (vector signed short)0;
+  return __builtin_s390_vpklsh(
+    (vector unsigned short)(__a >= __zero) & (vector unsigned short)__a,
+    (vector unsigned short)(__b >= __zero) & (vector unsigned short)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_packsu(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vpklsh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_packsu(vector signed int __a, vector signed int __b) {
+  const vector signed int __zero = (vector signed int)0;
+  return __builtin_s390_vpklsf(
+    (vector unsigned int)(__a >= __zero) & (vector unsigned int)__a,
+    (vector unsigned int)(__b >= __zero) & (vector unsigned int)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_packsu(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vpklsf(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_packsu(vector signed long long __a, vector signed long long __b) {
+  const vector signed long long __zero = (vector signed long long)0;
+  return __builtin_s390_vpklsg(
+    (vector unsigned long long)(__a >= __zero) &
+    (vector unsigned long long)__a,
+    (vector unsigned long long)(__b >= __zero) &
+    (vector unsigned long long)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_packsu(vector unsigned long long __a, vector unsigned long long __b) {
+  return __builtin_s390_vpklsg(__a, __b);
+}
+
+/*-- vec_packsu_cc ----------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_packsu_cc(vector unsigned short __a, vector unsigned short __b, int *__cc) {
+  return __builtin_s390_vpklshs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_packsu_cc(vector unsigned int __a, vector unsigned int __b, int *__cc) {
+  return __builtin_s390_vpklsfs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_packsu_cc(vector unsigned long long __a, vector unsigned long long __b,
+              int *__cc) {
+  return __builtin_s390_vpklsgs(__a, __b, __cc);
+}
+
+/*-- vec_unpackh ------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed short
+vec_unpackh(vector signed char __a) {
+  return __builtin_s390_vuphb(__a);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_unpackh(vector bool char __a) {
+  return (vector bool short)__builtin_s390_vuphb((vector signed char)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_unpackh(vector unsigned char __a) {
+  return __builtin_s390_vuplhb(__a);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_unpackh(vector signed short __a) {
+  return __builtin_s390_vuphh(__a);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_unpackh(vector bool short __a) {
+  return (vector bool int)__builtin_s390_vuphh((vector signed short)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_unpackh(vector unsigned short __a) {
+  return __builtin_s390_vuplhh(__a);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_unpackh(vector signed int __a) {
+  return __builtin_s390_vuphf(__a);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_unpackh(vector bool int __a) {
+  return (vector bool long long)__builtin_s390_vuphf((vector signed int)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_unpackh(vector unsigned int __a) {
+  return __builtin_s390_vuplhf(__a);
+}
+
+/*-- vec_unpackl ------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed short
+vec_unpackl(vector signed char __a) {
+  return __builtin_s390_vuplb(__a);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_unpackl(vector bool char __a) {
+  return (vector bool short)__builtin_s390_vuplb((vector signed char)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_unpackl(vector unsigned char __a) {
+  return __builtin_s390_vupllb(__a);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_unpackl(vector signed short __a) {
+  return __builtin_s390_vuplhw(__a);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_unpackl(vector bool short __a) {
+  return (vector bool int)__builtin_s390_vuplhw((vector signed short)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_unpackl(vector unsigned short __a) {
+  return __builtin_s390_vupllh(__a);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_unpackl(vector signed int __a) {
+  return __builtin_s390_vuplf(__a);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_unpackl(vector bool int __a) {
+  return (vector bool long long)__builtin_s390_vuplf((vector signed int)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_unpackl(vector unsigned int __a) {
+  return __builtin_s390_vupllf(__a);
+}
+
+/*-- vec_cmpeq --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmpeq(vector bool char __a, vector bool char __b) {
+  return (vector bool char)(__a == __b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmpeq(vector signed char __a, vector signed char __b) {
+  return (vector bool char)(__a == __b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmpeq(vector unsigned char __a, vector unsigned char __b) {
+  return (vector bool char)(__a == __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmpeq(vector bool short __a, vector bool short __b) {
+  return (vector bool short)(__a == __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmpeq(vector signed short __a, vector signed short __b) {
+  return (vector bool short)(__a == __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmpeq(vector unsigned short __a, vector unsigned short __b) {
+  return (vector bool short)(__a == __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmpeq(vector bool int __a, vector bool int __b) {
+  return (vector bool int)(__a == __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmpeq(vector signed int __a, vector signed int __b) {
+  return (vector bool int)(__a == __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmpeq(vector unsigned int __a, vector unsigned int __b) {
+  return (vector bool int)(__a == __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmpeq(vector bool long long __a, vector bool long long __b) {
+  return (vector bool long long)(__a == __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmpeq(vector signed long long __a, vector signed long long __b) {
+  return (vector bool long long)(__a == __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmpeq(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector bool long long)(__a == __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmpeq(vector double __a, vector double __b) {
+  return (vector bool long long)(__a == __b);
+}
+
+/*-- vec_cmpge --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmpge(vector signed char __a, vector signed char __b) {
+  return (vector bool char)(__a >= __b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmpge(vector unsigned char __a, vector unsigned char __b) {
+  return (vector bool char)(__a >= __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmpge(vector signed short __a, vector signed short __b) {
+  return (vector bool short)(__a >= __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmpge(vector unsigned short __a, vector unsigned short __b) {
+  return (vector bool short)(__a >= __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmpge(vector signed int __a, vector signed int __b) {
+  return (vector bool int)(__a >= __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmpge(vector unsigned int __a, vector unsigned int __b) {
+  return (vector bool int)(__a >= __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmpge(vector signed long long __a, vector signed long long __b) {
+  return (vector bool long long)(__a >= __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmpge(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector bool long long)(__a >= __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmpge(vector double __a, vector double __b) {
+  return (vector bool long long)(__a >= __b);
+}
+
+/*-- vec_cmpgt --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmpgt(vector signed char __a, vector signed char __b) {
+  return (vector bool char)(__a > __b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmpgt(vector unsigned char __a, vector unsigned char __b) {
+  return (vector bool char)(__a > __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmpgt(vector signed short __a, vector signed short __b) {
+  return (vector bool short)(__a > __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmpgt(vector unsigned short __a, vector unsigned short __b) {
+  return (vector bool short)(__a > __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmpgt(vector signed int __a, vector signed int __b) {
+  return (vector bool int)(__a > __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmpgt(vector unsigned int __a, vector unsigned int __b) {
+  return (vector bool int)(__a > __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmpgt(vector signed long long __a, vector signed long long __b) {
+  return (vector bool long long)(__a > __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmpgt(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector bool long long)(__a > __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmpgt(vector double __a, vector double __b) {
+  return (vector bool long long)(__a > __b);
+}
+
+/*-- vec_cmple --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmple(vector signed char __a, vector signed char __b) {
+  return (vector bool char)(__a <= __b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmple(vector unsigned char __a, vector unsigned char __b) {
+  return (vector bool char)(__a <= __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmple(vector signed short __a, vector signed short __b) {
+  return (vector bool short)(__a <= __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmple(vector unsigned short __a, vector unsigned short __b) {
+  return (vector bool short)(__a <= __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmple(vector signed int __a, vector signed int __b) {
+  return (vector bool int)(__a <= __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmple(vector unsigned int __a, vector unsigned int __b) {
+  return (vector bool int)(__a <= __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmple(vector signed long long __a, vector signed long long __b) {
+  return (vector bool long long)(__a <= __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmple(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector bool long long)(__a <= __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmple(vector double __a, vector double __b) {
+  return (vector bool long long)(__a <= __b);
+}
+
+/*-- vec_cmplt --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmplt(vector signed char __a, vector signed char __b) {
+  return (vector bool char)(__a < __b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmplt(vector unsigned char __a, vector unsigned char __b) {
+  return (vector bool char)(__a < __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmplt(vector signed short __a, vector signed short __b) {
+  return (vector bool short)(__a < __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmplt(vector unsigned short __a, vector unsigned short __b) {
+  return (vector bool short)(__a < __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmplt(vector signed int __a, vector signed int __b) {
+  return (vector bool int)(__a < __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmplt(vector unsigned int __a, vector unsigned int __b) {
+  return (vector bool int)(__a < __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmplt(vector signed long long __a, vector signed long long __b) {
+  return (vector bool long long)(__a < __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmplt(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector bool long long)(__a < __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmplt(vector double __a, vector double __b) {
+  return (vector bool long long)(__a < __b);
+}
+
+/*-- vec_all_eq -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector signed char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vceqbs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector signed char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vceqbs(__a, (vector signed char)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector bool char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector unsigned char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector unsigned char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector bool char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector bool char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector signed short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vceqhs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector signed short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vceqhs(__a, (vector signed short)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector bool short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector unsigned short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector unsigned short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector bool short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector bool short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector signed int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vceqfs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector signed int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vceqfs(__a, (vector signed int)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector bool int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector unsigned int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector unsigned int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector bool int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector bool int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector signed long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector signed long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs(__a, (vector signed long long)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector bool long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector unsigned long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector unsigned long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector bool long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector bool long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfcedbs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+/*-- vec_all_ne -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector signed char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vceqbs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector signed char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vceqbs(__a, (vector signed char)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector bool char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector unsigned char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector unsigned char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector bool char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector bool char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector signed short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vceqhs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector signed short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vceqhs(__a, (vector signed short)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector bool short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector unsigned short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector unsigned short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector bool short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector bool short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector signed int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vceqfs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector signed int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vceqfs(__a, (vector signed int)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector bool int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector unsigned int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector unsigned int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector bool int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector bool int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector signed long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector signed long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs(__a, (vector signed long long)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector bool long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector unsigned long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector unsigned long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector bool long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector bool long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfcedbs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+/*-- vec_all_ge -------------------------------------------------------------*/
+
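+/* vec_all_ge is expressed through the "compare high" builtins with the
+ * operands swapped: all(a >= b) holds exactly when no element of b is
+ * greater than the corresponding element of a, i.e. __cc == 3.  Signed
+ * comparisons use __builtin_s390_vch*s, unsigned and bool comparisons use
+ * the logical variants __builtin_s390_vchl*s, and the double overload uses
+ * the fp compare-high-or-equal builtin directly and tests __cc == 0.
+ */
+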
+static inline __ATTRS_o_ai int
+vec_all_ge(vector signed char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector signed char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchbs((vector signed char)__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector bool char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__b, (vector signed char)__a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector unsigned char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector unsigned char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector bool char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__b, (vector unsigned char)__a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector bool char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__b,
+                        (vector unsigned char)__a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector signed short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector signed short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchhs((vector signed short)__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector bool short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__b, (vector signed short)__a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector unsigned short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector unsigned short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector bool short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__b, (vector unsigned short)__a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector bool short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__b,
+                        (vector unsigned short)__a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector signed int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector signed int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchfs((vector signed int)__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector bool int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__b, (vector signed int)__a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector unsigned int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector unsigned int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector bool int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__b, (vector unsigned int)__a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector bool int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__b,
+                        (vector unsigned int)__a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector signed long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector signed long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchgs((vector signed long long)__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector bool long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__b, (vector signed long long)__a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector unsigned long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector unsigned long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector bool long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__b, (vector unsigned long long)__a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector bool long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__b,
+                        (vector unsigned long long)__a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchedbs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+/*-- vec_all_gt -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector signed char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector signed char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__a, (vector signed char)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector bool char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs((vector signed char)__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector unsigned char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector unsigned char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__a, (vector unsigned char)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector bool char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector bool char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__a,
+                        (vector unsigned char)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector signed short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector signed short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__a, (vector signed short)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector bool short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs((vector signed short)__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector unsigned short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector unsigned short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__a, (vector unsigned short)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector bool short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector bool short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__a,
+                        (vector unsigned short)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector signed int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector signed int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__a, (vector signed int)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector bool int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs((vector signed int)__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector unsigned int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector unsigned int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__a, (vector unsigned int)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector bool int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector bool int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__a,
+                        (vector unsigned int)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector signed long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector signed long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__a, (vector signed long long)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector bool long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs((vector signed long long)__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector unsigned long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector unsigned long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__a, (vector unsigned long long)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector bool long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector bool long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__a,
+                        (vector unsigned long long)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchdbs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+/*-- vec_all_le -------------------------------------------------------------*/
+
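+/* vec_all_le keeps the operand order of the "compare high" builtins and
+ * tests __cc == 3: all(a <= b) holds exactly when no element of a is
+ * greater than the corresponding element of b.  The double overload instead
+ * uses the fp >= builtin with swapped operands and tests __cc == 0.
+ */
+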
+static inline __ATTRS_o_ai int
+vec_all_le(vector signed char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector signed char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__a, (vector signed char)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector bool char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs((vector signed char)__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector unsigned char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector unsigned char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__a, (vector unsigned char)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector bool char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector bool char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__a,
+                        (vector unsigned char)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector signed short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector signed short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__a, (vector signed short)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector bool short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs((vector signed short)__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector unsigned short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector unsigned short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__a, (vector unsigned short)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector bool short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector bool short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__a,
+                        (vector unsigned short)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector signed int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector signed int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__a, (vector signed int)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector bool int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs((vector signed int)__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector unsigned int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector unsigned int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__a, (vector unsigned int)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector bool int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector bool int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__a,
+                        (vector unsigned int)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector signed long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector signed long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__a, (vector signed long long)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector bool long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs((vector signed long long)__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector unsigned long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector unsigned long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__a, (vector unsigned long long)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector bool long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector bool long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__a,
+                        (vector unsigned long long)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchedbs(__b, __a, &__cc);
+  return __cc == 0;
+}
+
+/*-- vec_all_lt -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector signed char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector signed char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchbs((vector signed char)__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector bool char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__b, (vector signed char)__a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector unsigned char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector unsigned char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector bool char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__b, (vector unsigned char)__a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector bool char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__b,
+                        (vector unsigned char)__a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector signed short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector signed short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchhs((vector signed short)__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector bool short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__b, (vector signed short)__a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector unsigned short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector unsigned short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector bool short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__b, (vector unsigned short)__a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector bool short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__b,
+                        (vector unsigned short)__a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector signed int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector signed int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchfs((vector signed int)__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector bool int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__b, (vector signed int)__a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector unsigned int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector unsigned int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector bool int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__b, (vector unsigned int)__a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector bool int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__b,
+                        (vector unsigned int)__a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector signed long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector signed long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchgs((vector signed long long)__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector bool long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__b, (vector signed long long)__a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector unsigned long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector unsigned long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector bool long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__b, (vector unsigned long long)__a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector bool long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__b,
+                        (vector unsigned long long)__a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchdbs(__b, __a, &__cc);
+  return __cc == 0;
+}
+
+/*-- vec_all_nge ------------------------------------------------------------*/
+
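+/* The negated predicates vec_all_nge/ngt/nle/nlt are provided only for
+ * double.  They test __cc == 3 ("no element satisfied the ordered fp
+ * compare"), which differs from vec_all_lt/le/gt/ge whenever a NaN element
+ * is present, since the ordered compare is false for NaN operands.
+ */
+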
+static inline __ATTRS_ai int
+vec_all_nge(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchedbs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+/*-- vec_all_ngt ------------------------------------------------------------*/
+
+static inline __ATTRS_ai int
+vec_all_ngt(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchdbs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+/*-- vec_all_nle ------------------------------------------------------------*/
+
+static inline __ATTRS_ai int
+vec_all_nle(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchedbs(__b, __a, &__cc);
+  return __cc == 3;
+}
+
+/*-- vec_all_nlt ------------------------------------------------------------*/
+
+static inline __ATTRS_ai int
+vec_all_nlt(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchdbs(__b, __a, &__cc);
+  return __cc == 3;
+}
+
+/*-- vec_all_nan ------------------------------------------------------------*/
+
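+/* vec_all_nan and vec_all_numeric (next section) call the fp
+ * test-data-class builtin with the same class mask; __cc == 0 means every
+ * element matched the tested class and __cc == 3 means none did, which is
+ * how "all NaN" and "all numeric" fall out of the same test.
+ */
+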
+static inline __ATTRS_ai int
+vec_all_nan(vector double __a) {
+  int __cc;
+  __builtin_s390_vftcidb(__a, 15, &__cc);
+  return __cc == 0;
+}
+
+/*-- vec_all_numeric --------------------------------------------------------*/
+
+static inline __ATTRS_ai int
+vec_all_numeric(vector double __a) {
+  int __cc;
+  __builtin_s390_vftcidb(__a, 15, &__cc);
+  return __cc == 3;
+}
+
+/*-- vec_any_eq -------------------------------------------------------------*/
+
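+/* The vec_any_* predicates reuse the same compare builtins but test the
+ * condition code for "at least one": __cc <= 1 covers both the all-equal
+ * (0) and mixed (1) cases, while the vec_any_ne overloads below test
+ * __cc != 0, i.e. not every element compared equal.
+ */
+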
+static inline __ATTRS_o_ai int
+vec_any_eq(vector signed char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vceqbs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector signed char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vceqbs(__a, (vector signed char)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector bool char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector unsigned char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector unsigned char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector bool char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector bool char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector signed short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vceqhs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector signed short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vceqhs(__a, (vector signed short)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector bool short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector unsigned short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector unsigned short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector bool short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector bool short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector signed int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vceqfs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector signed int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vceqfs(__a, (vector signed int)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector bool int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector unsigned int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector unsigned int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector bool int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector bool int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector signed long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector signed long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs(__a, (vector signed long long)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector bool long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector unsigned long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector unsigned long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector bool long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector bool long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfcedbs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+/*-- vec_any_ne -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector signed char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vceqbs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector signed char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vceqbs(__a, (vector signed char)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector bool char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector unsigned char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector unsigned char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector bool char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector bool char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector signed short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vceqhs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector signed short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vceqhs(__a, (vector signed short)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector bool short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector unsigned short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector unsigned short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector bool short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector bool short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector signed int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vceqfs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector signed int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vceqfs(__a, (vector signed int)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector bool int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector unsigned int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector unsigned int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector bool int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector bool int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector signed long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector signed long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs(__a, (vector signed long long)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector bool long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector unsigned long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector unsigned long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector bool long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector bool long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfcedbs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+/*-- vec_any_ge -------------------------------------------------------------*/
+
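+/* As with vec_all_ge, the operands are swapped for the >= forms:
+ * any(a >= b) holds exactly when not every element of b is greater than
+ * the corresponding element of a, i.e. __cc != 0.  The double overload
+ * uses the fp >= builtin directly and tests __cc <= 1.
+ */
+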
+static inline __ATTRS_o_ai int
+vec_any_ge(vector signed char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector signed char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchbs((vector signed char)__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector bool char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__b, (vector signed char)__a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector unsigned char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector unsigned char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector bool char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__b, (vector unsigned char)__a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector bool char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__b,
+                        (vector unsigned char)__a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector signed short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector signed short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchhs((vector signed short)__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector bool short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__b, (vector signed short)__a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector unsigned short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector unsigned short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector bool short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__b, (vector unsigned short)__a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector bool short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__b,
+                        (vector unsigned short)__a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector signed int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector signed int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchfs((vector signed int)__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector bool int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__b, (vector signed int)__a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector unsigned int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector unsigned int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector bool int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__b, (vector unsigned int)__a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector bool int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__b,
+                        (vector unsigned int)__a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector signed long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector signed long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchgs((vector signed long long)__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector bool long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__b, (vector signed long long)__a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector unsigned long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector unsigned long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector bool long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__b, (vector unsigned long long)__a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector bool long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__b,
+                        (vector unsigned long long)__a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchedbs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+/*-- vec_any_gt -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector signed char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector signed char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__a, (vector signed char)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector bool char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs((vector signed char)__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector unsigned char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector unsigned char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__a, (vector unsigned char)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector bool char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector bool char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__a,
+                        (vector unsigned char)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector signed short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector signed short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__a, (vector signed short)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector bool short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs((vector signed short)__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector unsigned short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector unsigned short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__a, (vector unsigned short)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector bool short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector bool short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__a,
+                        (vector unsigned short)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector signed int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector signed int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__a, (vector signed int)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector bool int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs((vector signed int)__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector unsigned int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector unsigned int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__a, (vector unsigned int)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector bool int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector bool int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__a,
+                        (vector unsigned int)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector signed long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector signed long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__a, (vector signed long long)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector bool long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs((vector signed long long)__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector unsigned long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector unsigned long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__a, (vector unsigned long long)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector bool long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector bool long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__a,
+                        (vector unsigned long long)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchdbs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+/*-- vec_any_le -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector signed char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector signed char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__a, (vector signed char)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector bool char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs((vector signed char)__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector unsigned char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector unsigned char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__a, (vector unsigned char)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector bool char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector bool char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__a,
+                        (vector unsigned char)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector signed short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector signed short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__a, (vector signed short)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector bool short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs((vector signed short)__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector unsigned short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector unsigned short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__a, (vector unsigned short)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector bool short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector bool short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__a,
+                        (vector unsigned short)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector signed int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector signed int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__a, (vector signed int)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector bool int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs((vector signed int)__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector unsigned int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector unsigned int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__a, (vector unsigned int)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector bool int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector bool int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__a,
+                        (vector unsigned int)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector signed long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector signed long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__a, (vector signed long long)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector bool long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs((vector signed long long)__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector unsigned long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector unsigned long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__a, (vector unsigned long long)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector bool long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector bool long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__a,
+                        (vector unsigned long long)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchedbs(__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+/*-- vec_any_lt -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector signed char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector signed char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchbs((vector signed char)__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector bool char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__b, (vector signed char)__a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector unsigned char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector unsigned char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector bool char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__b, (vector unsigned char)__a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector bool char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__b,
+                        (vector unsigned char)__a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector signed short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector signed short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchhs((vector signed short)__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector bool short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__b, (vector signed short)__a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector unsigned short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector unsigned short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector bool short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__b, (vector unsigned short)__a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector bool short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__b,
+                        (vector unsigned short)__a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector signed int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector signed int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchfs((vector signed int)__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector bool int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__b, (vector signed int)__a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector unsigned int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector unsigned int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector bool int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__b, (vector unsigned int)__a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector bool int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__b,
+                        (vector unsigned int)__a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector signed long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector signed long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchgs((vector signed long long)__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector bool long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__b, (vector signed long long)__a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector unsigned long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector unsigned long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector bool long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__b, (vector unsigned long long)__a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector bool long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__b,
+                        (vector unsigned long long)__a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchdbs(__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+/*-- vec_any_nge ------------------------------------------------------------*/
+
+static inline __ATTRS_ai int
+vec_any_nge(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchedbs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+/*-- vec_any_ngt ------------------------------------------------------------*/
+
+static inline __ATTRS_ai int
+vec_any_ngt(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchdbs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+/*-- vec_any_nle ------------------------------------------------------------*/
+
+static inline __ATTRS_ai int
+vec_any_nle(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchedbs(__b, __a, &__cc);
+  return __cc != 0;
+}
+
+/*-- vec_any_nlt ------------------------------------------------------------*/
+
+static inline __ATTRS_ai int
+vec_any_nlt(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchdbs(__b, __a, &__cc);
+  return __cc != 0;
+}
+
+/*-- vec_any_nan ------------------------------------------------------------*/
+
+static inline __ATTRS_ai int
+vec_any_nan(vector double __a) {
+  int __cc;
+  __builtin_s390_vftcidb(__a, 15, &__cc);
+  return __cc != 3;
+}
+
+/*-- vec_any_numeric --------------------------------------------------------*/
+
+static inline __ATTRS_ai int
+vec_any_numeric(vector double __a) {
+  int __cc;
+  __builtin_s390_vftcidb(__a, 15, &__cc);
+  return __cc != 0;
+}
+
+/*-- vec_andc ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_andc(vector bool char __a, vector bool char __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_andc(vector signed char __a, vector signed char __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_andc(vector bool char __a, vector signed char __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_andc(vector signed char __a, vector bool char __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_andc(vector unsigned char __a, vector unsigned char __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_andc(vector bool char __a, vector unsigned char __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_andc(vector unsigned char __a, vector bool char __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_andc(vector bool short __a, vector bool short __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_andc(vector signed short __a, vector signed short __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_andc(vector bool short __a, vector signed short __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_andc(vector signed short __a, vector bool short __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_andc(vector unsigned short __a, vector unsigned short __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_andc(vector bool short __a, vector unsigned short __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_andc(vector unsigned short __a, vector bool short __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_andc(vector bool int __a, vector bool int __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_andc(vector signed int __a, vector signed int __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_andc(vector bool int __a, vector signed int __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_andc(vector signed int __a, vector bool int __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_andc(vector unsigned int __a, vector unsigned int __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_andc(vector bool int __a, vector unsigned int __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_andc(vector unsigned int __a, vector bool int __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_andc(vector bool long long __a, vector bool long long __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_andc(vector signed long long __a, vector signed long long __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_andc(vector bool long long __a, vector signed long long __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_andc(vector signed long long __a, vector bool long long __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_andc(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_andc(vector bool long long __a, vector unsigned long long __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_andc(vector unsigned long long __a, vector bool long long __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector double
+vec_andc(vector double __a, vector double __b) {
+  return (vector double)((vector unsigned long long)__a &
+                         ~(vector unsigned long long)__b);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_andc(vector bool long long __a, vector double __b) {
+  return (vector double)((vector unsigned long long)__a &
+                         ~(vector unsigned long long)__b);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_andc(vector double __a, vector bool long long __b) {
+  return (vector double)((vector unsigned long long)__a &
+                         ~(vector unsigned long long)__b);
+}
+
+/*-- vec_nor ----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_nor(vector bool char __a, vector bool char __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_nor(vector signed char __a, vector signed char __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_nor(vector bool char __a, vector signed char __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_nor(vector signed char __a, vector bool char __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_nor(vector unsigned char __a, vector unsigned char __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_nor(vector bool char __a, vector unsigned char __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_nor(vector unsigned char __a, vector bool char __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_nor(vector bool short __a, vector bool short __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_nor(vector signed short __a, vector signed short __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_nor(vector bool short __a, vector signed short __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_nor(vector signed short __a, vector bool short __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_nor(vector unsigned short __a, vector unsigned short __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_nor(vector bool short __a, vector unsigned short __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_nor(vector unsigned short __a, vector bool short __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_nor(vector bool int __a, vector bool int __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_nor(vector signed int __a, vector signed int __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_nor(vector bool int __a, vector signed int __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_nor(vector signed int __a, vector bool int __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_nor(vector unsigned int __a, vector unsigned int __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_nor(vector bool int __a, vector unsigned int __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_nor(vector unsigned int __a, vector bool int __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_nor(vector bool long long __a, vector bool long long __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_nor(vector signed long long __a, vector signed long long __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_nor(vector bool long long __a, vector signed long long __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_nor(vector signed long long __a, vector bool long long __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_nor(vector unsigned long long __a, vector unsigned long long __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_nor(vector bool long long __a, vector unsigned long long __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_nor(vector unsigned long long __a, vector bool long long __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_nor(vector double __a, vector double __b) {
+  return (vector double)~((vector unsigned long long)__a |
+                          (vector unsigned long long)__b);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_nor(vector bool long long __a, vector double __b) {
+  return (vector double)~((vector unsigned long long)__a |
+                          (vector unsigned long long)__b);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_nor(vector double __a, vector bool long long __b) {
+  return (vector double)~((vector unsigned long long)__a |
+                          (vector unsigned long long)__b);
+}
+
+/*-- vec_cntlz --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cntlz(vector signed char __a) {
+  return __builtin_s390_vclzb((vector unsigned char)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cntlz(vector unsigned char __a) {
+  return __builtin_s390_vclzb(__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cntlz(vector signed short __a) {
+  return __builtin_s390_vclzh((vector unsigned short)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cntlz(vector unsigned short __a) {
+  return __builtin_s390_vclzh(__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cntlz(vector signed int __a) {
+  return __builtin_s390_vclzf((vector unsigned int)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cntlz(vector unsigned int __a) {
+  return __builtin_s390_vclzf(__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_cntlz(vector signed long long __a) {
+  return __builtin_s390_vclzg((vector unsigned long long)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_cntlz(vector unsigned long long __a) {
+  return __builtin_s390_vclzg(__a);
+}
+
+/*-- vec_cnttz --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cnttz(vector signed char __a) {
+  return __builtin_s390_vctzb((vector unsigned char)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cnttz(vector unsigned char __a) {
+  return __builtin_s390_vctzb(__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cnttz(vector signed short __a) {
+  return __builtin_s390_vctzh((vector unsigned short)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cnttz(vector unsigned short __a) {
+  return __builtin_s390_vctzh(__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cnttz(vector signed int __a) {
+  return __builtin_s390_vctzf((vector unsigned int)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cnttz(vector unsigned int __a) {
+  return __builtin_s390_vctzf(__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_cnttz(vector signed long long __a) {
+  return __builtin_s390_vctzg((vector unsigned long long)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_cnttz(vector unsigned long long __a) {
+  return __builtin_s390_vctzg(__a);
+}
+
+/*-- vec_popcnt -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_popcnt(vector signed char __a) {
+  return __builtin_s390_vpopctb((vector unsigned char)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_popcnt(vector unsigned char __a) {
+  return __builtin_s390_vpopctb(__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_popcnt(vector signed short __a) {
+  return __builtin_s390_vpopcth((vector unsigned short)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_popcnt(vector unsigned short __a) {
+  return __builtin_s390_vpopcth(__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_popcnt(vector signed int __a) {
+  return __builtin_s390_vpopctf((vector unsigned int)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_popcnt(vector unsigned int __a) {
+  return __builtin_s390_vpopctf(__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_popcnt(vector signed long long __a) {
+  return __builtin_s390_vpopctg((vector unsigned long long)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_popcnt(vector unsigned long long __a) {
+  return __builtin_s390_vpopctg(__a);
+}
+
+/*-- vec_rl -----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_rl(vector signed char __a, vector unsigned char __b) {
+  return (vector signed char)__builtin_s390_verllvb(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_rl(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_verllvb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_rl(vector signed short __a, vector unsigned short __b) {
+  return (vector signed short)__builtin_s390_verllvh(
+    (vector unsigned short)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_rl(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_verllvh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_rl(vector signed int __a, vector unsigned int __b) {
+  return (vector signed int)__builtin_s390_verllvf(
+    (vector unsigned int)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_rl(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_verllvf(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_rl(vector signed long long __a, vector unsigned long long __b) {
+  return (vector signed long long)__builtin_s390_verllvg(
+    (vector unsigned long long)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_rl(vector unsigned long long __a, vector unsigned long long __b) {
+  return __builtin_s390_verllvg(__a, __b);
+}
+
+/*-- vec_rli ----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_rli(vector signed char __a, unsigned long __b) {
+  return (vector signed char)__builtin_s390_verllb(
+    (vector unsigned char)__a, (int)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_rli(vector unsigned char __a, unsigned long __b) {
+  return __builtin_s390_verllb(__a, (int)__b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_rli(vector signed short __a, unsigned long __b) {
+  return (vector signed short)__builtin_s390_verllh(
+    (vector unsigned short)__a, (int)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_rli(vector unsigned short __a, unsigned long __b) {
+  return __builtin_s390_verllh(__a, (int)__b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_rli(vector signed int __a, unsigned long __b) {
+  return (vector signed int)__builtin_s390_verllf(
+    (vector unsigned int)__a, (int)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_rli(vector unsigned int __a, unsigned long __b) {
+  return __builtin_s390_verllf(__a, (int)__b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_rli(vector signed long long __a, unsigned long __b) {
+  return (vector signed long long)__builtin_s390_verllg(
+    (vector unsigned long long)__a, (int)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_rli(vector unsigned long long __a, unsigned long __b) {
+  return __builtin_s390_verllg(__a, (int)__b);
+}
+
+/*-- vec_rl_mask ------------------------------------------------------------*/
+
+extern __ATTRS_o vector signed char
+vec_rl_mask(vector signed char __a, vector unsigned char __b,
+            unsigned char __c) __constant(__c);
+
+extern __ATTRS_o vector unsigned char
+vec_rl_mask(vector unsigned char __a, vector unsigned char __b,
+            unsigned char __c) __constant(__c);
+
+extern __ATTRS_o vector signed short
+vec_rl_mask(vector signed short __a, vector unsigned short __b,
+            unsigned char __c) __constant(__c);
+
+extern __ATTRS_o vector unsigned short
+vec_rl_mask(vector unsigned short __a, vector unsigned short __b,
+            unsigned char __c) __constant(__c);
+
+extern __ATTRS_o vector signed int
+vec_rl_mask(vector signed int __a, vector unsigned int __b,
+            unsigned char __c) __constant(__c);
+
+extern __ATTRS_o vector unsigned int
+vec_rl_mask(vector unsigned int __a, vector unsigned int __b,
+            unsigned char __c) __constant(__c);
+
+extern __ATTRS_o vector signed long long
+vec_rl_mask(vector signed long long __a, vector unsigned long long __b,
+            unsigned char __c) __constant(__c);
+
+extern __ATTRS_o vector unsigned long long
+vec_rl_mask(vector unsigned long long __a, vector unsigned long long __b,
+            unsigned char __c) __constant(__c);
+
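+/* The extern declarations above only provide the overload prototypes that
+ * fix the macro's result type; the expansion below dispatches on the
+ * element size of (X) to the matching __builtin_s390_verim* builtin and
+ * requires (Z) to be a compile-time constant. */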
+#define vec_rl_mask(X, Y, Z) ((__typeof__((vec_rl_mask)((X), (Y), (Z)))) \
+  __extension__ ({ \
+    vector unsigned char __res; \
+    vector unsigned char __x = (vector unsigned char)(X); \
+    vector unsigned char __y = (vector unsigned char)(Y); \
+    switch (sizeof ((X)[0])) { \
+    case 1: __res = (vector unsigned char) __builtin_s390_verimb( \
+             (vector unsigned char)__x, (vector unsigned char)__x, \
+             (vector unsigned char)__y, (Z)); break; \
+    case 2: __res = (vector unsigned char) __builtin_s390_verimh( \
+             (vector unsigned short)__x, (vector unsigned short)__x, \
+             (vector unsigned short)__y, (Z)); break; \
+    case 4: __res = (vector unsigned char) __builtin_s390_verimf( \
+             (vector unsigned int)__x, (vector unsigned int)__x, \
+             (vector unsigned int)__y, (Z)); break; \
+    default: __res = (vector unsigned char) __builtin_s390_verimg( \
+             (vector unsigned long long)__x, (vector unsigned long long)__x, \
+             (vector unsigned long long)__y, (Z)); break; \
+    } __res; }))
+
+/*-- vec_sll ----------------------------------------------------------------*/
+
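+/* Every vec_sll variant reinterprets both operands as vector unsigned
+ * char, performs the whole-vector shift with vsl, and casts the result
+ * back to the type of the first operand. */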
+static inline __ATTRS_o_ai vector signed char
+vec_sll(vector signed char __a, vector unsigned char __b) {
+  return (vector signed char)__builtin_s390_vsl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_sll(vector signed char __a, vector unsigned short __b) {
+  return (vector signed char)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_sll(vector signed char __a, vector unsigned int __b) {
+  return (vector signed char)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_sll(vector bool char __a, vector unsigned char __b) {
+  return (vector bool char)__builtin_s390_vsl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_sll(vector bool char __a, vector unsigned short __b) {
+  return (vector bool char)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_sll(vector bool char __a, vector unsigned int __b) {
+  return (vector bool char)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_sll(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vsl(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_sll(vector unsigned char __a, vector unsigned short __b) {
+  return __builtin_s390_vsl(__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_sll(vector unsigned char __a, vector unsigned int __b) {
+  return __builtin_s390_vsl(__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_sll(vector signed short __a, vector unsigned char __b) {
+  return (vector signed short)__builtin_s390_vsl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_sll(vector signed short __a, vector unsigned short __b) {
+  return (vector signed short)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_sll(vector signed short __a, vector unsigned int __b) {
+  return (vector signed short)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_sll(vector bool short __a, vector unsigned char __b) {
+  return (vector bool short)__builtin_s390_vsl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_sll(vector bool short __a, vector unsigned short __b) {
+  return (vector bool short)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_sll(vector bool short __a, vector unsigned int __b) {
+  return (vector bool short)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_sll(vector unsigned short __a, vector unsigned char __b) {
+  return (vector unsigned short)__builtin_s390_vsl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_sll(vector unsigned short __a, vector unsigned short __b) {
+  return (vector unsigned short)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_sll(vector unsigned short __a, vector unsigned int __b) {
+  return (vector unsigned short)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_sll(vector signed int __a, vector unsigned char __b) {
+  return (vector signed int)__builtin_s390_vsl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_sll(vector signed int __a, vector unsigned short __b) {
+  return (vector signed int)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_sll(vector signed int __a, vector unsigned int __b) {
+  return (vector signed int)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_sll(vector bool int __a, vector unsigned char __b) {
+  return (vector bool int)__builtin_s390_vsl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_sll(vector bool int __a, vector unsigned short __b) {
+  return (vector bool int)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_sll(vector bool int __a, vector unsigned int __b) {
+  return (vector bool int)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_sll(vector unsigned int __a, vector unsigned char __b) {
+  return (vector unsigned int)__builtin_s390_vsl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_sll(vector unsigned int __a, vector unsigned short __b) {
+  return (vector unsigned int)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_sll(vector unsigned int __a, vector unsigned int __b) {
+  return (vector unsigned int)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_sll(vector signed long long __a, vector unsigned char __b) {
+  return (vector signed long long)__builtin_s390_vsl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_sll(vector signed long long __a, vector unsigned short __b) {
+  return (vector signed long long)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_sll(vector signed long long __a, vector unsigned int __b) {
+  return (vector signed long long)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_sll(vector bool long long __a, vector unsigned char __b) {
+  return (vector bool long long)__builtin_s390_vsl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_sll(vector bool long long __a, vector unsigned short __b) {
+  return (vector bool long long)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_sll(vector bool long long __a, vector unsigned int __b) {
+  return (vector bool long long)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_sll(vector unsigned long long __a, vector unsigned char __b) {
+  return (vector unsigned long long)__builtin_s390_vsl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_sll(vector unsigned long long __a, vector unsigned short __b) {
+  return (vector unsigned long long)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_sll(vector unsigned long long __a, vector unsigned int __b) {
+  return (vector unsigned long long)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+/*-- vec_slb ----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_slb(vector signed char __a, vector signed char __b) {
+  return (vector signed char)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_slb(vector signed char __a, vector unsigned char __b) {
+  return (vector signed char)__builtin_s390_vslb(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_slb(vector unsigned char __a, vector signed char __b) {
+  return __builtin_s390_vslb(__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_slb(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vslb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_slb(vector signed short __a, vector signed short __b) {
+  return (vector signed short)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_slb(vector signed short __a, vector unsigned short __b) {
+  return (vector signed short)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_slb(vector unsigned short __a, vector signed short __b) {
+  return (vector unsigned short)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_slb(vector unsigned short __a, vector unsigned short __b) {
+  return (vector unsigned short)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_slb(vector signed int __a, vector signed int __b) {
+  return (vector signed int)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_slb(vector signed int __a, vector unsigned int __b) {
+  return (vector signed int)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_slb(vector unsigned int __a, vector signed int __b) {
+  return (vector unsigned int)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_slb(vector unsigned int __a, vector unsigned int __b) {
+  return (vector unsigned int)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_slb(vector signed long long __a, vector signed long long __b) {
+  return (vector signed long long)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_slb(vector signed long long __a, vector unsigned long long __b) {
+  return (vector signed long long)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_slb(vector unsigned long long __a, vector signed long long __b) {
+  return (vector unsigned long long)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_slb(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector unsigned long long)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_slb(vector double __a, vector signed long long __b) {
+  return (vector double)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_slb(vector double __a, vector unsigned long long __b) {
+  return (vector double)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+/*-- vec_sld ----------------------------------------------------------------*/
+
+extern __ATTRS_o vector signed char
+vec_sld(vector signed char __a, vector signed char __b, int __c)
+  __constant_range(__c, 0, 15);
+
+extern __ATTRS_o vector unsigned char
+vec_sld(vector unsigned char __a, vector unsigned char __b, int __c)
+  __constant_range(__c, 0, 15);
+
+extern __ATTRS_o vector signed short
+vec_sld(vector signed short __a, vector signed short __b, int __c)
+  __constant_range(__c, 0, 15);
+
+extern __ATTRS_o vector unsigned short
+vec_sld(vector unsigned short __a, vector unsigned short __b, int __c)
+  __constant_range(__c, 0, 15);
+
+extern __ATTRS_o vector signed int
+vec_sld(vector signed int __a, vector signed int __b, int __c)
+  __constant_range(__c, 0, 15);
+
+extern __ATTRS_o vector unsigned int
+vec_sld(vector unsigned int __a, vector unsigned int __b, int __c)
+  __constant_range(__c, 0, 15);
+
+extern __ATTRS_o vector signed long long
+vec_sld(vector signed long long __a, vector signed long long __b, int __c)
+  __constant_range(__c, 0, 15);
+
+extern __ATTRS_o vector unsigned long long
+vec_sld(vector unsigned long long __a, vector unsigned long long __b, int __c)
+  __constant_range(__c, 0, 15);
+
+extern __ATTRS_o vector double
+vec_sld(vector double __a, vector double __b, int __c)
+  __constant_range(__c, 0, 15);
+
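+/* All vec_sld overloads expand to a single vsldb on the byte-reinterpreted
+ * operands; the outer cast restores the original element type.  (Z) must
+ * be a constant in the range 0..15 (see __constant_range above). */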
+#define vec_sld(X, Y, Z) ((__typeof__((vec_sld)((X), (Y), (Z)))) \
+  __builtin_s390_vsldb((vector unsigned char)(X), \
+                       (vector unsigned char)(Y), (Z)))
+
+/*-- vec_sldw ---------------------------------------------------------------*/
+
+extern __ATTRS_o vector signed char
+vec_sldw(vector signed char __a, vector signed char __b, int __c)
+  __constant_range(__c, 0, 3);
+
+extern __ATTRS_o vector unsigned char
+vec_sldw(vector unsigned char __a, vector unsigned char __b, int __c)
+  __constant_range(__c, 0, 3);
+
+extern __ATTRS_o vector signed short
+vec_sldw(vector signed short __a, vector signed short __b, int __c)
+  __constant_range(__c, 0, 3);
+
+extern __ATTRS_o vector unsigned short
+vec_sldw(vector unsigned short __a, vector unsigned short __b, int __c)
+  __constant_range(__c, 0, 3);
+
+extern __ATTRS_o vector signed int
+vec_sldw(vector signed int __a, vector signed int __b, int __c)
+  __constant_range(__c, 0, 3);
+
+extern __ATTRS_o vector unsigned int
+vec_sldw(vector unsigned int __a, vector unsigned int __b, int __c)
+  __constant_range(__c, 0, 3);
+
+extern __ATTRS_o vector signed long long
+vec_sldw(vector signed long long __a, vector signed long long __b, int __c)
+  __constant_range(__c, 0, 3);
+
+extern __ATTRS_o vector unsigned long long
+vec_sldw(vector unsigned long long __a, vector unsigned long long __b, int __c)
+  __constant_range(__c, 0, 3);
+
+extern __ATTRS_o vector double
+vec_sldw(vector double __a, vector double __b, int __c)
+  __constant_range(__c, 0, 3);
+
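+/* vec_sldw shifts by 4-byte words rather than bytes, so the expansion
+ * reuses vsldb with the word count scaled to a byte count, (Z) * 4. */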
+#define vec_sldw(X, Y, Z) ((__typeof__((vec_sldw)((X), (Y), (Z)))) \
+  __builtin_s390_vsldb((vector unsigned char)(X), \
+                       (vector unsigned char)(Y), (Z) * 4))
+
+/*-- vec_sral ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_sral(vector signed char __a, vector unsigned char __b) {
+  return (vector signed char)__builtin_s390_vsra(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_sral(vector signed char __a, vector unsigned short __b) {
+  return (vector signed char)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_sral(vector signed char __a, vector unsigned int __b) {
+  return (vector signed char)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_sral(vector bool char __a, vector unsigned char __b) {
+  return (vector bool char)__builtin_s390_vsra(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_sral(vector bool char __a, vector unsigned short __b) {
+  return (vector bool char)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_sral(vector bool char __a, vector unsigned int __b) {
+  return (vector bool char)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_sral(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vsra(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_sral(vector unsigned char __a, vector unsigned short __b) {
+  return __builtin_s390_vsra(__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_sral(vector unsigned char __a, vector unsigned int __b) {
+  return __builtin_s390_vsra(__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_sral(vector signed short __a, vector unsigned char __b) {
+  return (vector signed short)__builtin_s390_vsra(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_sral(vector signed short __a, vector unsigned short __b) {
+  return (vector signed short)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_sral(vector signed short __a, vector unsigned int __b) {
+  return (vector signed short)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_sral(vector bool short __a, vector unsigned char __b) {
+  return (vector bool short)__builtin_s390_vsra(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_sral(vector bool short __a, vector unsigned short __b) {
+  return (vector bool short)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_sral(vector bool short __a, vector unsigned int __b) {
+  return (vector bool short)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_sral(vector unsigned short __a, vector unsigned char __b) {
+  return (vector unsigned short)__builtin_s390_vsra(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_sral(vector unsigned short __a, vector unsigned short __b) {
+  return (vector unsigned short)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_sral(vector unsigned short __a, vector unsigned int __b) {
+  return (vector unsigned short)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_sral(vector signed int __a, vector unsigned char __b) {
+  return (vector signed int)__builtin_s390_vsra(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_sral(vector signed int __a, vector unsigned short __b) {
+  return (vector signed int)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_sral(vector signed int __a, vector unsigned int __b) {
+  return (vector signed int)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_sral(vector bool int __a, vector unsigned char __b) {
+  return (vector bool int)__builtin_s390_vsra(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_sral(vector bool int __a, vector unsigned short __b) {
+  return (vector bool int)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_sral(vector bool int __a, vector unsigned int __b) {
+  return (vector bool int)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_sral(vector unsigned int __a, vector unsigned char __b) {
+  return (vector unsigned int)__builtin_s390_vsra(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_sral(vector unsigned int __a, vector unsigned short __b) {
+  return (vector unsigned int)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_sral(vector unsigned int __a, vector unsigned int __b) {
+  return (vector unsigned int)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_sral(vector signed long long __a, vector unsigned char __b) {
+  return (vector signed long long)__builtin_s390_vsra(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_sral(vector signed long long __a, vector unsigned short __b) {
+  return (vector signed long long)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_sral(vector signed long long __a, vector unsigned int __b) {
+  return (vector signed long long)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_sral(vector bool long long __a, vector unsigned char __b) {
+  return (vector bool long long)__builtin_s390_vsra(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_sral(vector bool long long __a, vector unsigned short __b) {
+  return (vector bool long long)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_sral(vector bool long long __a, vector unsigned int __b) {
+  return (vector bool long long)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_sral(vector unsigned long long __a, vector unsigned char __b) {
+  return (vector unsigned long long)__builtin_s390_vsra(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_sral(vector unsigned long long __a, vector unsigned short __b) {
+  return (vector unsigned long long)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_sral(vector unsigned long long __a, vector unsigned int __b) {
+  return (vector unsigned long long)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+/*-- vec_srab ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_srab(vector signed char __a, vector signed char __b) {
+  return (vector signed char)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_srab(vector signed char __a, vector unsigned char __b) {
+  return (vector signed char)__builtin_s390_vsrab(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_srab(vector unsigned char __a, vector signed char __b) {
+  return __builtin_s390_vsrab(__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_srab(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vsrab(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_srab(vector signed short __a, vector signed short __b) {
+  return (vector signed short)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_srab(vector signed short __a, vector unsigned short __b) {
+  return (vector signed short)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_srab(vector unsigned short __a, vector signed short __b) {
+  return (vector unsigned short)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_srab(vector unsigned short __a, vector unsigned short __b) {
+  return (vector unsigned short)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_srab(vector signed int __a, vector signed int __b) {
+  return (vector signed int)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_srab(vector signed int __a, vector unsigned int __b) {
+  return (vector signed int)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_srab(vector unsigned int __a, vector signed int __b) {
+  return (vector unsigned int)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_srab(vector unsigned int __a, vector unsigned int __b) {
+  return (vector unsigned int)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_srab(vector signed long long __a, vector signed long long __b) {
+  return (vector signed long long)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_srab(vector signed long long __a, vector unsigned long long __b) {
+  return (vector signed long long)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_srab(vector unsigned long long __a, vector signed long long __b) {
+  return (vector unsigned long long)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_srab(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector unsigned long long)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_srab(vector double __a, vector signed long long __b) {
+  return (vector double)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_srab(vector double __a, vector unsigned long long __b) {
+  return (vector double)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+/*-- vec_srl ----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_srl(vector signed char __a, vector unsigned char __b) {
+  return (vector signed char)__builtin_s390_vsrl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_srl(vector signed char __a, vector unsigned short __b) {
+  return (vector signed char)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_srl(vector signed char __a, vector unsigned int __b) {
+  return (vector signed char)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_srl(vector bool char __a, vector unsigned char __b) {
+  return (vector bool char)__builtin_s390_vsrl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_srl(vector bool char __a, vector unsigned short __b) {
+  return (vector bool char)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_srl(vector bool char __a, vector unsigned int __b) {
+  return (vector bool char)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_srl(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vsrl(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_srl(vector unsigned char __a, vector unsigned short __b) {
+  return __builtin_s390_vsrl(__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_srl(vector unsigned char __a, vector unsigned int __b) {
+  return __builtin_s390_vsrl(__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_srl(vector signed short __a, vector unsigned char __b) {
+  return (vector signed short)__builtin_s390_vsrl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_srl(vector signed short __a, vector unsigned short __b) {
+  return (vector signed short)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_srl(vector signed short __a, vector unsigned int __b) {
+  return (vector signed short)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_srl(vector bool short __a, vector unsigned char __b) {
+  return (vector bool short)__builtin_s390_vsrl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_srl(vector bool short __a, vector unsigned short __b) {
+  return (vector bool short)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_srl(vector bool short __a, vector unsigned int __b) {
+  return (vector bool short)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_srl(vector unsigned short __a, vector unsigned char __b) {
+  return (vector unsigned short)__builtin_s390_vsrl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_srl(vector unsigned short __a, vector unsigned short __b) {
+  return (vector unsigned short)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_srl(vector unsigned short __a, vector unsigned int __b) {
+  return (vector unsigned short)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_srl(vector signed int __a, vector unsigned char __b) {
+  return (vector signed int)__builtin_s390_vsrl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_srl(vector signed int __a, vector unsigned short __b) {
+  return (vector signed int)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_srl(vector signed int __a, vector unsigned int __b) {
+  return (vector signed int)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_srl(vector bool int __a, vector unsigned char __b) {
+  return (vector bool int)__builtin_s390_vsrl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_srl(vector bool int __a, vector unsigned short __b) {
+  return (vector bool int)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_srl(vector bool int __a, vector unsigned int __b) {
+  return (vector bool int)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_srl(vector unsigned int __a, vector unsigned char __b) {
+  return (vector unsigned int)__builtin_s390_vsrl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_srl(vector unsigned int __a, vector unsigned short __b) {
+  return (vector unsigned int)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_srl(vector unsigned int __a, vector unsigned int __b) {
+  return (vector unsigned int)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_srl(vector signed long long __a, vector unsigned char __b) {
+  return (vector signed long long)__builtin_s390_vsrl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_srl(vector signed long long __a, vector unsigned short __b) {
+  return (vector signed long long)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_srl(vector signed long long __a, vector unsigned int __b) {
+  return (vector signed long long)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_srl(vector bool long long __a, vector unsigned char __b) {
+  return (vector bool long long)__builtin_s390_vsrl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_srl(vector bool long long __a, vector unsigned short __b) {
+  return (vector bool long long)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_srl(vector bool long long __a, vector unsigned int __b) {
+  return (vector bool long long)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_srl(vector unsigned long long __a, vector unsigned char __b) {
+  return (vector unsigned long long)__builtin_s390_vsrl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_srl(vector unsigned long long __a, vector unsigned short __b) {
+  return (vector unsigned long long)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_srl(vector unsigned long long __a, vector unsigned int __b) {
+  return (vector unsigned long long)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+/*-- vec_srb ----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_srb(vector signed char __a, vector signed char __b) {
+  return (vector signed char)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_srb(vector signed char __a, vector unsigned char __b) {
+  return (vector signed char)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_srb(vector unsigned char __a, vector signed char __b) {
+  return __builtin_s390_vsrlb(__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_srb(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vsrlb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_srb(vector signed short __a, vector signed short __b) {
+  return (vector signed short)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_srb(vector signed short __a, vector unsigned short __b) {
+  return (vector signed short)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_srb(vector unsigned short __a, vector signed short __b) {
+  return (vector unsigned short)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_srb(vector unsigned short __a, vector unsigned short __b) {
+  return (vector unsigned short)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_srb(vector signed int __a, vector signed int __b) {
+  return (vector signed int)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_srb(vector signed int __a, vector unsigned int __b) {
+  return (vector signed int)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_srb(vector unsigned int __a, vector signed int __b) {
+  return (vector unsigned int)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_srb(vector unsigned int __a, vector unsigned int __b) {
+  return (vector unsigned int)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_srb(vector signed long long __a, vector signed long long __b) {
+  return (vector signed long long)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_srb(vector signed long long __a, vector unsigned long long __b) {
+  return (vector signed long long)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_srb(vector unsigned long long __a, vector signed long long __b) {
+  return (vector unsigned long long)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_srb(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector unsigned long long)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_srb(vector double __a, vector signed long long __b) {
+  return (vector double)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_srb(vector double __a, vector unsigned long long __b) {
+  return (vector double)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
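+
+/* Note: in this shift family the vec_sral/vec_srl forms shift the whole
+ * 128-bit value right (arithmetically resp. logically) interpreting the
+ * shift operand as a bit count, while vec_srab/vec_srb map to the
+ * 'by byte' instructions (vsrab/vsrlb) and interpret it as a byte count;
+ * see the VSRA/VSRAB and VSRL/VSRLB definitions for exactly which bits of
+ * __b are used.  Every element type is handled by casting through
+ * vector unsigned char, so all overloads funnel into the same builtins. */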
+
+/*-- vec_abs ----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_abs(vector signed char __a) {
+  return vec_sel(__a, -__a, vec_cmplt(__a, (vector signed char)0));
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_abs(vector signed short __a) {
+  return vec_sel(__a, -__a, vec_cmplt(__a, (vector signed short)0));
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_abs(vector signed int __a) {
+  return vec_sel(__a, -__a, vec_cmplt(__a, (vector signed int)0));
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_abs(vector signed long long __a) {
+  return vec_sel(__a, -__a, vec_cmplt(__a, (vector signed long long)0));
+}
+
+static inline __ATTRS_o_ai vector double
+vec_abs(vector double __a) {
+  return __builtin_s390_vflpdb(__a);
+}
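+
+/* Note: the integer vec_abs overloads are open-coded as a compare-and-select
+ * (vec_sel over a vec_cmplt mask) rather than a dedicated builtin; as with
+ * scalar two's-complement negation, the most negative element therefore maps
+ * to itself.  The double overload uses vflpdb (load positive). */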
+
+/*-- vec_nabs ---------------------------------------------------------------*/
+
+static inline __ATTRS_ai vector double
+vec_nabs(vector double __a) {
+  return __builtin_s390_vflndb(__a);
+}
+
+/*-- vec_max ----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_max(vector signed char __a, vector signed char __b) {
+  return vec_sel(__b, __a, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_max(vector signed char __a, vector bool char __b) {
+  vector signed char __bc = (vector signed char)__b;
+  return vec_sel(__bc, __a, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_max(vector bool char __a, vector signed char __b) {
+  vector signed char __ac = (vector signed char)__a;
+  return vec_sel(__b, __ac, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_max(vector unsigned char __a, vector unsigned char __b) {
+  return vec_sel(__b, __a, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_max(vector unsigned char __a, vector bool char __b) {
+  vector unsigned char __bc = (vector unsigned char)__b;
+  return vec_sel(__bc, __a, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_max(vector bool char __a, vector unsigned char __b) {
+  vector unsigned char __ac = (vector unsigned char)__a;
+  return vec_sel(__b, __ac, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_max(vector signed short __a, vector signed short __b) {
+  return vec_sel(__b, __a, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_max(vector signed short __a, vector bool short __b) {
+  vector signed short __bc = (vector signed short)__b;
+  return vec_sel(__bc, __a, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_max(vector bool short __a, vector signed short __b) {
+  vector signed short __ac = (vector signed short)__a;
+  return vec_sel(__b, __ac, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_max(vector unsigned short __a, vector unsigned short __b) {
+  return vec_sel(__b, __a, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_max(vector unsigned short __a, vector bool short __b) {
+  vector unsigned short __bc = (vector unsigned short)__b;
+  return vec_sel(__bc, __a, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_max(vector bool short __a, vector unsigned short __b) {
+  vector unsigned short __ac = (vector unsigned short)__a;
+  return vec_sel(__b, __ac, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_max(vector signed int __a, vector signed int __b) {
+  return vec_sel(__b, __a, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_max(vector signed int __a, vector bool int __b) {
+  vector signed int __bc = (vector signed int)__b;
+  return vec_sel(__bc, __a, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_max(vector bool int __a, vector signed int __b) {
+  vector signed int __ac = (vector signed int)__a;
+  return vec_sel(__b, __ac, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_max(vector unsigned int __a, vector unsigned int __b) {
+  return vec_sel(__b, __a, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_max(vector unsigned int __a, vector bool int __b) {
+  vector unsigned int __bc = (vector unsigned int)__b;
+  return vec_sel(__bc, __a, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_max(vector bool int __a, vector unsigned int __b) {
+  vector unsigned int __ac = (vector unsigned int)__a;
+  return vec_sel(__b, __ac, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_max(vector signed long long __a, vector signed long long __b) {
+  return vec_sel(__b, __a, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_max(vector signed long long __a, vector bool long long __b) {
+  vector signed long long __bc = (vector signed long long)__b;
+  return vec_sel(__bc, __a, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_max(vector bool long long __a, vector signed long long __b) {
+  vector signed long long __ac = (vector signed long long)__a;
+  return vec_sel(__b, __ac, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_max(vector unsigned long long __a, vector unsigned long long __b) {
+  return vec_sel(__b, __a, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_max(vector unsigned long long __a, vector bool long long __b) {
+  vector unsigned long long __bc = (vector unsigned long long)__b;
+  return vec_sel(__bc, __a, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_max(vector bool long long __a, vector unsigned long long __b) {
+  vector unsigned long long __ac = (vector unsigned long long)__a;
+  return vec_sel(__b, __ac, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector double
+vec_max(vector double __a, vector double __b) {
+  return vec_sel(__b, __a, vec_cmpgt(__a, __b));
+}
+
+/*-- vec_min ----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_min(vector signed char __a, vector signed char __b) {
+  return vec_sel(__a, __b, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_min(vector signed char __a, vector bool char __b) {
+  vector signed char __bc = (vector signed char)__b;
+  return vec_sel(__a, __bc, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_min(vector bool char __a, vector signed char __b) {
+  vector signed char __ac = (vector signed char)__a;
+  return vec_sel(__ac, __b, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_min(vector unsigned char __a, vector unsigned char __b) {
+  return vec_sel(__a, __b, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_min(vector unsigned char __a, vector bool char __b) {
+  vector unsigned char __bc = (vector unsigned char)__b;
+  return vec_sel(__a, __bc, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_min(vector bool char __a, vector unsigned char __b) {
+  vector unsigned char __ac = (vector unsigned char)__a;
+  return vec_sel(__ac, __b, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_min(vector signed short __a, vector signed short __b) {
+  return vec_sel(__a, __b, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_min(vector signed short __a, vector bool short __b) {
+  vector signed short __bc = (vector signed short)__b;
+  return vec_sel(__a, __bc, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_min(vector bool short __a, vector signed short __b) {
+  vector signed short __ac = (vector signed short)__a;
+  return vec_sel(__ac, __b, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_min(vector unsigned short __a, vector unsigned short __b) {
+  return vec_sel(__a, __b, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_min(vector unsigned short __a, vector bool short __b) {
+  vector unsigned short __bc = (vector unsigned short)__b;
+  return vec_sel(__a, __bc, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_min(vector bool short __a, vector unsigned short __b) {
+  vector unsigned short __ac = (vector unsigned short)__a;
+  return vec_sel(__ac, __b, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_min(vector signed int __a, vector signed int __b) {
+  return vec_sel(__a, __b, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_min(vector signed int __a, vector bool int __b) {
+  vector signed int __bc = (vector signed int)__b;
+  return vec_sel(__a, __bc, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_min(vector bool int __a, vector signed int __b) {
+  vector signed int __ac = (vector signed int)__a;
+  return vec_sel(__ac, __b, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_min(vector unsigned int __a, vector unsigned int __b) {
+  return vec_sel(__a, __b, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_min(vector unsigned int __a, vector bool int __b) {
+  vector unsigned int __bc = (vector unsigned int)__b;
+  return vec_sel(__a, __bc, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_min(vector bool int __a, vector unsigned int __b) {
+  vector unsigned int __ac = (vector unsigned int)__a;
+  return vec_sel(__ac, __b, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_min(vector signed long long __a, vector signed long long __b) {
+  return vec_sel(__a, __b, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_min(vector signed long long __a, vector bool long long __b) {
+  vector signed long long __bc = (vector signed long long)__b;
+  return vec_sel(__a, __bc, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_min(vector bool long long __a, vector signed long long __b) {
+  vector signed long long __ac = (vector signed long long)__a;
+  return vec_sel(__ac, __b, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_min(vector unsigned long long __a, vector unsigned long long __b) {
+  return vec_sel(__a, __b, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_min(vector unsigned long long __a, vector bool long long __b) {
+  vector unsigned long long __bc = (vector unsigned long long)__b;
+  return vec_sel(__a, __bc, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_min(vector bool long long __a, vector unsigned long long __b) {
+  vector unsigned long long __ac = (vector unsigned long long)__a;
+  return vec_sel(__ac, __b, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector double
+vec_min(vector double __a, vector double __b) {
+  return vec_sel(__a, __b, vec_cmpgt(__a, __b));
+}
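+
+/* Note: vec_max and vec_min are likewise open-coded as compare-and-select;
+ * the overloads taking a bool vector first reinterpret it as the matching
+ * signed/unsigned element type before comparing. */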
+
+/*-- vec_add_u128 -----------------------------------------------------------*/
+
+static inline __ATTRS_ai vector unsigned char
+vec_add_u128(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vaq(__a, __b);
+}
+
+/*-- vec_addc ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_addc(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vaccb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_addc(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vacch(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_addc(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vaccf(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_addc(vector unsigned long long __a, vector unsigned long long __b) {
+  return __builtin_s390_vaccg(__a, __b);
+}
+
+/*-- vec_addc_u128 ----------------------------------------------------------*/
+
+static inline __ATTRS_ai vector unsigned char
+vec_addc_u128(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vaccq(__a, __b);
+}
+
+/*-- vec_adde_u128 ----------------------------------------------------------*/
+
+static inline __ATTRS_ai vector unsigned char
+vec_adde_u128(vector unsigned char __a, vector unsigned char __b,
+              vector unsigned char __c) {
+  return __builtin_s390_vacq(__a, __b, __c);
+}
+
+/*-- vec_addec_u128 ---------------------------------------------------------*/
+
+static inline __ATTRS_ai vector unsigned char
+vec_addec_u128(vector unsigned char __a, vector unsigned char __b,
+               vector unsigned char __c) {
+  return __builtin_s390_vacccq(__a, __b, __c);
+}
+
+/*-- vec_avg ----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_avg(vector signed char __a, vector signed char __b) {
+  return __builtin_s390_vavgb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_avg(vector signed short __a, vector signed short __b) {
+  return __builtin_s390_vavgh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_avg(vector signed int __a, vector signed int __b) {
+  return __builtin_s390_vavgf(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_avg(vector signed long long __a, vector signed long long __b) {
+  return __builtin_s390_vavgg(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_avg(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vavglb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_avg(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vavglh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_avg(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vavglf(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_avg(vector unsigned long long __a, vector unsigned long long __b) {
+  return __builtin_s390_vavglg(__a, __b);
+}
+
+/*-- vec_checksum -----------------------------------------------------------*/
+
+static inline __ATTRS_ai vector unsigned int
+vec_checksum(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vcksm(__a, __b);
+}
+
+/*-- vec_gfmsum -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_gfmsum(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vgfmb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_gfmsum(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vgfmh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_gfmsum(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vgfmf(__a, __b);
+}
+
+/*-- vec_gfmsum_128 ---------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_gfmsum_128(vector unsigned long long __a, vector unsigned long long __b) {
+  return __builtin_s390_vgfmg(__a, __b);
+}
+
+/*-- vec_gfmsum_accum -------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_gfmsum_accum(vector unsigned char __a, vector unsigned char __b,
+                 vector unsigned short __c) {
+  return __builtin_s390_vgfmab(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_gfmsum_accum(vector unsigned short __a, vector unsigned short __b,
+                 vector unsigned int __c) {
+  return __builtin_s390_vgfmah(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_gfmsum_accum(vector unsigned int __a, vector unsigned int __b,
+                 vector unsigned long long __c) {
+  return __builtin_s390_vgfmaf(__a, __b, __c);
+}
+
+/*-- vec_gfmsum_accum_128 ---------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_gfmsum_accum_128(vector unsigned long long __a,
+                     vector unsigned long long __b,
+                     vector unsigned char __c) {
+  return __builtin_s390_vgfmag(__a, __b, __c);
+}
+
+/*-- vec_mladd --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_mladd(vector signed char __a, vector signed char __b,
+          vector signed char __c) {
+  return __a * __b + __c;
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_mladd(vector unsigned char __a, vector signed char __b,
+          vector signed char __c) {
+  return (vector signed char)__a * __b + __c;
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_mladd(vector signed char __a, vector unsigned char __b,
+          vector unsigned char __c) {
+  return __a * (vector signed char)__b + (vector signed char)__c;
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_mladd(vector unsigned char __a, vector unsigned char __b,
+          vector unsigned char __c) {
+  return __a * __b + __c;
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_mladd(vector signed short __a, vector signed short __b,
+          vector signed short __c) {
+  return __a * __b + __c;
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_mladd(vector unsigned short __a, vector signed short __b,
+          vector signed short __c) {
+  return (vector signed short)__a * __b + __c;
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_mladd(vector signed short __a, vector unsigned short __b,
+          vector unsigned short __c) {
+  return __a * (vector signed short)__b + (vector signed short)__c;
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_mladd(vector unsigned short __a, vector unsigned short __b,
+          vector unsigned short __c) {
+  return __a * __b + __c;
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_mladd(vector signed int __a, vector signed int __b,
+          vector signed int __c) {
+  return __a * __b + __c;
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_mladd(vector unsigned int __a, vector signed int __b,
+          vector signed int __c) {
+  return (vector signed int)__a * __b + __c;
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_mladd(vector signed int __a, vector unsigned int __b,
+          vector unsigned int __c) {
+  return __a * (vector signed int)__b + (vector signed int)__c;
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_mladd(vector unsigned int __a, vector unsigned int __b,
+          vector unsigned int __c) {
+  return __a * __b + __c;
+}
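+
+/* Note: vec_mladd is a plain element-wise multiply-add that keeps only the
+ * low half of each product, so it can be written with ordinary vector
+ * operators; the mixed signed/unsigned overloads simply reinterpret the bits.
+ * The vec_mhadd overloads below return the high half of each product instead
+ * (vmah*/vmalh*, multiply-and-add high / logical high). */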
+
+/*-- vec_mhadd --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_mhadd(vector signed char __a, vector signed char __b,
+          vector signed char __c) {
+  return __builtin_s390_vmahb(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_mhadd(vector unsigned char __a, vector unsigned char __b,
+          vector unsigned char __c) {
+  return __builtin_s390_vmalhb(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_mhadd(vector signed short __a, vector signed short __b,
+          vector signed short __c) {
+  return __builtin_s390_vmahh(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_mhadd(vector unsigned short __a, vector unsigned short __b,
+          vector unsigned short __c) {
+  return __builtin_s390_vmalhh(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_mhadd(vector signed int __a, vector signed int __b,
+          vector signed int __c) {
+  return __builtin_s390_vmahf(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_mhadd(vector unsigned int __a, vector unsigned int __b,
+          vector unsigned int __c) {
+  return __builtin_s390_vmalhf(__a, __b, __c);
+}
+
+/*-- vec_meadd --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed short
+vec_meadd(vector signed char __a, vector signed char __b,
+          vector signed short __c) {
+  return __builtin_s390_vmaeb(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_meadd(vector unsigned char __a, vector unsigned char __b,
+          vector unsigned short __c) {
+  return __builtin_s390_vmaleb(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_meadd(vector signed short __a, vector signed short __b,
+          vector signed int __c) {
+  return __builtin_s390_vmaeh(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_meadd(vector unsigned short __a, vector unsigned short __b,
+          vector unsigned int __c) {
+  return __builtin_s390_vmaleh(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_meadd(vector signed int __a, vector signed int __b,
+          vector signed long long __c) {
+  return __builtin_s390_vmaef(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_meadd(vector unsigned int __a, vector unsigned int __b,
+          vector unsigned long long __c) {
+  return __builtin_s390_vmalef(__a, __b, __c);
+}
+
+/*-- vec_moadd --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed short
+vec_moadd(vector signed char __a, vector signed char __b,
+          vector signed short __c) {
+  return __builtin_s390_vmaob(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_moadd(vector unsigned char __a, vector unsigned char __b,
+          vector unsigned short __c) {
+  return __builtin_s390_vmalob(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_moadd(vector signed short __a, vector signed short __b,
+          vector signed int __c) {
+  return __builtin_s390_vmaoh(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_moadd(vector unsigned short __a, vector unsigned short __b,
+          vector unsigned int __c) {
+  return __builtin_s390_vmaloh(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_moadd(vector signed int __a, vector signed int __b,
+          vector signed long long __c) {
+  return __builtin_s390_vmaof(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_moadd(vector unsigned int __a, vector unsigned int __b,
+          vector unsigned long long __c) {
+  return __builtin_s390_vmalof(__a, __b, __c);
+}
+
+/*-- vec_mulh ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_mulh(vector signed char __a, vector signed char __b) {
+  return __builtin_s390_vmhb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_mulh(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vmlhb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_mulh(vector signed short __a, vector signed short __b) {
+  return __builtin_s390_vmhh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_mulh(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vmlhh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_mulh(vector signed int __a, vector signed int __b) {
+  return __builtin_s390_vmhf(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_mulh(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vmlhf(__a, __b);
+}
+
+/*-- vec_mule ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed short
+vec_mule(vector signed char __a, vector signed char __b) {
+  return __builtin_s390_vmeb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_mule(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vmleb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_mule(vector signed short __a, vector signed short __b) {
+  return __builtin_s390_vmeh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_mule(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vmleh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_mule(vector signed int __a, vector signed int __b) {
+  return __builtin_s390_vmef(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_mule(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vmlef(__a, __b);
+}
+
+/*-- vec_mulo ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed short
+vec_mulo(vector signed char __a, vector signed char __b) {
+  return __builtin_s390_vmob(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_mulo(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vmlob(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_mulo(vector signed short __a, vector signed short __b) {
+  return __builtin_s390_vmoh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_mulo(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vmloh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_mulo(vector signed int __a, vector signed int __b) {
+  return __builtin_s390_vmof(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_mulo(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vmlof(__a, __b);
+}
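+
+/* Note: vec_mule/vec_mulo are widening multiplies of the even- resp.
+ * odd-indexed elements, producing results twice as wide (e.g. signed char
+ * inputs yield vector signed short); vec_meadd/vec_moadd above are the
+ * corresponding widening multiply-and-add forms. */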
+
+/*-- vec_sub_u128 -----------------------------------------------------------*/
+
+static inline __ATTRS_ai vector unsigned char
+vec_sub_u128(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vsq(__a, __b);
+}
+
+/*-- vec_subc ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_subc(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vscbib(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_subc(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vscbih(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_subc(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vscbif(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_subc(vector unsigned long long __a, vector unsigned long long __b) {
+  return __builtin_s390_vscbig(__a, __b);
+}
+
+/*-- vec_subc_u128 ----------------------------------------------------------*/
+
+static inline __ATTRS_ai vector unsigned char
+vec_subc_u128(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vscbiq(__a, __b);
+}
+
+/*-- vec_sube_u128 ----------------------------------------------------------*/
+
+static inline __ATTRS_ai vector unsigned char
+vec_sube_u128(vector unsigned char __a, vector unsigned char __b,
+              vector unsigned char __c) {
+  return __builtin_s390_vsbiq(__a, __b, __c);
+}
+
+/*-- vec_subec_u128 ---------------------------------------------------------*/
+
+static inline __ATTRS_ai vector unsigned char
+vec_subec_u128(vector unsigned char __a, vector unsigned char __b,
+               vector unsigned char __c) {
+  return __builtin_s390_vsbcbiq(__a, __b, __c);
+}
+
+/*-- vec_sum2 ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_sum2(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vsumgh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_sum2(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vsumgf(__a, __b);
+}
+
+/*-- vec_sum_u128 -----------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_sum_u128(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vsumqf(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_sum_u128(vector unsigned long long __a, vector unsigned long long __b) {
+  return __builtin_s390_vsumqg(__a, __b);
+}
+
+/*-- vec_sum4 ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_sum4(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vsumb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_sum4(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vsumh(__a, __b);
+}
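+
+/* Note: the vec_sum* family produces partial horizontal sums: vec_sum4 sums
+ * the elements of __a within each word, vec_sum2 within each doubleword, and
+ * vec_sum_u128 across the whole vector (returned as a 128-bit value in a
+ * vector unsigned char).  Each group also folds in one element of __b; see
+ * the VSUM/VSUMG/VSUMQ definitions for exactly which. */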
+
+/*-- vec_test_mask ----------------------------------------------------------*/
+
+static inline __ATTRS_o_ai int
+vec_test_mask(vector signed char __a, vector unsigned char __b) {
+  return __builtin_s390_vtm((vector unsigned char)__a,
+                            (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai int
+vec_test_mask(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vtm(__a, __b);
+}
+
+static inline __ATTRS_o_ai int
+vec_test_mask(vector signed short __a, vector unsigned short __b) {
+  return __builtin_s390_vtm((vector unsigned char)__a,
+                            (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai int
+vec_test_mask(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vtm((vector unsigned char)__a,
+                            (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai int
+vec_test_mask(vector signed int __a, vector unsigned int __b) {
+  return __builtin_s390_vtm((vector unsigned char)__a,
+                            (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai int
+vec_test_mask(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vtm((vector unsigned char)__a,
+                            (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai int
+vec_test_mask(vector signed long long __a, vector unsigned long long __b) {
+  return __builtin_s390_vtm((vector unsigned char)__a,
+                            (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai int
+vec_test_mask(vector unsigned long long __a, vector unsigned long long __b) {
+  return __builtin_s390_vtm((vector unsigned char)__a,
+                            (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai int
+vec_test_mask(vector double __a, vector unsigned long long __b) {
+  return __builtin_s390_vtm((vector unsigned char)__a,
+                            (vector unsigned char)__b);
+}
+
+/*-- vec_madd ---------------------------------------------------------------*/
+
+static inline __ATTRS_ai vector double
+vec_madd(vector double __a, vector double __b, vector double __c) {
+  return __builtin_s390_vfmadb(__a, __b, __c);
+}
+
+/*-- vec_msub ---------------------------------------------------------------*/
+
+static inline __ATTRS_ai vector double
+vec_msub(vector double __a, vector double __b, vector double __c) {
+  return __builtin_s390_vfmsdb(__a, __b, __c);
+}
+
+/*-- vec_sqrt ---------------------------------------------------------------*/
+
+static inline __ATTRS_ai vector double
+vec_sqrt(vector double __a) {
+  return __builtin_s390_vfsqdb(__a);
+}
+
+/*-- vec_ld2f ---------------------------------------------------------------*/
+
+static inline __ATTRS_ai vector double
+vec_ld2f(const float *__ptr) {
+  typedef float __v2f32 __attribute__((__vector_size__(8)));
+  return __builtin_convertvector(*(const __v2f32 *)__ptr, vector double);
+}
+
+/*-- vec_st2f ---------------------------------------------------------------*/
+
+static inline __ATTRS_ai void
+vec_st2f(vector double __a, float *__ptr) {
+  typedef float __v2f32 __attribute__((__vector_size__(8)));
+  *(__v2f32 *)__ptr = __builtin_convertvector(__a, __v2f32);
+}
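+
+/* Note: vec_ld2f loads two adjacent floats and converts them to a vector
+ * double; vec_st2f is the inverse, narrowing back to float before the store.
+ * Both go through __builtin_convertvector on a local 2 x float vector type. */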
+
+/*-- vec_ctd ----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector double
+vec_ctd(vector signed long long __a, int __b)
+  __constant_range(__b, 0, 31) {
+  vector double __conv = __builtin_convertvector(__a, vector double);
+  __conv *= (vector double)(vector unsigned long long)((0x3ffULL - __b) << 52);
+  return __conv;
+}
+
+static inline __ATTRS_o_ai vector double
+vec_ctd(vector unsigned long long __a, int __b)
+  __constant_range(__b, 0, 31) {
+  vector double __conv = __builtin_convertvector(__a, vector double);
+  __conv *= (vector double)(vector unsigned long long)((0x3ffULL - __b) << 52);
+  return __conv;
+}
+
+/*-- vec_ctsl ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed long long
+vec_ctsl(vector double __a, int __b)
+  __constant_range(__b, 0, 31) {
+  __a *= (vector double)(vector unsigned long long)((0x3ffULL + __b) << 52);
+  return __builtin_convertvector(__a, vector signed long long);
+}
+
+/*-- vec_ctul ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_ctul(vector double __a, int __b)
+  __constant_range(__b, 0, 31) {
+  __a *= (vector double)(vector unsigned long long)((0x3ffULL + __b) << 52);
+  return __builtin_convertvector(__a, vector unsigned long long);
+}
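+
+/* Note: these fixed-point conversions scale by a power of two by constructing
+ * the IEEE-754 double 2^-__b (vec_ctd) or 2^__b (vec_ctsl/vec_ctul) directly:
+ * 0x3ff is the double exponent bias and the shift by 52 places the biased
+ * exponent above the 52-bit mantissa.  For example, with __b == 2,
+ * (0x3ffULL - 2) << 52 is the bit pattern of 0.25, so vec_ctd divides the
+ * converted values by 4, i.e. treats them as having two fraction bits. */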
+
+/*-- vec_roundp -------------------------------------------------------------*/
+
+static inline __ATTRS_ai vector double
+vec_roundp(vector double __a) {
+  return __builtin_s390_vfidb(__a, 4, 6);
+}
+
+/*-- vec_ceil ---------------------------------------------------------------*/
+
+static inline __ATTRS_ai vector double
+vec_ceil(vector double __a) {
+  // On this platform, vec_ceil never triggers the IEEE-inexact exception.
+  return __builtin_s390_vfidb(__a, 4, 6);
+}
+
+/*-- vec_roundm -------------------------------------------------------------*/
+
+static inline __ATTRS_ai vector double
+vec_roundm(vector double __a) {
+  return __builtin_s390_vfidb(__a, 4, 7);
+}
+
+/*-- vec_floor --------------------------------------------------------------*/
+
+static inline __ATTRS_ai vector double
+vec_floor(vector double __a) {
+  // On this platform, vec_floor never triggers the IEEE-inexact exception.
+  return __builtin_s390_vfidb(__a, 4, 7);
+}
+
+/*-- vec_roundz -------------------------------------------------------------*/
+
+static inline __ATTRS_ai vector double
+vec_roundz(vector double __a) {
+  return __builtin_s390_vfidb(__a, 4, 5);
+}
+
+/*-- vec_trunc --------------------------------------------------------------*/
+
+static inline __ATTRS_ai vector double
+vec_trunc(vector double __a) {
+  // On this platform, vec_trunc never triggers the IEEE-inexact exception.
+  return __builtin_s390_vfidb(__a, 4, 5);
+}
+
+/*-- vec_roundc -------------------------------------------------------------*/
+
+static inline __ATTRS_ai vector double
+vec_roundc(vector double __a) {
+  return __builtin_s390_vfidb(__a, 4, 0);
+}
+
+/*-- vec_round --------------------------------------------------------------*/
+
+static inline __ATTRS_ai vector double
+vec_round(vector double __a) {
+  return __builtin_s390_vfidb(__a, 4, 4);
+}
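+
+/* Note: all of the rounding helpers map to vfidb; the second argument (4)
+ * appears to be the modifier that suppresses the IEEE-inexact exception
+ * (consistent with the comments on vec_ceil, vec_floor and vec_trunc above),
+ * while the third selects the rounding mode: 6 toward +infinity, 7 toward
+ * -infinity, 5 toward zero, 4 to nearest, 0 according to the current mode. */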
+
+/*-- vec_fp_test_data_class -------------------------------------------------*/
+
+#define vec_fp_test_data_class(X, Y, Z) \
+  ((vector bool long long)__builtin_s390_vftcidb((X), (Y), (Z)))
+
+/*-- vec_cp_until_zero ------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_cp_until_zero(vector signed char __a) {
+  return (vector signed char)__builtin_s390_vistrb((vector unsigned char)__a);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_cp_until_zero(vector bool char __a) {
+  return (vector bool char)__builtin_s390_vistrb((vector unsigned char)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cp_until_zero(vector unsigned char __a) {
+  return __builtin_s390_vistrb(__a);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_cp_until_zero(vector signed short __a) {
+  return (vector signed short)__builtin_s390_vistrh((vector unsigned short)__a);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cp_until_zero(vector bool short __a) {
+  return (vector bool short)__builtin_s390_vistrh((vector unsigned short)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cp_until_zero(vector unsigned short __a) {
+  return __builtin_s390_vistrh(__a);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_cp_until_zero(vector signed int __a) {
+  return (vector signed int)__builtin_s390_vistrf((vector unsigned int)__a);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cp_until_zero(vector bool int __a) {
+  return (vector bool int)__builtin_s390_vistrf((vector unsigned int)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cp_until_zero(vector unsigned int __a) {
+  return __builtin_s390_vistrf(__a);
+}
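+
+/* Note: vec_cp_until_zero maps to the vistr ('isolate string') builtins,
+ * which copy __a but force every element after the first zero element to
+ * zero; the _cc variants below additionally return the condition code, which
+ * reflects whether a zero element was found. */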
+
+/*-- vec_cp_until_zero_cc ---------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_cp_until_zero_cc(vector signed char __a, int *__cc) {
+  return (vector signed char)
+    __builtin_s390_vistrbs((vector unsigned char)__a, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_cp_until_zero_cc(vector bool char __a, int *__cc) {
+  return (vector bool char)
+    __builtin_s390_vistrbs((vector unsigned char)__a, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cp_until_zero_cc(vector unsigned char __a, int *__cc) {
+  return __builtin_s390_vistrbs(__a, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_cp_until_zero_cc(vector signed short __a, int *__cc) {
+  return (vector signed short)
+    __builtin_s390_vistrhs((vector unsigned short)__a, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cp_until_zero_cc(vector bool short __a, int *__cc) {
+  return (vector bool short)
+    __builtin_s390_vistrhs((vector unsigned short)__a, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cp_until_zero_cc(vector unsigned short __a, int *__cc) {
+  return __builtin_s390_vistrhs(__a, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_cp_until_zero_cc(vector signed int __a, int *__cc) {
+  return (vector signed int)
+    __builtin_s390_vistrfs((vector unsigned int)__a, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cp_until_zero_cc(vector bool int __a, int *__cc) {
+  return (vector bool int)__builtin_s390_vistrfs((vector unsigned int)__a,
+                                                 __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cp_until_zero_cc(vector unsigned int __a, int *__cc) {
+  return __builtin_s390_vistrfs(__a, __cc);
+}
+
+/*-- vec_cmpeq_idx ----------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_cmpeq_idx(vector signed char __a, vector signed char __b) {
+  return (vector signed char)
+    __builtin_s390_vfeeb((vector unsigned char)__a,
+                         (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpeq_idx(vector bool char __a, vector bool char __b) {
+  return __builtin_s390_vfeeb((vector unsigned char)__a,
+                              (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpeq_idx(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vfeeb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_cmpeq_idx(vector signed short __a, vector signed short __b) {
+  return (vector signed short)
+    __builtin_s390_vfeeh((vector unsigned short)__a,
+                         (vector unsigned short)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpeq_idx(vector bool short __a, vector bool short __b) {
+  return __builtin_s390_vfeeh((vector unsigned short)__a,
+                              (vector unsigned short)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpeq_idx(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vfeeh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_cmpeq_idx(vector signed int __a, vector signed int __b) {
+  return (vector signed int)
+    __builtin_s390_vfeef((vector unsigned int)__a,
+                         (vector unsigned int)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpeq_idx(vector bool int __a, vector bool int __b) {
+  return __builtin_s390_vfeef((vector unsigned int)__a,
+                              (vector unsigned int)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpeq_idx(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vfeef(__a, __b);
+}
+
+/*-- vec_cmpeq_idx_cc -------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_cmpeq_idx_cc(vector signed char __a, vector signed char __b, int *__cc) {
+  return (vector signed char)
+    __builtin_s390_vfeebs((vector unsigned char)__a,
+                          (vector unsigned char)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpeq_idx_cc(vector bool char __a, vector bool char __b, int *__cc) {
+  return __builtin_s390_vfeebs((vector unsigned char)__a,
+                               (vector unsigned char)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpeq_idx_cc(vector unsigned char __a, vector unsigned char __b,
+                 int *__cc) {
+  return __builtin_s390_vfeebs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_cmpeq_idx_cc(vector signed short __a, vector signed short __b, int *__cc) {
+  return (vector signed short)
+    __builtin_s390_vfeehs((vector unsigned short)__a,
+                          (vector unsigned short)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpeq_idx_cc(vector bool short __a, vector bool short __b, int *__cc) {
+  return __builtin_s390_vfeehs((vector unsigned short)__a,
+                               (vector unsigned short)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpeq_idx_cc(vector unsigned short __a, vector unsigned short __b,
+                 int *__cc) {
+  return __builtin_s390_vfeehs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_cmpeq_idx_cc(vector signed int __a, vector signed int __b, int *__cc) {
+  return (vector signed int)
+    __builtin_s390_vfeefs((vector unsigned int)__a,
+                          (vector unsigned int)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpeq_idx_cc(vector bool int __a, vector bool int __b, int *__cc) {
+  return __builtin_s390_vfeefs((vector unsigned int)__a,
+                               (vector unsigned int)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpeq_idx_cc(vector unsigned int __a, vector unsigned int __b, int *__cc) {
+  return __builtin_s390_vfeefs(__a, __b, __cc);
+}
+
+/*-- vec_cmpeq_or_0_idx -----------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_cmpeq_or_0_idx(vector signed char __a, vector signed char __b) {
+  return (vector signed char)
+    __builtin_s390_vfeezb((vector unsigned char)__a,
+                          (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpeq_or_0_idx(vector bool char __a, vector bool char __b) {
+  return __builtin_s390_vfeezb((vector unsigned char)__a,
+                               (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpeq_or_0_idx(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vfeezb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_cmpeq_or_0_idx(vector signed short __a, vector signed short __b) {
+  return (vector signed short)
+    __builtin_s390_vfeezh((vector unsigned short)__a,
+                          (vector unsigned short)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpeq_or_0_idx(vector bool short __a, vector bool short __b) {
+  return __builtin_s390_vfeezh((vector unsigned short)__a,
+                               (vector unsigned short)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpeq_or_0_idx(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vfeezh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_cmpeq_or_0_idx(vector signed int __a, vector signed int __b) {
+  return (vector signed int)
+    __builtin_s390_vfeezf((vector unsigned int)__a,
+                          (vector unsigned int)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpeq_or_0_idx(vector bool int __a, vector bool int __b) {
+  return __builtin_s390_vfeezf((vector unsigned int)__a,
+                               (vector unsigned int)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpeq_or_0_idx(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vfeezf(__a, __b);
+}
+
+/*-- vec_cmpeq_or_0_idx_cc --------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_cmpeq_or_0_idx_cc(vector signed char __a, vector signed char __b,
+                      int *__cc) {
+  return (vector signed char)
+    __builtin_s390_vfeezbs((vector unsigned char)__a,
+                           (vector unsigned char)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpeq_or_0_idx_cc(vector bool char __a, vector bool char __b, int *__cc) {
+  return __builtin_s390_vfeezbs((vector unsigned char)__a,
+                                (vector unsigned char)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpeq_or_0_idx_cc(vector unsigned char __a, vector unsigned char __b,
+                      int *__cc) {
+  return __builtin_s390_vfeezbs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_cmpeq_or_0_idx_cc(vector signed short __a, vector signed short __b,
+                      int *__cc) {
+  return (vector signed short)
+    __builtin_s390_vfeezhs((vector unsigned short)__a,
+                           (vector unsigned short)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpeq_or_0_idx_cc(vector bool short __a, vector bool short __b, int *__cc) {
+  return __builtin_s390_vfeezhs((vector unsigned short)__a,
+                                (vector unsigned short)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpeq_or_0_idx_cc(vector unsigned short __a, vector unsigned short __b,
+                      int *__cc) {
+  return __builtin_s390_vfeezhs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_cmpeq_or_0_idx_cc(vector signed int __a, vector signed int __b, int *__cc) {
+  return (vector signed int)
+    __builtin_s390_vfeezfs((vector unsigned int)__a,
+                           (vector unsigned int)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpeq_or_0_idx_cc(vector bool int __a, vector bool int __b, int *__cc) {
+  return __builtin_s390_vfeezfs((vector unsigned int)__a,
+                                (vector unsigned int)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpeq_or_0_idx_cc(vector unsigned int __a, vector unsigned int __b,
+                      int *__cc) {
+  return __builtin_s390_vfeezfs(__a, __b, __cc);
+}
+
+/*-- vec_cmpne_idx ----------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_cmpne_idx(vector signed char __a, vector signed char __b) {
+  return (vector signed char)
+    __builtin_s390_vfeneb((vector unsigned char)__a,
+                          (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpne_idx(vector bool char __a, vector bool char __b) {
+  return __builtin_s390_vfeneb((vector unsigned char)__a,
+                               (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpne_idx(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vfeneb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_cmpne_idx(vector signed short __a, vector signed short __b) {
+  return (vector signed short)
+    __builtin_s390_vfeneh((vector unsigned short)__a,
+                          (vector unsigned short)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpne_idx(vector bool short __a, vector bool short __b) {
+  return __builtin_s390_vfeneh((vector unsigned short)__a,
+                               (vector unsigned short)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpne_idx(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vfeneh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_cmpne_idx(vector signed int __a, vector signed int __b) {
+  return (vector signed int)
+    __builtin_s390_vfenef((vector unsigned int)__a,
+                          (vector unsigned int)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpne_idx(vector bool int __a, vector bool int __b) {
+  return __builtin_s390_vfenef((vector unsigned int)__a,
+                               (vector unsigned int)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpne_idx(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vfenef(__a, __b);
+}
+
+/*-- vec_cmpne_idx_cc -------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_cmpne_idx_cc(vector signed char __a, vector signed char __b, int *__cc) {
+  return (vector signed char)
+    __builtin_s390_vfenebs((vector unsigned char)__a,
+                           (vector unsigned char)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpne_idx_cc(vector bool char __a, vector bool char __b, int *__cc) {
+  return __builtin_s390_vfenebs((vector unsigned char)__a,
+                                (vector unsigned char)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpne_idx_cc(vector unsigned char __a, vector unsigned char __b,
+                 int *__cc) {
+  return __builtin_s390_vfenebs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_cmpne_idx_cc(vector signed short __a, vector signed short __b, int *__cc) {
+  return (vector signed short)
+    __builtin_s390_vfenehs((vector unsigned short)__a,
+                           (vector unsigned short)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpne_idx_cc(vector bool short __a, vector bool short __b, int *__cc) {
+  return __builtin_s390_vfenehs((vector unsigned short)__a,
+                                (vector unsigned short)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpne_idx_cc(vector unsigned short __a, vector unsigned short __b,
+                 int *__cc) {
+  return __builtin_s390_vfenehs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_cmpne_idx_cc(vector signed int __a, vector signed int __b, int *__cc) {
+  return (vector signed int)
+    __builtin_s390_vfenefs((vector unsigned int)__a,
+                           (vector unsigned int)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpne_idx_cc(vector bool int __a, vector bool int __b, int *__cc) {
+  return __builtin_s390_vfenefs((vector unsigned int)__a,
+                                (vector unsigned int)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpne_idx_cc(vector unsigned int __a, vector unsigned int __b, int *__cc) {
+  return __builtin_s390_vfenefs(__a, __b, __cc);
+}
+
+/*-- vec_cmpne_or_0_idx -----------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_cmpne_or_0_idx(vector signed char __a, vector signed char __b) {
+  return (vector signed char)
+    __builtin_s390_vfenezb((vector unsigned char)__a,
+                           (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpne_or_0_idx(vector bool char __a, vector bool char __b) {
+  return __builtin_s390_vfenezb((vector unsigned char)__a,
+                                (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpne_or_0_idx(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vfenezb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_cmpne_or_0_idx(vector signed short __a, vector signed short __b) {
+  return (vector signed short)
+    __builtin_s390_vfenezh((vector unsigned short)__a,
+                           (vector unsigned short)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpne_or_0_idx(vector bool short __a, vector bool short __b) {
+  return __builtin_s390_vfenezh((vector unsigned short)__a,
+                                (vector unsigned short)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpne_or_0_idx(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vfenezh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_cmpne_or_0_idx(vector signed int __a, vector signed int __b) {
+  return (vector signed int)
+    __builtin_s390_vfenezf((vector unsigned int)__a,
+                           (vector unsigned int)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpne_or_0_idx(vector bool int __a, vector bool int __b) {
+  return __builtin_s390_vfenezf((vector unsigned int)__a,
+                                (vector unsigned int)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpne_or_0_idx(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vfenezf(__a, __b);
+}
+
+/*-- vec_cmpne_or_0_idx_cc --------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_cmpne_or_0_idx_cc(vector signed char __a, vector signed char __b,
+                      int *__cc) {
+  return (vector signed char)
+    __builtin_s390_vfenezbs((vector unsigned char)__a,
+                            (vector unsigned char)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpne_or_0_idx_cc(vector bool char __a, vector bool char __b, int *__cc) {
+  return __builtin_s390_vfenezbs((vector unsigned char)__a,
+                                 (vector unsigned char)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpne_or_0_idx_cc(vector unsigned char __a, vector unsigned char __b,
+                      int *__cc) {
+  return __builtin_s390_vfenezbs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_cmpne_or_0_idx_cc(vector signed short __a, vector signed short __b,
+                      int *__cc) {
+  return (vector signed short)
+    __builtin_s390_vfenezhs((vector unsigned short)__a,
+                            (vector unsigned short)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpne_or_0_idx_cc(vector bool short __a, vector bool short __b, int *__cc) {
+  return __builtin_s390_vfenezhs((vector unsigned short)__a,
+                                 (vector unsigned short)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpne_or_0_idx_cc(vector unsigned short __a, vector unsigned short __b,
+                      int *__cc) {
+  return __builtin_s390_vfenezhs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_cmpne_or_0_idx_cc(vector signed int __a, vector signed int __b, int *__cc) {
+  return (vector signed int)
+    __builtin_s390_vfenezfs((vector unsigned int)__a,
+                            (vector unsigned int)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpne_or_0_idx_cc(vector bool int __a, vector bool int __b, int *__cc) {
+  return __builtin_s390_vfenezfs((vector unsigned int)__a,
+                                 (vector unsigned int)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpne_or_0_idx_cc(vector unsigned int __a, vector unsigned int __b,
+                      int *__cc) {
+  return __builtin_s390_vfenezfs(__a, __b, __cc);
+}
+
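+// For the __builtin_s390_vstrc* builtins below, the integer constant selects
+// the operation: 4 returns a boolean mask of in-range elements, 0 returns the
+// index of the first in-range element, and 12 and 8 are the corresponding
+// inverted (not-in-range) mask and index forms; the *s variants also set
+// *__cc.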
+/*-- vec_cmprg --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmprg(vector unsigned char __a, vector unsigned char __b,
+          vector unsigned char __c) {
+  return (vector bool char)__builtin_s390_vstrcb(__a, __b, __c, 4);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmprg(vector unsigned short __a, vector unsigned short __b,
+          vector unsigned short __c) {
+  return (vector bool short)__builtin_s390_vstrch(__a, __b, __c, 4);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmprg(vector unsigned int __a, vector unsigned int __b,
+          vector unsigned int __c) {
+  return (vector bool int)__builtin_s390_vstrcf(__a, __b, __c, 4);
+}
+
+/*-- vec_cmprg_cc -----------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmprg_cc(vector unsigned char __a, vector unsigned char __b,
+             vector unsigned char __c, int *__cc) {
+  return (vector bool char)__builtin_s390_vstrcbs(__a, __b, __c, 4, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmprg_cc(vector unsigned short __a, vector unsigned short __b,
+             vector unsigned short __c, int *__cc) {
+  return (vector bool short)__builtin_s390_vstrchs(__a, __b, __c, 4, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmprg_cc(vector unsigned int __a, vector unsigned int __b,
+             vector unsigned int __c, int *__cc) {
+  return (vector bool int)__builtin_s390_vstrcfs(__a, __b, __c, 4, __cc);
+}
+
+/*-- vec_cmprg_idx ----------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmprg_idx(vector unsigned char __a, vector unsigned char __b,
+              vector unsigned char __c) {
+  return __builtin_s390_vstrcb(__a, __b, __c, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmprg_idx(vector unsigned short __a, vector unsigned short __b,
+              vector unsigned short __c) {
+  return __builtin_s390_vstrch(__a, __b, __c, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmprg_idx(vector unsigned int __a, vector unsigned int __b,
+              vector unsigned int __c) {
+  return __builtin_s390_vstrcf(__a, __b, __c, 0);
+}
+
+/*-- vec_cmprg_idx_cc -------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmprg_idx_cc(vector unsigned char __a, vector unsigned char __b,
+                 vector unsigned char __c, int *__cc) {
+  return __builtin_s390_vstrcbs(__a, __b, __c, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmprg_idx_cc(vector unsigned short __a, vector unsigned short __b,
+                 vector unsigned short __c, int *__cc) {
+  return __builtin_s390_vstrchs(__a, __b, __c, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmprg_idx_cc(vector unsigned int __a, vector unsigned int __b,
+                 vector unsigned int __c, int *__cc) {
+  return __builtin_s390_vstrcfs(__a, __b, __c, 0, __cc);
+}
+
+/*-- vec_cmprg_or_0_idx -----------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmprg_or_0_idx(vector unsigned char __a, vector unsigned char __b,
+                   vector unsigned char __c) {
+  return __builtin_s390_vstrczb(__a, __b, __c, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmprg_or_0_idx(vector unsigned short __a, vector unsigned short __b,
+                   vector unsigned short __c) {
+  return __builtin_s390_vstrczh(__a, __b, __c, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmprg_or_0_idx(vector unsigned int __a, vector unsigned int __b,
+                   vector unsigned int __c) {
+  return __builtin_s390_vstrczf(__a, __b, __c, 0);
+}
+
+/*-- vec_cmprg_or_0_idx_cc --------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmprg_or_0_idx_cc(vector unsigned char __a, vector unsigned char __b,
+                      vector unsigned char __c, int *__cc) {
+  return __builtin_s390_vstrczbs(__a, __b, __c, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmprg_or_0_idx_cc(vector unsigned short __a, vector unsigned short __b,
+                      vector unsigned short __c, int *__cc) {
+  return __builtin_s390_vstrczhs(__a, __b, __c, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmprg_or_0_idx_cc(vector unsigned int __a, vector unsigned int __b,
+                      vector unsigned int __c, int *__cc) {
+  return __builtin_s390_vstrczfs(__a, __b, __c, 0, __cc);
+}
+
+/*-- vec_cmpnrg -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmpnrg(vector unsigned char __a, vector unsigned char __b,
+           vector unsigned char __c) {
+  return (vector bool char)__builtin_s390_vstrcb(__a, __b, __c, 12);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmpnrg(vector unsigned short __a, vector unsigned short __b,
+           vector unsigned short __c) {
+  return (vector bool short)__builtin_s390_vstrch(__a, __b, __c, 12);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmpnrg(vector unsigned int __a, vector unsigned int __b,
+           vector unsigned int __c) {
+  return (vector bool int)__builtin_s390_vstrcf(__a, __b, __c, 12);
+}
+
+/*-- vec_cmpnrg_cc ----------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmpnrg_cc(vector unsigned char __a, vector unsigned char __b,
+              vector unsigned char __c, int *__cc) {
+  return (vector bool char)__builtin_s390_vstrcbs(__a, __b, __c, 12, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmpnrg_cc(vector unsigned short __a, vector unsigned short __b,
+              vector unsigned short __c, int *__cc) {
+  return (vector bool short)__builtin_s390_vstrchs(__a, __b, __c, 12, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmpnrg_cc(vector unsigned int __a, vector unsigned int __b,
+              vector unsigned int __c, int *__cc) {
+  return (vector bool int)__builtin_s390_vstrcfs(__a, __b, __c, 12, __cc);
+}
+
+/*-- vec_cmpnrg_idx ---------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpnrg_idx(vector unsigned char __a, vector unsigned char __b,
+               vector unsigned char __c) {
+  return __builtin_s390_vstrcb(__a, __b, __c, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpnrg_idx(vector unsigned short __a, vector unsigned short __b,
+               vector unsigned short __c) {
+  return __builtin_s390_vstrch(__a, __b, __c, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpnrg_idx(vector unsigned int __a, vector unsigned int __b,
+               vector unsigned int __c) {
+  return __builtin_s390_vstrcf(__a, __b, __c, 8);
+}
+
+/*-- vec_cmpnrg_idx_cc ------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpnrg_idx_cc(vector unsigned char __a, vector unsigned char __b,
+                  vector unsigned char __c, int *__cc) {
+  return __builtin_s390_vstrcbs(__a, __b, __c, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpnrg_idx_cc(vector unsigned short __a, vector unsigned short __b,
+                  vector unsigned short __c, int *__cc) {
+  return __builtin_s390_vstrchs(__a, __b, __c, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpnrg_idx_cc(vector unsigned int __a, vector unsigned int __b,
+                  vector unsigned int __c, int *__cc) {
+  return __builtin_s390_vstrcfs(__a, __b, __c, 8, __cc);
+}
+
+/*-- vec_cmpnrg_or_0_idx ----------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpnrg_or_0_idx(vector unsigned char __a, vector unsigned char __b,
+                    vector unsigned char __c) {
+  return __builtin_s390_vstrczb(__a, __b, __c, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpnrg_or_0_idx(vector unsigned short __a, vector unsigned short __b,
+                    vector unsigned short __c) {
+  return __builtin_s390_vstrczh(__a, __b, __c, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpnrg_or_0_idx(vector unsigned int __a, vector unsigned int __b,
+                    vector unsigned int __c) {
+  return __builtin_s390_vstrczf(__a, __b, __c, 8);
+}
+
+/*-- vec_cmpnrg_or_0_idx_cc -------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpnrg_or_0_idx_cc(vector unsigned char __a, vector unsigned char __b,
+                       vector unsigned char __c, int *__cc) {
+  return __builtin_s390_vstrczbs(__a, __b, __c, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpnrg_or_0_idx_cc(vector unsigned short __a, vector unsigned short __b,
+                       vector unsigned short __c, int *__cc) {
+  return __builtin_s390_vstrczhs(__a, __b, __c, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpnrg_or_0_idx_cc(vector unsigned int __a, vector unsigned int __b,
+                       vector unsigned int __c, int *__cc) {
+  return __builtin_s390_vstrczfs(__a, __b, __c, 8, __cc);
+}
+
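+// The same constant encoding is used for the __builtin_s390_vfae* builtins
+// below: 4 returns a boolean mask of elements of __a that match any element
+// of __b, 0 returns the index of the first such element, and 12 and 8 are the
+// negated (no-match) mask and index forms.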
+/*-- vec_find_any_eq --------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_find_any_eq(vector signed char __a, vector signed char __b) {
+  return (vector bool char)
+    __builtin_s390_vfaeb((vector unsigned char)__a,
+                         (vector unsigned char)__b, 4);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_find_any_eq(vector bool char __a, vector bool char __b) {
+  return (vector bool char)
+    __builtin_s390_vfaeb((vector unsigned char)__a,
+                         (vector unsigned char)__b, 4);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_find_any_eq(vector unsigned char __a, vector unsigned char __b) {
+  return (vector bool char)__builtin_s390_vfaeb(__a, __b, 4);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_find_any_eq(vector signed short __a, vector signed short __b) {
+  return (vector bool short)
+    __builtin_s390_vfaeh((vector unsigned short)__a,
+                         (vector unsigned short)__b, 4);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_find_any_eq(vector bool short __a, vector bool short __b) {
+  return (vector bool short)
+    __builtin_s390_vfaeh((vector unsigned short)__a,
+                         (vector unsigned short)__b, 4);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_find_any_eq(vector unsigned short __a, vector unsigned short __b) {
+  return (vector bool short)__builtin_s390_vfaeh(__a, __b, 4);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_find_any_eq(vector signed int __a, vector signed int __b) {
+  return (vector bool int)
+    __builtin_s390_vfaef((vector unsigned int)__a,
+                         (vector unsigned int)__b, 4);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_find_any_eq(vector bool int __a, vector bool int __b) {
+  return (vector bool int)
+    __builtin_s390_vfaef((vector unsigned int)__a,
+                         (vector unsigned int)__b, 4);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_find_any_eq(vector unsigned int __a, vector unsigned int __b) {
+  return (vector bool int)__builtin_s390_vfaef(__a, __b, 4);
+}
+
+/*-- vec_find_any_eq_cc -----------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_find_any_eq_cc(vector signed char __a, vector signed char __b, int *__cc) {
+  return (vector bool char)
+    __builtin_s390_vfaebs((vector unsigned char)__a,
+                          (vector unsigned char)__b, 4, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_find_any_eq_cc(vector bool char __a, vector bool char __b, int *__cc) {
+  return (vector bool char)
+    __builtin_s390_vfaebs((vector unsigned char)__a,
+                          (vector unsigned char)__b, 4, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_find_any_eq_cc(vector unsigned char __a, vector unsigned char __b,
+                   int *__cc) {
+  return (vector bool char)__builtin_s390_vfaebs(__a, __b, 4, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_find_any_eq_cc(vector signed short __a, vector signed short __b,
+                   int *__cc) {
+  return (vector bool short)
+    __builtin_s390_vfaehs((vector unsigned short)__a,
+                          (vector unsigned short)__b, 4, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_find_any_eq_cc(vector bool short __a, vector bool short __b, int *__cc) {
+  return (vector bool short)
+    __builtin_s390_vfaehs((vector unsigned short)__a,
+                          (vector unsigned short)__b, 4, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_find_any_eq_cc(vector unsigned short __a, vector unsigned short __b,
+                   int *__cc) {
+  return (vector bool short)__builtin_s390_vfaehs(__a, __b, 4, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_find_any_eq_cc(vector signed int __a, vector signed int __b, int *__cc) {
+  return (vector bool int)
+    __builtin_s390_vfaefs((vector unsigned int)__a,
+                          (vector unsigned int)__b, 4, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_find_any_eq_cc(vector bool int __a, vector bool int __b, int *__cc) {
+  return (vector bool int)
+    __builtin_s390_vfaefs((vector unsigned int)__a,
+                          (vector unsigned int)__b, 4, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_find_any_eq_cc(vector unsigned int __a, vector unsigned int __b,
+                   int *__cc) {
+  return (vector bool int)__builtin_s390_vfaefs(__a, __b, 4, __cc);
+}
+
+/*-- vec_find_any_eq_idx ----------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_find_any_eq_idx(vector signed char __a, vector signed char __b) {
+  return (vector signed char)
+    __builtin_s390_vfaeb((vector unsigned char)__a,
+                         (vector unsigned char)__b, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_eq_idx(vector bool char __a, vector bool char __b) {
+  return __builtin_s390_vfaeb((vector unsigned char)__a,
+                              (vector unsigned char)__b, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_eq_idx(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vfaeb(__a, __b, 0);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_find_any_eq_idx(vector signed short __a, vector signed short __b) {
+  return (vector signed short)
+    __builtin_s390_vfaeh((vector unsigned short)__a,
+                         (vector unsigned short)__b, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_eq_idx(vector bool short __a, vector bool short __b) {
+  return __builtin_s390_vfaeh((vector unsigned short)__a,
+                              (vector unsigned short)__b, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_eq_idx(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vfaeh(__a, __b, 0);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_find_any_eq_idx(vector signed int __a, vector signed int __b) {
+  return (vector signed int)
+    __builtin_s390_vfaef((vector unsigned int)__a,
+                         (vector unsigned int)__b, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_eq_idx(vector bool int __a, vector bool int __b) {
+  return __builtin_s390_vfaef((vector unsigned int)__a,
+                              (vector unsigned int)__b, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_eq_idx(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vfaef(__a, __b, 0);
+}
+
+/*-- vec_find_any_eq_idx_cc -------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_find_any_eq_idx_cc(vector signed char __a, vector signed char __b,
+                       int *__cc) {
+  return (vector signed char)
+    __builtin_s390_vfaebs((vector unsigned char)__a,
+                          (vector unsigned char)__b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_eq_idx_cc(vector bool char __a, vector bool char __b, int *__cc) {
+  return __builtin_s390_vfaebs((vector unsigned char)__a,
+                               (vector unsigned char)__b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_eq_idx_cc(vector unsigned char __a, vector unsigned char __b,
+                       int *__cc) {
+  return __builtin_s390_vfaebs(__a, __b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_find_any_eq_idx_cc(vector signed short __a, vector signed short __b,
+                       int *__cc) {
+  return (vector signed short)
+    __builtin_s390_vfaehs((vector unsigned short)__a,
+                          (vector unsigned short)__b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_eq_idx_cc(vector bool short __a, vector bool short __b,
+                       int *__cc) {
+  return __builtin_s390_vfaehs((vector unsigned short)__a,
+                               (vector unsigned short)__b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_eq_idx_cc(vector unsigned short __a, vector unsigned short __b,
+                       int *__cc) {
+  return __builtin_s390_vfaehs(__a, __b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_find_any_eq_idx_cc(vector signed int __a, vector signed int __b,
+                       int *__cc) {
+  return (vector signed int)
+    __builtin_s390_vfaefs((vector unsigned int)__a,
+                          (vector unsigned int)__b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_eq_idx_cc(vector bool int __a, vector bool int __b, int *__cc) {
+  return __builtin_s390_vfaefs((vector unsigned int)__a,
+                               (vector unsigned int)__b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_eq_idx_cc(vector unsigned int __a, vector unsigned int __b,
+                       int *__cc) {
+  return __builtin_s390_vfaefs(__a, __b, 0, __cc);
+}
+
+/*-- vec_find_any_eq_or_0_idx -----------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_find_any_eq_or_0_idx(vector signed char __a, vector signed char __b) {
+  return (vector signed char)
+    __builtin_s390_vfaezb((vector unsigned char)__a,
+                          (vector unsigned char)__b, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_eq_or_0_idx(vector bool char __a, vector bool char __b) {
+  return __builtin_s390_vfaezb((vector unsigned char)__a,
+                               (vector unsigned char)__b, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_eq_or_0_idx(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vfaezb(__a, __b, 0);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_find_any_eq_or_0_idx(vector signed short __a, vector signed short __b) {
+  return (vector signed short)
+    __builtin_s390_vfaezh((vector unsigned short)__a,
+                          (vector unsigned short)__b, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_eq_or_0_idx(vector bool short __a, vector bool short __b) {
+  return __builtin_s390_vfaezh((vector unsigned short)__a,
+                               (vector unsigned short)__b, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_eq_or_0_idx(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vfaezh(__a, __b, 0);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_find_any_eq_or_0_idx(vector signed int __a, vector signed int __b) {
+  return (vector signed int)
+    __builtin_s390_vfaezf((vector unsigned int)__a,
+                          (vector unsigned int)__b, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_eq_or_0_idx(vector bool int __a, vector bool int __b) {
+  return __builtin_s390_vfaezf((vector unsigned int)__a,
+                               (vector unsigned int)__b, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_eq_or_0_idx(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vfaezf(__a, __b, 0);
+}
+
+/*-- vec_find_any_eq_or_0_idx_cc --------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_find_any_eq_or_0_idx_cc(vector signed char __a, vector signed char __b,
+                            int *__cc) {
+  return (vector signed char)
+    __builtin_s390_vfaezbs((vector unsigned char)__a,
+                           (vector unsigned char)__b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_eq_or_0_idx_cc(vector bool char __a, vector bool char __b,
+                            int *__cc) {
+  return __builtin_s390_vfaezbs((vector unsigned char)__a,
+                                (vector unsigned char)__b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_eq_or_0_idx_cc(vector unsigned char __a, vector unsigned char __b,
+                            int *__cc) {
+  return __builtin_s390_vfaezbs(__a, __b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_find_any_eq_or_0_idx_cc(vector signed short __a, vector signed short __b,
+                            int *__cc) {
+  return (vector signed short)
+    __builtin_s390_vfaezhs((vector unsigned short)__a,
+                           (vector unsigned short)__b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_eq_or_0_idx_cc(vector bool short __a, vector bool short __b,
+                            int *__cc) {
+  return __builtin_s390_vfaezhs((vector unsigned short)__a,
+                                (vector unsigned short)__b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_eq_or_0_idx_cc(vector unsigned short __a,
+                            vector unsigned short __b, int *__cc) {
+  return __builtin_s390_vfaezhs(__a, __b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_find_any_eq_or_0_idx_cc(vector signed int __a, vector signed int __b,
+                            int *__cc) {
+  return (vector signed int)
+    __builtin_s390_vfaezfs((vector unsigned int)__a,
+                           (vector unsigned int)__b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_eq_or_0_idx_cc(vector bool int __a, vector bool int __b,
+                            int *__cc) {
+  return __builtin_s390_vfaezfs((vector unsigned int)__a,
+                                (vector unsigned int)__b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_eq_or_0_idx_cc(vector unsigned int __a, vector unsigned int __b,
+                            int *__cc) {
+  return __builtin_s390_vfaezfs(__a, __b, 0, __cc);
+}
+
+/*-- vec_find_any_ne --------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_find_any_ne(vector signed char __a, vector signed char __b) {
+  return (vector bool char)
+    __builtin_s390_vfaeb((vector unsigned char)__a,
+                         (vector unsigned char)__b, 12);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_find_any_ne(vector bool char __a, vector bool char __b) {
+  return (vector bool char)
+    __builtin_s390_vfaeb((vector unsigned char)__a,
+                         (vector unsigned char)__b, 12);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_find_any_ne(vector unsigned char __a, vector unsigned char __b) {
+  return (vector bool char)__builtin_s390_vfaeb(__a, __b, 12);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_find_any_ne(vector signed short __a, vector signed short __b) {
+  return (vector bool short)
+    __builtin_s390_vfaeh((vector unsigned short)__a,
+                         (vector unsigned short)__b, 12);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_find_any_ne(vector bool short __a, vector bool short __b) {
+  return (vector bool short)
+    __builtin_s390_vfaeh((vector unsigned short)__a,
+                         (vector unsigned short)__b, 12);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_find_any_ne(vector unsigned short __a, vector unsigned short __b) {
+  return (vector bool short)__builtin_s390_vfaeh(__a, __b, 12);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_find_any_ne(vector signed int __a, vector signed int __b) {
+  return (vector bool int)
+    __builtin_s390_vfaef((vector unsigned int)__a,
+                         (vector unsigned int)__b, 12);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_find_any_ne(vector bool int __a, vector bool int __b) {
+  return (vector bool int)
+    __builtin_s390_vfaef((vector unsigned int)__a,
+                         (vector unsigned int)__b, 12);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_find_any_ne(vector unsigned int __a, vector unsigned int __b) {
+  return (vector bool int)__builtin_s390_vfaef(__a, __b, 12);
+}
+
+/*-- vec_find_any_ne_cc -----------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_find_any_ne_cc(vector signed char __a, vector signed char __b, int *__cc) {
+  return (vector bool char)
+    __builtin_s390_vfaebs((vector unsigned char)__a,
+                          (vector unsigned char)__b, 12, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_find_any_ne_cc(vector bool char __a, vector bool char __b, int *__cc) {
+  return (vector bool char)
+    __builtin_s390_vfaebs((vector unsigned char)__a,
+                          (vector unsigned char)__b, 12, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_find_any_ne_cc(vector unsigned char __a, vector unsigned char __b,
+                   int *__cc) {
+  return (vector bool char)__builtin_s390_vfaebs(__a, __b, 12, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_find_any_ne_cc(vector signed short __a, vector signed short __b,
+                   int *__cc) {
+  return (vector bool short)
+    __builtin_s390_vfaehs((vector unsigned short)__a,
+                          (vector unsigned short)__b, 12, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_find_any_ne_cc(vector bool short __a, vector bool short __b, int *__cc) {
+  return (vector bool short)
+    __builtin_s390_vfaehs((vector unsigned short)__a,
+                          (vector unsigned short)__b, 12, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_find_any_ne_cc(vector unsigned short __a, vector unsigned short __b,
+                   int *__cc) {
+  return (vector bool short)__builtin_s390_vfaehs(__a, __b, 12, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_find_any_ne_cc(vector signed int __a, vector signed int __b, int *__cc) {
+  return (vector bool int)
+    __builtin_s390_vfaefs((vector unsigned int)__a,
+                          (vector unsigned int)__b, 12, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_find_any_ne_cc(vector bool int __a, vector bool int __b, int *__cc) {
+  return (vector bool int)
+    __builtin_s390_vfaefs((vector unsigned int)__a,
+                          (vector unsigned int)__b, 12, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_find_any_ne_cc(vector unsigned int __a, vector unsigned int __b,
+                   int *__cc) {
+  return (vector bool int)__builtin_s390_vfaefs(__a, __b, 12, __cc);
+}
+
+/*-- vec_find_any_ne_idx ----------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_find_any_ne_idx(vector signed char __a, vector signed char __b) {
+  return (vector signed char)
+    __builtin_s390_vfaeb((vector unsigned char)__a,
+                         (vector unsigned char)__b, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_ne_idx(vector bool char __a, vector bool char __b) {
+  return __builtin_s390_vfaeb((vector unsigned char)__a,
+                              (vector unsigned char)__b, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_ne_idx(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vfaeb(__a, __b, 8);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_find_any_ne_idx(vector signed short __a, vector signed short __b) {
+  return (vector signed short)
+    __builtin_s390_vfaeh((vector unsigned short)__a,
+                         (vector unsigned short)__b, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_ne_idx(vector bool short __a, vector bool short __b) {
+  return __builtin_s390_vfaeh((vector unsigned short)__a,
+                              (vector unsigned short)__b, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_ne_idx(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vfaeh(__a, __b, 8);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_find_any_ne_idx(vector signed int __a, vector signed int __b) {
+  return (vector signed int)
+    __builtin_s390_vfaef((vector unsigned int)__a,
+                         (vector unsigned int)__b, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_ne_idx(vector bool int __a, vector bool int __b) {
+  return __builtin_s390_vfaef((vector unsigned int)__a,
+                              (vector unsigned int)__b, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_ne_idx(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vfaef(__a, __b, 8);
+}
+
+/*-- vec_find_any_ne_idx_cc -------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_find_any_ne_idx_cc(vector signed char __a, vector signed char __b,
+                       int *__cc) {
+  return (vector signed char)
+    __builtin_s390_vfaebs((vector unsigned char)__a,
+                          (vector unsigned char)__b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_ne_idx_cc(vector bool char __a, vector bool char __b, int *__cc) {
+  return __builtin_s390_vfaebs((vector unsigned char)__a,
+                               (vector unsigned char)__b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_ne_idx_cc(vector unsigned char __a, vector unsigned char __b,
+                       int *__cc) {
+  return __builtin_s390_vfaebs(__a, __b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_find_any_ne_idx_cc(vector signed short __a, vector signed short __b,
+                       int *__cc) {
+  return (vector signed short)
+    __builtin_s390_vfaehs((vector unsigned short)__a,
+                          (vector unsigned short)__b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_ne_idx_cc(vector bool short __a, vector bool short __b,
+                       int *__cc) {
+  return __builtin_s390_vfaehs((vector unsigned short)__a,
+                               (vector unsigned short)__b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_ne_idx_cc(vector unsigned short __a, vector unsigned short __b,
+                       int *__cc) {
+  return __builtin_s390_vfaehs(__a, __b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_find_any_ne_idx_cc(vector signed int __a, vector signed int __b,
+                       int *__cc) {
+  return (vector signed int)
+    __builtin_s390_vfaefs((vector unsigned int)__a,
+                          (vector unsigned int)__b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_ne_idx_cc(vector bool int __a, vector bool int __b, int *__cc) {
+  return __builtin_s390_vfaefs((vector unsigned int)__a,
+                               (vector unsigned int)__b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_ne_idx_cc(vector unsigned int __a, vector unsigned int __b,
+                       int *__cc) {
+  return __builtin_s390_vfaefs(__a, __b, 8, __cc);
+}
+
+/*-- vec_find_any_ne_or_0_idx -----------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_find_any_ne_or_0_idx(vector signed char __a, vector signed char __b) {
+  return (vector signed char)
+    __builtin_s390_vfaezb((vector unsigned char)__a,
+                          (vector unsigned char)__b, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_ne_or_0_idx(vector bool char __a, vector bool char __b) {
+  return __builtin_s390_vfaezb((vector unsigned char)__a,
+                               (vector unsigned char)__b, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_ne_or_0_idx(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vfaezb(__a, __b, 8);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_find_any_ne_or_0_idx(vector signed short __a, vector signed short __b) {
+  return (vector signed short)
+    __builtin_s390_vfaezh((vector unsigned short)__a,
+                          (vector unsigned short)__b, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_ne_or_0_idx(vector bool short __a, vector bool short __b) {
+  return __builtin_s390_vfaezh((vector unsigned short)__a,
+                               (vector unsigned short)__b, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_ne_or_0_idx(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vfaezh(__a, __b, 8);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_find_any_ne_or_0_idx(vector signed int __a, vector signed int __b) {
+  return (vector signed int)
+    __builtin_s390_vfaezf((vector unsigned int)__a,
+                          (vector unsigned int)__b, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_ne_or_0_idx(vector bool int __a, vector bool int __b) {
+  return __builtin_s390_vfaezf((vector unsigned int)__a,
+                               (vector unsigned int)__b, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_ne_or_0_idx(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vfaezf(__a, __b, 8);
+}
+
+/*-- vec_find_any_ne_or_0_idx_cc --------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_find_any_ne_or_0_idx_cc(vector signed char __a, vector signed char __b,
+                            int *__cc) {
+  return (vector signed char)
+    __builtin_s390_vfaezbs((vector unsigned char)__a,
+                           (vector unsigned char)__b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_ne_or_0_idx_cc(vector bool char __a, vector bool char __b,
+                            int *__cc) {
+  return __builtin_s390_vfaezbs((vector unsigned char)__a,
+                                (vector unsigned char)__b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_ne_or_0_idx_cc(vector unsigned char __a, vector unsigned char __b,
+                            int *__cc) {
+  return __builtin_s390_vfaezbs(__a, __b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_find_any_ne_or_0_idx_cc(vector signed short __a, vector signed short __b,
+                            int *__cc) {
+  return (vector signed short)
+    __builtin_s390_vfaezhs((vector unsigned short)__a,
+                           (vector unsigned short)__b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_ne_or_0_idx_cc(vector bool short __a, vector bool short __b,
+                            int *__cc) {
+  return __builtin_s390_vfaezhs((vector unsigned short)__a,
+                                (vector unsigned short)__b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_ne_or_0_idx_cc(vector unsigned short __a,
+                            vector unsigned short __b, int *__cc) {
+  return __builtin_s390_vfaezhs(__a, __b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_find_any_ne_or_0_idx_cc(vector signed int __a, vector signed int __b,
+                            int *__cc) {
+  return (vector signed int)
+    __builtin_s390_vfaezfs((vector unsigned int)__a,
+                           (vector unsigned int)__b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_ne_or_0_idx_cc(vector bool int __a, vector bool int __b,
+                            int *__cc) {
+  return __builtin_s390_vfaezfs((vector unsigned int)__a,
+                                (vector unsigned int)__b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_ne_or_0_idx_cc(vector unsigned int __a, vector unsigned int __b,
+                            int *__cc) {
+  return __builtin_s390_vfaezfs(__a, __b, 8, __cc);
+}
+
+#undef __constant_pow2_range
+#undef __constant_range
+#undef __constant
+#undef __ATTRS_o
+#undef __ATTRS_o_ai
+#undef __ATTRS_ai
+
+#else
+
+#error "Use -fzvector to enable vector extensions"
+
+#endif
diff --git a/25.0.2/clang-include/wmmintrin.h b/25.0.2/clang-include/wmmintrin.h
new file mode 100644
index 0000000..a2d9310
--- /dev/null
+++ b/25.0.2/clang-include/wmmintrin.h
@@ -0,0 +1,33 @@
+/*===---- wmmintrin.h - AES intrinsics ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef _WMMINTRIN_H
+#define _WMMINTRIN_H
+
+#include <emmintrin.h>
+
+#include <__wmmintrin_aes.h>
+
+#include <__wmmintrin_pclmul.h>
+
+#endif /* _WMMINTRIN_H */
diff --git a/25.0.2/clang-include/x86intrin.h b/25.0.2/clang-include/x86intrin.h
new file mode 100644
index 0000000..4d8077e
--- /dev/null
+++ b/25.0.2/clang-include/x86intrin.h
@@ -0,0 +1,57 @@
+/*===---- x86intrin.h - X86 intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __X86INTRIN_H
+#define __X86INTRIN_H
+
+#include <ia32intrin.h>
+
+#include <immintrin.h>
+
+#include <mm3dnow.h>
+
+#include <bmiintrin.h>
+
+#include <bmi2intrin.h>
+
+#include <lzcntintrin.h>
+
+#include <popcntintrin.h>
+
+#include <rdseedintrin.h>
+
+#include <prfchwintrin.h>
+
+#include <ammintrin.h>
+
+#include <fma4intrin.h>
+
+#include <xopintrin.h>
+
+#include <tbmintrin.h>
+
+#include <f16cintrin.h>
+
+/* FIXME: LWP */
+
+#endif /* __X86INTRIN_H */
diff --git a/25.0.2/clang-include/xmmintrin.h b/25.0.2/clang-include/xmmintrin.h
new file mode 100644
index 0000000..ae0b2cd
--- /dev/null
+++ b/25.0.2/clang-include/xmmintrin.h
@@ -0,0 +1,1010 @@
+/*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __XMMINTRIN_H
+#define __XMMINTRIN_H
+
+#include <mmintrin.h>
+
+typedef int __v4si __attribute__((__vector_size__(16)));
+typedef float __v4sf __attribute__((__vector_size__(16)));
+typedef float __m128 __attribute__((__vector_size__(16)));
+
+/* This header should only be included in a hosted environment as it depends on
+ * a standard library to provide allocation routines. */
+#if __STDC_HOSTED__
+#include <mm_malloc.h>
+#endif
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse")))
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_add_ss(__m128 __a, __m128 __b)
+{
+  __a[0] += __b[0];
+  return __a;
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_add_ps(__m128 __a, __m128 __b)
+{
+  return __a + __b;
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_sub_ss(__m128 __a, __m128 __b)
+{
+  __a[0] -= __b[0];
+  return __a;
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_sub_ps(__m128 __a, __m128 __b)
+{
+  return __a - __b;
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mul_ss(__m128 __a, __m128 __b)
+{
+  __a[0] *= __b[0];
+  return __a;
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mul_ps(__m128 __a, __m128 __b)
+{
+  return __a * __b;
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_div_ss(__m128 __a, __m128 __b)
+{
+  __a[0] /= __b[0];
+  return __a;
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_div_ps(__m128 __a, __m128 __b)
+{
+  return __a / __b;
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_sqrt_ss(__m128 __a)
+{
+  __m128 __c = __builtin_ia32_sqrtss(__a);
+  return (__m128) { __c[0], __a[1], __a[2], __a[3] };
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_sqrt_ps(__m128 __a)
+{
+  return __builtin_ia32_sqrtps(__a);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_rcp_ss(__m128 __a)
+{
+  __m128 __c = __builtin_ia32_rcpss(__a);
+  return (__m128) { __c[0], __a[1], __a[2], __a[3] };
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_rcp_ps(__m128 __a)
+{
+  return __builtin_ia32_rcpps(__a);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_rsqrt_ss(__m128 __a)
+{
+  __m128 __c = __builtin_ia32_rsqrtss(__a);
+  return (__m128) { __c[0], __a[1], __a[2], __a[3] };
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_rsqrt_ps(__m128 __a)
+{
+  return __builtin_ia32_rsqrtps(__a);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_min_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_minss(__a, __b);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_min_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_minps(__a, __b);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_max_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_maxss(__a, __b);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_max_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_maxps(__a, __b);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_and_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)((__v4si)__a & (__v4si)__b);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_andnot_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)(~(__v4si)__a & (__v4si)__b);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_or_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)((__v4si)__a | (__v4si)__b);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_xor_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)((__v4si)__a ^ (__v4si)__b);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpeq_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpeqss(__a, __b);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpeq_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpeqps(__a, __b);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmplt_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpltss(__a, __b);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmplt_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpltps(__a, __b);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmple_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpless(__a, __b);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmple_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpleps(__a, __b);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpgt_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_shufflevector(__a,
+                                         __builtin_ia32_cmpltss(__b, __a),
+                                         4, 1, 2, 3);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpgt_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpltps(__b, __a);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpge_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_shufflevector(__a,
+                                         __builtin_ia32_cmpless(__b, __a),
+                                         4, 1, 2, 3);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpge_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpleps(__b, __a);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpneq_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpneqss(__a, __b);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpneq_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpneqps(__a, __b);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpnlt_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpnltss(__a, __b);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpnlt_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpnltps(__a, __b);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpnle_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpnless(__a, __b);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpnle_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpnleps(__a, __b);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpngt_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_shufflevector(__a,
+                                         __builtin_ia32_cmpnltss(__b, __a),
+                                         4, 1, 2, 3);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpngt_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpnltps(__b, __a);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpnge_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_shufflevector(__a,
+                                         __builtin_ia32_cmpnless(__b, __a),
+                                         4, 1, 2, 3);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpnge_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpnleps(__b, __a);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpord_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpordss(__a, __b);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpord_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpordps(__a, __b);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpunord_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpunordss(__a, __b);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpunord_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpunordps(__a, __b);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_comieq_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_comieq(__a, __b);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_comilt_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_comilt(__a, __b);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_comile_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_comile(__a, __b);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_comigt_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_comigt(__a, __b);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_comige_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_comige(__a, __b);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_comineq_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_comineq(__a, __b);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_ucomieq_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_ucomieq(__a, __b);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_ucomilt_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_ucomilt(__a, __b);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_ucomile_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_ucomile(__a, __b);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_ucomigt_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_ucomigt(__a, __b);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_ucomige_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_ucomige(__a, __b);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_ucomineq_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_ucomineq(__a, __b);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_cvtss_si32(__m128 __a)
+{
+  return __builtin_ia32_cvtss2si(__a);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_cvt_ss2si(__m128 __a)
+{
+  return _mm_cvtss_si32(__a);
+}
+
+#ifdef __x86_64__
+
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm_cvtss_si64(__m128 __a)
+{
+  return __builtin_ia32_cvtss2si64(__a);
+}
+
+#endif
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cvtps_pi32(__m128 __a)
+{
+  return (__m64)__builtin_ia32_cvtps2pi(__a);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cvt_ps2pi(__m128 __a)
+{
+  return _mm_cvtps_pi32(__a);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_cvttss_si32(__m128 __a)
+{
+  return __a[0];
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_cvtt_ss2si(__m128 __a)
+{
+  return _mm_cvttss_si32(__a);
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm_cvttss_si64(__m128 __a)
+{
+  return __a[0];
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cvttps_pi32(__m128 __a)
+{
+  return (__m64)__builtin_ia32_cvttps2pi(__a);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cvtt_ps2pi(__m128 __a)
+{
+  return _mm_cvttps_pi32(__a);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtsi32_ss(__m128 __a, int __b)
+{
+  __a[0] = __b;
+  return __a;
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvt_si2ss(__m128 __a, int __b)
+{
+  return _mm_cvtsi32_ss(__a, __b);
+}
+
+#ifdef __x86_64__
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtsi64_ss(__m128 __a, long long __b)
+{
+  __a[0] = __b;
+  return __a;
+}
+
+#endif
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtpi32_ps(__m128 __a, __m64 __b)
+{
+  return __builtin_ia32_cvtpi2ps(__a, (__v2si)__b);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvt_pi2ps(__m128 __a, __m64 __b)
+{
+  return _mm_cvtpi32_ps(__a, __b);
+}
+
+static __inline__ float __DEFAULT_FN_ATTRS
+_mm_cvtss_f32(__m128 __a)
+{
+  return __a[0];
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_loadh_pi(__m128 __a, const __m64 *__p)
+{
+  typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
+  struct __mm_loadh_pi_struct {
+    __mm_loadh_pi_v2f32 __u;
+  } __attribute__((__packed__, __may_alias__));
+  __mm_loadh_pi_v2f32 __b = ((struct __mm_loadh_pi_struct*)__p)->__u;
+  __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
+  return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_loadl_pi(__m128 __a, const __m64 *__p)
+{
+  typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
+  struct __mm_loadl_pi_struct {
+    __mm_loadl_pi_v2f32 __u;
+  } __attribute__((__packed__, __may_alias__));
+  __mm_loadl_pi_v2f32 __b = ((struct __mm_loadl_pi_struct*)__p)->__u;
+  __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
+  return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_load_ss(const float *__p)
+{
+  struct __mm_load_ss_struct {
+    float __u;
+  } __attribute__((__packed__, __may_alias__));
+  float __u = ((struct __mm_load_ss_struct*)__p)->__u;
+  return (__m128){ __u, 0, 0, 0 };
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_load1_ps(const float *__p)
+{
+  struct __mm_load1_ps_struct {
+    float __u;
+  } __attribute__((__packed__, __may_alias__));
+  float __u = ((struct __mm_load1_ps_struct*)__p)->__u;
+  return (__m128){ __u, __u, __u, __u };
+}
+
+#define        _mm_load_ps1(p) _mm_load1_ps(p)
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_load_ps(const float *__p)
+{
+  return *(__m128*)__p;
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_loadu_ps(const float *__p)
+{
+  struct __loadu_ps {
+    __m128 __v;
+  } __attribute__((__packed__, __may_alias__));
+  return ((struct __loadu_ps*)__p)->__v;
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_loadr_ps(const float *__p)
+{
+  __m128 __a = _mm_load_ps(__p);
+  return __builtin_shufflevector(__a, __a, 3, 2, 1, 0);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_undefined_ps()
+{
+  return (__m128)__builtin_ia32_undef128();
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_set_ss(float __w)
+{
+  return (__m128){ __w, 0, 0, 0 };
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_set1_ps(float __w)
+{
+  return (__m128){ __w, __w, __w, __w };
+}
+
+/* Microsoft specific. */
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_set_ps1(float __w)
+{
+    return _mm_set1_ps(__w);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_set_ps(float __z, float __y, float __x, float __w)
+{
+  return (__m128){ __w, __x, __y, __z };
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_setr_ps(float __z, float __y, float __x, float __w)
+{
+  return (__m128){ __z, __y, __x, __w };
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_setzero_ps(void)
+{
+  return (__m128){ 0, 0, 0, 0 };
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_storeh_pi(__m64 *__p, __m128 __a)
+{
+  __builtin_ia32_storehps((__v2si *)__p, __a);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_storel_pi(__m64 *__p, __m128 __a)
+{
+  __builtin_ia32_storelps((__v2si *)__p, __a);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_store_ss(float *__p, __m128 __a)
+{
+  struct __mm_store_ss_struct {
+    float __u;
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __mm_store_ss_struct*)__p)->__u = __a[0];
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_storeu_ps(float *__p, __m128 __a)
+{
+  __builtin_ia32_storeups(__p, __a);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_store1_ps(float *__p, __m128 __a)
+{
+  __a = __builtin_shufflevector(__a, __a, 0, 0, 0, 0);
+  _mm_storeu_ps(__p, __a);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_store_ps1(float *__p, __m128 __a)
+{
+    return _mm_store1_ps(__p, __a);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_store_ps(float *__p, __m128 __a)
+{
+  *(__m128 *)__p = __a;
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_storer_ps(float *__p, __m128 __a)
+{
+  __a = __builtin_shufflevector(__a, __a, 3, 2, 1, 0);
+  _mm_store_ps(__p, __a);
+}
+
+#define _MM_HINT_T0 3
+#define _MM_HINT_T1 2
+#define _MM_HINT_T2 1
+#define _MM_HINT_NTA 0
+
+#ifndef _MSC_VER
+/* FIXME: We have to #define this because "sel" must be a constant integer, and
+   Sema doesn't do any form of constant propagation yet. */
+
+#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), 0, (sel)))
+#endif
+
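+/* Illustrative usage (editor's sketch, not part of the upstream header):
+ * prefetch one cache line ahead of the element being read while summing an
+ * array. __example_prefetch_sum is a hypothetical helper, not a standard
+ * intrinsic. */
+static __inline__ float
+__example_prefetch_sum(const float *__p, int __n)
+{
+  float __s = 0.0f;
+  int __i;
+  for (__i = 0; __i < __n; ++__i) {
+    _mm_prefetch((const char *)(__p + __i) + 64, _MM_HINT_T0);
+    __s += __p[__i];
+  }
+  return __s;
+}
+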
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_stream_pi(__m64 *__p, __m64 __a)
+{
+  __builtin_ia32_movntq(__p, __a);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_stream_ps(float *__p, __m128 __a)
+{
+  __builtin_ia32_movntps(__p, __a);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_sfence(void)
+{
+  __builtin_ia32_sfence();
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_extract_pi16(__m64 __a, int __n)
+{
+  __v4hi __b = (__v4hi)__a;
+  return (unsigned short)__b[__n & 3];
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_insert_pi16(__m64 __a, int __d, int __n)
+{
+   __v4hi __b = (__v4hi)__a;
+   __b[__n & 3] = __d;
+   return (__m64)__b;
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_max_pi16(__m64 __a, __m64 __b)
+{
+  return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_max_pu8(__m64 __a, __m64 __b)
+{
+  return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_min_pi16(__m64 __a, __m64 __b)
+{
+  return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_min_pu8(__m64 __a, __m64 __b)
+{
+  return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_movemask_pi8(__m64 __a)
+{
+  return __builtin_ia32_pmovmskb((__v8qi)__a);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_mulhi_pu16(__m64 __a, __m64 __b)
+{
+  return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b);
+}
+
+#define _mm_shuffle_pi16(a, n) __extension__ ({ \
+  (__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)); })
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
+{
+  __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_avg_pu8(__m64 __a, __m64 __b)
+{
+  return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_avg_pu16(__m64 __a, __m64 __b)
+{
+  return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_sad_pu8(__m64 __a, __m64 __b)
+{
+  return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_mm_getcsr(void)
+{
+  return __builtin_ia32_stmxcsr();
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_setcsr(unsigned int __i)
+{
+  __builtin_ia32_ldmxcsr(__i);
+}
+
+#define _mm_shuffle_ps(a, b, mask) __extension__ ({ \
+  (__m128)__builtin_shufflevector((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \
+                                  (mask) & 0x3, ((mask) & 0xc) >> 2, \
+                                  (((mask) & 0x30) >> 4) + 4, \
+                                  (((mask) & 0xc0) >> 6) + 4); })
+
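+/* Illustrative usage (editor's sketch, not part of the upstream header): the
+ * 8-bit mask packs four 2-bit element indices; bits [1:0] and [3:2] pick
+ * result lanes 0 and 1 from the first operand, bits [5:4] and [7:6] pick
+ * lanes 2 and 3 from the second operand. With both operands equal and mask
+ * 0x1B (indices 3,2,1,0) the vector is reversed. __example_reverse_ps is a
+ * hypothetical helper, not a standard intrinsic. */
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+__example_reverse_ps(__m128 __v)
+{
+  return _mm_shuffle_ps(__v, __v, 0x1B);
+}
+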
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_unpackhi_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_shufflevector(__a, __b, 2, 6, 3, 7);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_unpacklo_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_shufflevector(__a, __b, 0, 4, 1, 5);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_move_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_shufflevector(__a, __b, 4, 1, 2, 3);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_movehl_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_shufflevector(__a, __b, 6, 7, 2, 3);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_movelh_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_shufflevector(__a, __b, 0, 1, 4, 5);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtpi16_ps(__m64 __a)
+{
+  __m64 __b, __c;
+  __m128 __r;
+
+  __b = _mm_setzero_si64();
+  __b = _mm_cmpgt_pi16(__b, __a);
+  __c = _mm_unpackhi_pi16(__a, __b);
+  __r = _mm_setzero_ps();
+  __r = _mm_cvtpi32_ps(__r, __c);
+  __r = _mm_movelh_ps(__r, __r);
+  __c = _mm_unpacklo_pi16(__a, __b);
+  __r = _mm_cvtpi32_ps(__r, __c);
+
+  return __r;
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtpu16_ps(__m64 __a)
+{
+  __m64 __b, __c;
+  __m128 __r;
+
+  __b = _mm_setzero_si64();
+  __c = _mm_unpackhi_pi16(__a, __b);
+  __r = _mm_setzero_ps();
+  __r = _mm_cvtpi32_ps(__r, __c);
+  __r = _mm_movelh_ps(__r, __r);
+  __c = _mm_unpacklo_pi16(__a, __b);
+  __r = _mm_cvtpi32_ps(__r, __c);
+
+  return __r;
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtpi8_ps(__m64 __a)
+{
+  __m64 __b;
+
+  __b = _mm_setzero_si64();
+  __b = _mm_cmpgt_pi8(__b, __a);
+  __b = _mm_unpacklo_pi8(__a, __b);
+
+  return _mm_cvtpi16_ps(__b);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtpu8_ps(__m64 __a)
+{
+  __m64 __b;
+
+  __b = _mm_setzero_si64();
+  __b = _mm_unpacklo_pi8(__a, __b);
+
+  return _mm_cvtpi16_ps(__b);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
+{
+  __m128 __c;
+
+  __c = _mm_setzero_ps();
+  __c = _mm_cvtpi32_ps(__c, __b);
+  __c = _mm_movelh_ps(__c, __c);
+
+  return _mm_cvtpi32_ps(__c, __a);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cvtps_pi16(__m128 __a)
+{
+  __m64 __b, __c;
+
+  __b = _mm_cvtps_pi32(__a);
+  __a = _mm_movehl_ps(__a, __a);
+  __c = _mm_cvtps_pi32(__a);
+
+  return _mm_packs_pi32(__b, __c);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cvtps_pi8(__m128 __a)
+{
+  __m64 __b, __c;
+
+  __b = _mm_cvtps_pi16(__a);
+  __c = _mm_setzero_si64();
+
+  return _mm_packs_pi16(__b, __c);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_movemask_ps(__m128 __a)
+{
+  return __builtin_ia32_movmskps(__a);
+}
+
+
+#ifdef _MSC_VER
+#define _MM_ALIGN16 __declspec(align(16))
+#endif
+
+#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
+
+#define _MM_EXCEPT_INVALID    (0x0001)
+#define _MM_EXCEPT_DENORM     (0x0002)
+#define _MM_EXCEPT_DIV_ZERO   (0x0004)
+#define _MM_EXCEPT_OVERFLOW   (0x0008)
+#define _MM_EXCEPT_UNDERFLOW  (0x0010)
+#define _MM_EXCEPT_INEXACT    (0x0020)
+#define _MM_EXCEPT_MASK       (0x003f)
+
+#define _MM_MASK_INVALID      (0x0080)
+#define _MM_MASK_DENORM       (0x0100)
+#define _MM_MASK_DIV_ZERO     (0x0200)
+#define _MM_MASK_OVERFLOW     (0x0400)
+#define _MM_MASK_UNDERFLOW    (0x0800)
+#define _MM_MASK_INEXACT      (0x1000)
+#define _MM_MASK_MASK         (0x1f80)
+
+#define _MM_ROUND_NEAREST     (0x0000)
+#define _MM_ROUND_DOWN        (0x2000)
+#define _MM_ROUND_UP          (0x4000)
+#define _MM_ROUND_TOWARD_ZERO (0x6000)
+#define _MM_ROUND_MASK        (0x6000)
+
+#define _MM_FLUSH_ZERO_MASK   (0x8000)
+#define _MM_FLUSH_ZERO_ON     (0x8000)
+#define _MM_FLUSH_ZERO_OFF    (0x0000)
+
+#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK)
+#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
+#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
+#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK)
+
+#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))
+#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))
+#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x)))
+#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))
+
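+/* Illustrative usage (editor's sketch, not part of the upstream header):
+ * the _MM_SET_* helpers read MXCSR, clear only the relevant field and write
+ * the register back, so the remaining control and status bits are preserved.
+ * __example_configure_mxcsr is a hypothetical helper. */
+static __inline__ void __DEFAULT_FN_ATTRS
+__example_configure_mxcsr(void)
+{
+  _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
+  _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
+}
+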
+#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
+do { \
+  __m128 tmp3, tmp2, tmp1, tmp0; \
+  tmp0 = _mm_unpacklo_ps((row0), (row1)); \
+  tmp2 = _mm_unpacklo_ps((row2), (row3)); \
+  tmp1 = _mm_unpackhi_ps((row0), (row1)); \
+  tmp3 = _mm_unpackhi_ps((row2), (row3)); \
+  (row0) = _mm_movelh_ps(tmp0, tmp2); \
+  (row1) = _mm_movehl_ps(tmp2, tmp0); \
+  (row2) = _mm_movelh_ps(tmp1, tmp3); \
+  (row3) = _mm_movehl_ps(tmp3, tmp1); \
+} while (0)
+
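+/* Illustrative usage (editor's sketch, not part of the upstream header):
+ * transpose a row-major 4x4 float matrix. The macro rewrites its four row
+ * arguments in place, so they must be plain lvalues.
+ * __example_transpose4x4 is a hypothetical helper. */
+static __inline__ void __DEFAULT_FN_ATTRS
+__example_transpose4x4(float *__m)
+{
+  __m128 __r0 = _mm_loadu_ps(__m + 0);
+  __m128 __r1 = _mm_loadu_ps(__m + 4);
+  __m128 __r2 = _mm_loadu_ps(__m + 8);
+  __m128 __r3 = _mm_loadu_ps(__m + 12);
+  _MM_TRANSPOSE4_PS(__r0, __r1, __r2, __r3);
+  _mm_storeu_ps(__m + 0, __r0);
+  _mm_storeu_ps(__m + 4, __r1);
+  _mm_storeu_ps(__m + 8, __r2);
+  _mm_storeu_ps(__m + 12, __r3);
+}
+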
+/* Aliases for compatibility. */
+#define _m_pextrw _mm_extract_pi16
+#define _m_pinsrw _mm_insert_pi16
+#define _m_pmaxsw _mm_max_pi16
+#define _m_pmaxub _mm_max_pu8
+#define _m_pminsw _mm_min_pi16
+#define _m_pminub _mm_min_pu8
+#define _m_pmovmskb _mm_movemask_pi8
+#define _m_pmulhuw _mm_mulhi_pu16
+#define _m_pshufw _mm_shuffle_pi16
+#define _m_maskmovq _mm_maskmove_si64
+#define _m_pavgb _mm_avg_pu8
+#define _m_pavgw _mm_avg_pu16
+#define _m_psadbw _mm_sad_pu8
+#define _m_ _mm_
+
+#undef __DEFAULT_FN_ATTRS
+
+/* Ugly hack for backwards-compatibility (compatible with gcc) */
+#if defined(__SSE2__) && !__has_feature(modules)
+#include <emmintrin.h>
+#endif
+
+#endif /* __XMMINTRIN_H */
diff --git a/25.0.2/clang-include/xopintrin.h b/25.0.2/clang-include/xopintrin.h
new file mode 100644
index 0000000..f07f51c
--- /dev/null
+++ b/25.0.2/clang-include/xopintrin.h
@@ -0,0 +1,782 @@
+/*===---- xopintrin.h - XOP intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __X86INTRIN_H
+#error "Never use <xopintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __XOPINTRIN_H
+#define __XOPINTRIN_H
+
+#include <fma4intrin.h>
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("xop")))
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maccs_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacssww((__v8hi)__A, (__v8hi)__B, (__v8hi)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_macc_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacsww((__v8hi)__A, (__v8hi)__B, (__v8hi)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maccsd_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacsswd((__v8hi)__A, (__v8hi)__B, (__v4si)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maccd_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacswd((__v8hi)__A, (__v8hi)__B, (__v4si)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maccs_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacssdd((__v4si)__A, (__v4si)__B, (__v4si)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_macc_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacsdd((__v4si)__A, (__v4si)__B, (__v4si)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maccslo_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacssdql((__v4si)__A, (__v4si)__B, (__v2di)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_macclo_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacsdql((__v4si)__A, (__v4si)__B, (__v2di)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maccshi_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacssdqh((__v4si)__A, (__v4si)__B, (__v2di)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_macchi_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacsdqh((__v4si)__A, (__v4si)__B, (__v2di)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maddsd_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmadcsswd((__v8hi)__A, (__v8hi)__B, (__v4si)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maddd_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmadcswd((__v8hi)__A, (__v8hi)__B, (__v4si)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_haddw_epi8(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphaddbw((__v16qi)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_haddd_epi8(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphaddbd((__v16qi)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_haddq_epi8(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphaddbq((__v16qi)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_haddd_epi16(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphaddwd((__v8hi)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_haddq_epi16(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphaddwq((__v8hi)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_haddq_epi32(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphadddq((__v4si)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_haddw_epu8(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphaddubw((__v16qi)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_haddd_epu8(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphaddubd((__v16qi)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_haddq_epu8(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphaddubq((__v16qi)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_haddd_epu16(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphadduwd((__v8hi)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_haddq_epu16(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphadduwq((__v8hi)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_haddq_epu32(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphaddudq((__v4si)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_hsubw_epi8(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphsubbw((__v16qi)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_hsubd_epi16(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphsubwd((__v8hi)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_hsubq_epi32(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphsubdq((__v4si)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cmov_si128(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpcmov(__A, __B, __C);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cmov_si256(__m256i __A, __m256i __B, __m256i __C)
+{
+  return (__m256i)__builtin_ia32_vpcmov_256(__A, __B, __C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_perm_epi8(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpperm((__v16qi)__A, (__v16qi)__B, (__v16qi)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_rot_epi8(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vprotb((__v16qi)__A, (__v16qi)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_rot_epi16(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vprotw((__v8hi)__A, (__v8hi)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_rot_epi32(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vprotd((__v4si)__A, (__v4si)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_rot_epi64(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vprotq((__v2di)__A, (__v2di)__B);
+}
+
+#define _mm_roti_epi8(A, N) __extension__ ({ \
+  (__m128i)__builtin_ia32_vprotbi((__v16qi)(__m128i)(A), (N)); })
+
+#define _mm_roti_epi16(A, N) __extension__ ({ \
+  (__m128i)__builtin_ia32_vprotwi((__v8hi)(__m128i)(A), (N)); })
+
+#define _mm_roti_epi32(A, N) __extension__ ({ \
+  (__m128i)__builtin_ia32_vprotdi((__v4si)(__m128i)(A), (N)); })
+
+#define _mm_roti_epi64(A, N) __extension__ ({ \
+  (__m128i)__builtin_ia32_vprotqi((__v2di)(__m128i)(A), (N)); })
+
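+/* Illustrative usage (editor's sketch, not part of the upstream header):
+ * _mm_roti_epi32 rotates each 32-bit lane left by the immediate; rotating by
+ * 16 swaps the two halfwords of every lane. __example_swap_halfwords is a
+ * hypothetical helper. */
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+__example_swap_halfwords(__m128i __A)
+{
+  return _mm_roti_epi32(__A, 16);
+}
+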
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_shl_epi8(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpshlb((__v16qi)__A, (__v16qi)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_shl_epi16(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpshlw((__v8hi)__A, (__v8hi)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_shl_epi32(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpshld((__v4si)__A, (__v4si)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_shl_epi64(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpshlq((__v2di)__A, (__v2di)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sha_epi8(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpshab((__v16qi)__A, (__v16qi)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sha_epi16(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpshaw((__v8hi)__A, (__v8hi)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sha_epi32(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpshad((__v4si)__A, (__v4si)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sha_epi64(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpshaq((__v2di)__A, (__v2di)__B);
+}
+
+#define _mm_com_epu8(A, B, N) __extension__ ({ \
+  (__m128i)__builtin_ia32_vpcomub((__v16qi)(__m128i)(A), \
+                                  (__v16qi)(__m128i)(B), (N)); })
+
+#define _mm_com_epu16(A, B, N) __extension__ ({ \
+  (__m128i)__builtin_ia32_vpcomuw((__v8hi)(__m128i)(A), \
+                                  (__v8hi)(__m128i)(B), (N)); })
+
+#define _mm_com_epu32(A, B, N) __extension__ ({ \
+  (__m128i)__builtin_ia32_vpcomud((__v4si)(__m128i)(A), \
+                                  (__v4si)(__m128i)(B), (N)); })
+
+#define _mm_com_epu64(A, B, N) __extension__ ({ \
+  (__m128i)__builtin_ia32_vpcomuq((__v2di)(__m128i)(A), \
+                                  (__v2di)(__m128i)(B), (N)); })
+
+#define _mm_com_epi8(A, B, N) __extension__ ({ \
+  (__m128i)__builtin_ia32_vpcomb((__v16qi)(__m128i)(A), \
+                                 (__v16qi)(__m128i)(B), (N)); })
+
+#define _mm_com_epi16(A, B, N) __extension__ ({ \
+  (__m128i)__builtin_ia32_vpcomw((__v8hi)(__m128i)(A), \
+                                 (__v8hi)(__m128i)(B), (N)); })
+
+#define _mm_com_epi32(A, B, N) __extension__ ({ \
+  (__m128i)__builtin_ia32_vpcomd((__v4si)(__m128i)(A), \
+                                 (__v4si)(__m128i)(B), (N)); })
+
+#define _mm_com_epi64(A, B, N) __extension__ ({ \
+  (__m128i)__builtin_ia32_vpcomq((__v2di)(__m128i)(A), \
+                                 (__v2di)(__m128i)(B), (N)); })
+
+#define _MM_PCOMCTRL_LT    0
+#define _MM_PCOMCTRL_LE    1
+#define _MM_PCOMCTRL_GT    2
+#define _MM_PCOMCTRL_GE    3
+#define _MM_PCOMCTRL_EQ    4
+#define _MM_PCOMCTRL_NEQ   5
+#define _MM_PCOMCTRL_FALSE 6
+#define _MM_PCOMCTRL_TRUE  7
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comlt_epu8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_LT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comle_epu8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_LE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comgt_epu8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_GT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comge_epu8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_GE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comeq_epu8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_EQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comneq_epu8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_NEQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comfalse_epu8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_FALSE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comtrue_epu8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_TRUE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comlt_epu16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_LT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comle_epu16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_LE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comgt_epu16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_GT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comge_epu16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_GE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comeq_epu16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_EQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comneq_epu16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_NEQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comfalse_epu16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_FALSE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comtrue_epu16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_TRUE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comlt_epu32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_LT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comle_epu32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_LE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comgt_epu32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_GT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comge_epu32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_GE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comeq_epu32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_EQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comneq_epu32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_NEQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comfalse_epu32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_FALSE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comtrue_epu32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_TRUE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comlt_epu64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_LT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comle_epu64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_LE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comgt_epu64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_GT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comge_epu64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_GE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comeq_epu64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_EQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comneq_epu64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_NEQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comfalse_epu64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_FALSE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comtrue_epu64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_TRUE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comlt_epi8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_LT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comle_epi8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_LE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comgt_epi8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_GT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comge_epi8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_GE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comeq_epi8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_EQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comneq_epi8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_NEQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comfalse_epi8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_FALSE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comtrue_epi8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_TRUE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comlt_epi16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_LT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comle_epi16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_LE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comgt_epi16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_GT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comge_epi16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_GE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comeq_epi16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_EQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comneq_epi16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_NEQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comfalse_epi16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_FALSE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comtrue_epi16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_TRUE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comlt_epi32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_LT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comle_epi32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_LE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comgt_epi32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_GT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comge_epi32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_GE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comeq_epi32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_EQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comneq_epi32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_NEQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comfalse_epi32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_FALSE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comtrue_epi32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_TRUE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comlt_epi64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_LT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comle_epi64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_LE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comgt_epi64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_GT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comge_epi64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_GE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comeq_epi64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_EQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comneq_epi64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_NEQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comfalse_epi64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_FALSE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comtrue_epi64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_TRUE);
+}
+
+#define _mm_permute2_pd(X, Y, C, I) __extension__ ({ \
+  (__m128d)__builtin_ia32_vpermil2pd((__v2df)(__m128d)(X), \
+                                     (__v2df)(__m128d)(Y), \
+                                     (__v2di)(__m128i)(C), (I)); })
+
+#define _mm256_permute2_pd(X, Y, C, I) __extension__ ({ \
+  (__m256d)__builtin_ia32_vpermil2pd256((__v4df)(__m256d)(X), \
+                                        (__v4df)(__m256d)(Y), \
+                                        (__v4di)(__m256i)(C), (I)); })
+
+#define _mm_permute2_ps(X, Y, C, I) __extension__ ({ \
+  (__m128)__builtin_ia32_vpermil2ps((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), \
+                                    (__v4si)(__m128i)(C), (I)); })
+
+#define _mm256_permute2_ps(X, Y, C, I) __extension__ ({ \
+  (__m256)__builtin_ia32_vpermil2ps256((__v8sf)(__m256)(X), \
+                                       (__v8sf)(__m256)(Y), \
+                                       (__v8si)(__m256i)(C), (I)); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_frcz_ss(__m128 __A)
+{
+  return (__m128)__builtin_ia32_vfrczss((__v4sf)__A);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_frcz_sd(__m128d __A)
+{
+  return (__m128d)__builtin_ia32_vfrczsd((__v2df)__A);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_frcz_ps(__m128 __A)
+{
+  return (__m128)__builtin_ia32_vfrczps((__v4sf)__A);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_frcz_pd(__m128d __A)
+{
+  return (__m128d)__builtin_ia32_vfrczpd((__v2df)__A);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_frcz_ps(__m256 __A)
+{
+  return (__m256)__builtin_ia32_vfrczps256((__v8sf)__A);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_frcz_pd(__m256d __A)
+{
+  return (__m256d)__builtin_ia32_vfrczpd256((__v4df)__A);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __XOPINTRIN_H */
diff --git a/25.0.2/clang-include/xsavecintrin.h b/25.0.2/clang-include/xsavecintrin.h
new file mode 100644
index 0000000..598470a
--- /dev/null
+++ b/25.0.2/clang-include/xsavecintrin.h
@@ -0,0 +1,48 @@
+/*===---- xsavecintrin.h - XSAVEC intrinsic ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <xsavecintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __XSAVECINTRIN_H
+#define __XSAVECINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__,  __target__("xsavec")))
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_xsavec(void *__p, unsigned long long __m) {
+  __builtin_ia32_xsavec(__p, __m);
+}
+
+#ifdef __x86_64__
+static __inline__ void __DEFAULT_FN_ATTRS
+_xsavec64(void *__p, unsigned long long __m) {
+  __builtin_ia32_xsavec64(__p, __m);
+}
+#endif
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
diff --git a/25.0.2/clang-include/xsaveintrin.h b/25.0.2/clang-include/xsaveintrin.h
new file mode 100644
index 0000000..a2e6b2e
--- /dev/null
+++ b/25.0.2/clang-include/xsaveintrin.h
@@ -0,0 +1,58 @@
+/*===---- xsaveintrin.h - XSAVE intrinsic ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <xsaveintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __XSAVEINTRIN_H
+#define __XSAVEINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__,  __target__("xsave")))
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_xsave(void *__p, unsigned long long __m) {
+  return __builtin_ia32_xsave(__p, __m);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_xrstor(void *__p, unsigned long long __m) {
+  return __builtin_ia32_xrstor(__p, __m);
+}
+
+#ifdef __x86_64__
+static __inline__ void __DEFAULT_FN_ATTRS
+_xsave64(void *__p, unsigned long long __m) {
+  return __builtin_ia32_xsave64(__p, __m);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_xrstor64(void *__p, unsigned long long __m) {
+  return __builtin_ia32_xrstor64(__p, __m);
+}
+#endif
+
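+/* Illustrative usage (editor's sketch, not part of the upstream header):
+ * save and restore the legacy x87/SSE state (XCR0 bits 0 and 1). The area
+ * must be 64-byte aligned, zero-initialized before first use, and at least
+ * 576 bytes (legacy region plus XSAVE header); real code should size it from
+ * CPUID leaf 0Dh. __example_xsave_roundtrip is a hypothetical helper. */
+static __inline__ void __DEFAULT_FN_ATTRS
+__example_xsave_roundtrip(void *__area)
+{
+  _xsave(__area, 3ULL);
+  _xrstor(__area, 3ULL);
+}
+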
+#undef __DEFAULT_FN_ATTRS
+
+#endif
diff --git a/25.0.2/clang-include/xsaveoptintrin.h b/25.0.2/clang-include/xsaveoptintrin.h
new file mode 100644
index 0000000..d3faae7
--- /dev/null
+++ b/25.0.2/clang-include/xsaveoptintrin.h
@@ -0,0 +1,48 @@
+/*===---- xsaveoptintrin.h - XSAVEOPT intrinsic ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <xsaveoptintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __XSAVEOPTINTRIN_H
+#define __XSAVEOPTINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__,  __target__("xsaveopt")))
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_xsaveopt(void *__p, unsigned long long __m) {
+  return __builtin_ia32_xsaveopt(__p, __m);
+}
+
+#ifdef __x86_64__
+static __inline__ void __DEFAULT_FN_ATTRS
+_xsaveopt64(void *__p, unsigned long long __m) {
+  return __builtin_ia32_xsaveopt64(__p, __m);
+}
+#endif
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
diff --git a/25.0.2/clang-include/xsavesintrin.h b/25.0.2/clang-include/xsavesintrin.h
new file mode 100644
index 0000000..c5e540a
--- /dev/null
+++ b/25.0.2/clang-include/xsavesintrin.h
@@ -0,0 +1,58 @@
+/*===---- xsavesintrin.h - XSAVES intrinsic ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <xsavesintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __XSAVESINTRIN_H
+#define __XSAVESINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__,  __target__("xsaves")))
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_xsaves(void *__p, unsigned long long __m) {
+  __builtin_ia32_xsaves(__p, __m);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_xrstors(void *__p, unsigned long long __m) {
+  __builtin_ia32_xrstors(__p, __m);
+}
+
+#ifdef __x86_64__
+static __inline__ void __DEFAULT_FN_ATTRS
+_xrstors64(void *__p, unsigned long long __m) {
+  __builtin_ia32_xrstors64(__p, __m);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_xsaves64(void *__p, unsigned long long __m) {
+  __builtin_ia32_xsaves64(__p, __m);
+}
+#endif
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
diff --git a/25.0.2/clang-include/xtestintrin.h b/25.0.2/clang-include/xtestintrin.h
new file mode 100644
index 0000000..9d3378f
--- /dev/null
+++ b/25.0.2/clang-include/xtestintrin.h
@@ -0,0 +1,41 @@
+/*===---- xtestintrin.h - XTEST intrinsic ---------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <xtestintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __XTESTINTRIN_H
+#define __XTESTINTRIN_H
+
+/* xtest returns non-zero if the instruction is executed within an RTM or active
+ * HLE region. */
+/* FIXME: This can be an either or for RTM/HLE. Deal with this when HLE is
+ * supported. */
+static __inline__ int
+    __attribute__((__always_inline__, __nodebug__, __target__("rtm")))
+    _xtest(void) {
+  return __builtin_ia32_xtest();
+}
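+
+/* A minimal sketch; _xbegin(), _xend() and _XBEGIN_STARTED are assumed to be
+ * available through <immintrin.h> (they come from the RTM intrinsics, not from
+ * this header):
+ *
+ *   if (_xbegin() == _XBEGIN_STARTED) {
+ *     int in_txn = _xtest();   // non-zero while the transaction is active
+ *     _xend();
+ *   }
+ */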
+
+#endif
diff --git a/25.0.2/include/rs_allocation_create.rsh b/25.0.2/include/rs_allocation_create.rsh
new file mode 100644
index 0000000..d7f9fd6
--- /dev/null
+++ b/25.0.2/include/rs_allocation_create.rsh
@@ -0,0 +1,1345 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Don't edit this file!  It is auto-generated by frameworks/rs/api/generate.sh.
+
+/*
+ * rs_allocation_create.rsh: Allocation Creation Functions
+ *
+ * The functions below can be used to create Allocations from a Script.
+ *
+ * These functions can be called directly or indirectly from an invokable
+ * function.  If some control-flow path can result in a call to these functions
+ * from a RenderScript kernel function, a compiler error will be generated.
+ */
+
+#ifndef RENDERSCRIPT_RS_ALLOCATION_CREATE_RSH
+#define RENDERSCRIPT_RS_ALLOCATION_CREATE_RSH
+
+/*
+ * rsCreateElement: Creates an rs_element object of the specified data type
+ *
+ *  Creates an rs_element object of the specified data type.  The data kind of
+ *  the Element will be set to RS_KIND_USER and vector_width will be set to 1,
+ *  indicating non-vector.
+ *
+ * Parameters:
+ *   data_type: Data type of the Element
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern rs_element __attribute__((overloadable))
+    rsCreateElement(rs_data_type data_type);
+#endif
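+
+/*
+ * A minimal sketch: creating a scalar 32-bit signed integer Element from an
+ * invokable function (the choice of RS_TYPE_SIGNED_32 is illustrative).
+ *
+ *   rs_element e = rsCreateElement(RS_TYPE_SIGNED_32);
+ */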
+
+/*
+ * rsCreateVectorElement: Creates an rs_element object of the specified data type and vector width
+ *
+ *  Creates an rs_element object of the specified data type and vector width.
+ *  Value of vector_width must be 2, 3 or 4.  The data kind of the Element will
+ *  be set to RS_KIND_USER.
+ *
+ * Parameters:
+ *   data_type: Data type of the Element
+ *   vector_width: Vector width (either 2, 3, or 4)
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern rs_element __attribute__((overloadable))
+    rsCreateVectorElement(rs_data_type data_type, uint32_t vector_width);
+#endif
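+
+/*
+ * A minimal sketch: creating a float4 Element (data type RS_TYPE_FLOAT_32,
+ * vector width 4); the values are illustrative.
+ *
+ *   rs_element e4 = rsCreateVectorElement(RS_TYPE_FLOAT_32, 4);
+ */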
+
+/*
+ * rsCreatePixelElement: Creates an rs_element object of the specified data type and data kind
+ *
+ *  Creates an rs_element object of the specified data type and data kind.  The
+ *  vector_width of the Element will be set to 1, indicating non-vector.
+ *
+ * Parameters:
+ *   data_type: Data type of the Element
+ *   data_kind: Data kind of the Element
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern rs_element __attribute__((overloadable))
+    rsCreatePixelElement(rs_data_type data_type, rs_data_kind data_kind);
+#endif
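+
+/*
+ * A minimal sketch: creating an Element for 8-bit RGBA pixel data.  The data
+ * kind RS_KIND_PIXEL_RGBA is assumed here from the rs_data_kind enumeration.
+ *
+ *   rs_element e = rsCreatePixelElement(RS_TYPE_UNSIGNED_8, RS_KIND_PIXEL_RGBA);
+ */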
+
+/*
+ * rsCreateType: Creates an rs_type object with the specified Element and shape attributes
+ *
+ *  Creates an rs_type object with the specified Element and shape attributes.
+ *
+ *  dimX specifies the size of the X dimension.
+ *
+ *  dimY, if present and non-zero, indicates that the Y dimension is present and
+ *  indicates its size.
+ *
+ *  dimZ, if present and non-zero, indicates that the Z dimension is present and
+ *  indicates its size.
+ *
+ *  mipmaps indicates the presence of level of detail (LOD).
+ *
+ *  faces indicates the presence of cubemap faces.
+ *
+ *  yuv_format indicates the associated YUV format (or RS_YUV_NONE).
+ *
+ * Parameters:
+ *   element: Element to be associated with the Type
+ *   dimX: Size along the X dimension
+ *   dimY: Size along the Y dimension
+ *   dimZ: Size along the Z dimension
+ *   mipmaps: Flag indicating if the Type has a mipmap chain
+ *   faces: Flag indicating if the Type is a cubemap
+ *   yuv_format: YUV layout for the Type
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern rs_type __attribute__((overloadable))
+    rsCreateType(rs_element element, uint32_t dimX, uint32_t dimY, uint32_t dimZ, bool mipmaps,
+                 bool faces, rs_yuv_format yuv_format);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern rs_type __attribute__((overloadable))
+    rsCreateType(rs_element element, uint32_t dimX, uint32_t dimY, uint32_t dimZ);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern rs_type __attribute__((overloadable))
+    rsCreateType(rs_element element, uint32_t dimX, uint32_t dimY);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern rs_type __attribute__((overloadable))
+    rsCreateType(rs_element element, uint32_t dimX);
+#endif
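+
+/*
+ * A minimal sketch: building a 2D 640x480 Type, first with the short overload
+ * and then with the full form (no mipmaps, no cubemap faces, no YUV); the
+ * dimensions are illustrative.
+ *
+ *   rs_element e = rsCreateElement(RS_TYPE_FLOAT_32);
+ *   rs_type t1 = rsCreateType(e, 640, 480);
+ *   rs_type t2 = rsCreateType(e, 640, 480, 0, false, false, RS_YUV_NONE);
+ */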
+
+/*
+ * rsCreateAllocation: Create an rs_allocation object of the given Type.
+ *
+ *  Creates an rs_allocation object of the given Type and usage.
+ *
+ *  RS_ALLOCATION_USAGE_SCRIPT and RS_ALLOCATION_USAGE_GRAPHICS_TEXTURE are the
+ *  only supported usage flags for Allocations created from within a RenderScript
+ *  Script.
+ *
+ *  You can also use rsCreateAllocation_ wrapper functions to directly
+ *  create Allocations of scalar and vector numerical types without creating
+ *  intermediate rs_element or rs_type objects.
+ *
+ *  E.g. rsCreateAllocation_int4() returns an Allocation of int4 data type of
+ *  specified dimensions.
+ *
+ * Parameters:
+ *   type: Type of the Allocation
+ *   usage: Usage flag for the allocation
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern rs_allocation __attribute__((overloadable))
+    rsCreateAllocation(rs_type type, uint32_t usage);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern rs_allocation __attribute__((overloadable))
+    rsCreateAllocation(rs_type type);
+#endif
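+
+/*
+ * A minimal sketch tying the pieces together: a 256x256 uchar4 Allocation
+ * created with the script usage flag.  The dimensions are illustrative.
+ *
+ *   rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_8, 4);
+ *   rs_type t = rsCreateType(e, 256, 256);
+ *   rs_allocation a = rsCreateAllocation(t, RS_ALLOCATION_USAGE_SCRIPT);
+ */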
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_half(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateElement(RS_TYPE_FLOAT_16);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_float(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateElement(RS_TYPE_FLOAT_32);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_double(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateElement(RS_TYPE_FLOAT_64);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_char(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateElement(RS_TYPE_SIGNED_8);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_uchar(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateElement(RS_TYPE_UNSIGNED_8);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_short(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateElement(RS_TYPE_SIGNED_16);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_ushort(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateElement(RS_TYPE_UNSIGNED_16);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_int(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateElement(RS_TYPE_SIGNED_32);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_uint(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateElement(RS_TYPE_UNSIGNED_32);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_long(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateElement(RS_TYPE_SIGNED_64);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_ulong(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateElement(RS_TYPE_UNSIGNED_64);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_half2(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_FLOAT_16, 2);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_half3(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_FLOAT_16, 3);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_half4(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_FLOAT_16, 4);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_float2(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_FLOAT_32, 2);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_float3(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_FLOAT_32, 3);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_float4(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_FLOAT_32, 4);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_double2(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_FLOAT_64, 2);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_double3(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_FLOAT_64, 3);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_double4(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_FLOAT_64, 4);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_char2(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_8, 2);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_char3(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_8, 3);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_char4(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_8, 4);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_uchar2(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_8, 2);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_uchar3(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_8, 3);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_uchar4(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_8, 4);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_short2(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_16, 2);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_short3(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_16, 3);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_short4(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_16, 4);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_ushort2(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_16, 2);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_ushort3(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_16, 3);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_ushort4(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_16, 4);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_int2(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_32, 2);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_int3(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_32, 3);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_int4(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_32, 4);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_uint2(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_32, 2);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_uint3(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_32, 3);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_uint4(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_32, 4);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_long2(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_64, 2);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_long3(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_64, 3);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_long4(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_64, 4);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_ulong2(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_64, 2);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_ulong3(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_64, 3);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_ulong4(uint32_t dimX, uint32_t dimY, uint32_t dimZ) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_64, 4);
+     rs_type t = rsCreateType(e, dimX, dimY, dimZ);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_half(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateElement(RS_TYPE_FLOAT_16);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_float(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateElement(RS_TYPE_FLOAT_32);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_double(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateElement(RS_TYPE_FLOAT_64);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_char(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateElement(RS_TYPE_SIGNED_8);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_uchar(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateElement(RS_TYPE_UNSIGNED_8);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_short(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateElement(RS_TYPE_SIGNED_16);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_ushort(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateElement(RS_TYPE_UNSIGNED_16);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_int(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateElement(RS_TYPE_SIGNED_32);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_uint(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateElement(RS_TYPE_UNSIGNED_32);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_long(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateElement(RS_TYPE_SIGNED_64);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_ulong(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateElement(RS_TYPE_UNSIGNED_64);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_half2(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_FLOAT_16, 2);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_half3(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_FLOAT_16, 3);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_half4(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_FLOAT_16, 4);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_float2(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_FLOAT_32, 2);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_float3(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_FLOAT_32, 3);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_float4(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_FLOAT_32, 4);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_double2(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_FLOAT_64, 2);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_double3(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_FLOAT_64, 3);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_double4(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_FLOAT_64, 4);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_char2(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_8, 2);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_char3(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_8, 3);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_char4(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_8, 4);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_uchar2(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_8, 2);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_uchar3(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_8, 3);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_uchar4(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_8, 4);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_short2(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_16, 2);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_short3(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_16, 3);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_short4(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_16, 4);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_ushort2(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_16, 2);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_ushort3(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_16, 3);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_ushort4(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_16, 4);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_int2(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_32, 2);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_int3(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_32, 3);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_int4(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_32, 4);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_uint2(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_32, 2);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_uint3(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_32, 3);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_uint4(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_32, 4);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_long2(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_64, 2);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_long3(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_64, 3);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_long4(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_64, 4);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_ulong2(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_64, 2);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_ulong3(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_64, 3);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_ulong4(uint32_t dimX, uint32_t dimY) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_64, 4);
+     rs_type t = rsCreateType(e, dimX, dimY);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_half(uint32_t dimX) {
+     rs_element e = rsCreateElement(RS_TYPE_FLOAT_16);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_float(uint32_t dimX) {
+     rs_element e = rsCreateElement(RS_TYPE_FLOAT_32);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_double(uint32_t dimX) {
+     rs_element e = rsCreateElement(RS_TYPE_FLOAT_64);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_char(uint32_t dimX) {
+     rs_element e = rsCreateElement(RS_TYPE_SIGNED_8);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_uchar(uint32_t dimX) {
+     rs_element e = rsCreateElement(RS_TYPE_UNSIGNED_8);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_short(uint32_t dimX) {
+     rs_element e = rsCreateElement(RS_TYPE_SIGNED_16);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_ushort(uint32_t dimX) {
+     rs_element e = rsCreateElement(RS_TYPE_UNSIGNED_16);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_int(uint32_t dimX) {
+     rs_element e = rsCreateElement(RS_TYPE_SIGNED_32);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_uint(uint32_t dimX) {
+     rs_element e = rsCreateElement(RS_TYPE_UNSIGNED_32);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_long(uint32_t dimX) {
+     rs_element e = rsCreateElement(RS_TYPE_SIGNED_64);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_ulong(uint32_t dimX) {
+     rs_element e = rsCreateElement(RS_TYPE_UNSIGNED_64);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_half2(uint32_t dimX) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_FLOAT_16, 2);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_half3(uint32_t dimX) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_FLOAT_16, 3);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_half4(uint32_t dimX) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_FLOAT_16, 4);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_float2(uint32_t dimX) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_FLOAT_32, 2);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_float3(uint32_t dimX) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_FLOAT_32, 3);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_float4(uint32_t dimX) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_FLOAT_32, 4);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_double2(uint32_t dimX) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_FLOAT_64, 2);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_double3(uint32_t dimX) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_FLOAT_64, 3);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_double4(uint32_t dimX) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_FLOAT_64, 4);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_char2(uint32_t dimX) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_8, 2);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_char3(uint32_t dimX) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_8, 3);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_char4(uint32_t dimX) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_8, 4);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_uchar2(uint32_t dimX) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_8, 2);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_uchar3(uint32_t dimX) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_8, 3);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_uchar4(uint32_t dimX) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_8, 4);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_short2(uint32_t dimX) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_16, 2);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_short3(uint32_t dimX) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_16, 3);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_short4(uint32_t dimX) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_16, 4);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_ushort2(uint32_t dimX) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_16, 2);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_ushort3(uint32_t dimX) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_16, 3);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_ushort4(uint32_t dimX) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_16, 4);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_int2(uint32_t dimX) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_32, 2);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_int3(uint32_t dimX) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_32, 3);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_int4(uint32_t dimX) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_32, 4);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_uint2(uint32_t dimX) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_32, 2);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_uint3(uint32_t dimX) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_32, 3);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_uint4(uint32_t dimX) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_32, 4);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_long2(uint32_t dimX) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_64, 2);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_long3(uint32_t dimX) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_64, 3);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_long4(uint32_t dimX) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_SIGNED_64, 4);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_ulong2(uint32_t dimX) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_64, 2);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_ulong3(uint32_t dimX) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_64, 3);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+static inline rs_allocation __attribute__((overloadable))
+    rsCreateAllocation_ulong4(uint32_t dimX) {
+     rs_element e = rsCreateVectorElement(RS_TYPE_UNSIGNED_64, 4);
+     rs_type t = rsCreateType(e, dimX);
+     return rsCreateAllocation(t);
+}
+#endif
+
+#endif // RENDERSCRIPT_RS_ALLOCATION_CREATE_RSH
diff --git a/25.0.2/include/rs_allocation_data.rsh b/25.0.2/include/rs_allocation_data.rsh
new file mode 100644
index 0000000..ea16767
--- /dev/null
+++ b/25.0.2/include/rs_allocation_data.rsh
@@ -0,0 +1,3365 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Don't edit this file!  It is auto-generated by frameworks/rs/api/generate.sh.
+
+/*
+ * rs_allocation_data.rsh: Allocation Data Access Functions
+ *
+ * The functions below can be used to get and set the cells that comprise
+ * an allocation.
+ *
+ * - Individual cells are accessed using the rsGetElementAt* and
+ *   rsSetElementAt* functions.
+ * - Multiple cells can be copied using the rsAllocationCopy* and
+ *   rsAllocationV* functions.
+ * - For getting values through a sampler, use rsSample.
+ *
+ * The rsGetElementAt and rsSetElement* functions are somewhat misnamed.
+ * They don't get or set elements, which are akin to data types; they get
+ * or set cells.  Think of them as rsGetCellAt and rsSetCellAt.
+ */
+
+#ifndef RENDERSCRIPT_RS_ALLOCATION_DATA_RSH
+#define RENDERSCRIPT_RS_ALLOCATION_DATA_RSH
+
+/*
+ * rsAllocationCopy1DRange: Copy consecutive cells between allocations
+ *
+ * Copies the specified number of cells from one allocation to another.
+ *
+ * The two allocations must be different.  Using this function to copy within
+ * the same allocation yields undefined results.
+ *
+ * The function does not validate whether the offset plus count exceeds the size
+ * of either allocation.  Be careful!
+ *
+ * This function should only be called between 1D allocations.  Calling it
+ * on other allocations is undefined.
+ *
+ * This function should not be called from inside a kernel, or from any function
+ * that may be called directly or indirectly from a kernel. Doing so would cause a
+ * runtime error.
+ *
+ * Parameters:
+ *   dstAlloc: Allocation to copy cells into.
+ *   dstOff: Offset in the destination of the first cell to be copied into.
+ *   dstMip: Mip level in the destination allocation.  0 if mip mapping is not used.
+ *   count: Number of cells to be copied.
+ *   srcAlloc: Source allocation.
+ *   srcOff: Offset in the source of the first cell to be copied.
+ *   srcMip: Mip level in the source allocation.  0 if mip mapping is not used.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+extern void __attribute__((overloadable))
+    rsAllocationCopy1DRange(rs_allocation dstAlloc, uint32_t dstOff, uint32_t dstMip, uint32_t count,
+                            rs_allocation srcAlloc, uint32_t srcOff, uint32_t srcMip);
+#endif
+
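+/*
+ * Usage sketch (illustrative only, not part of the generated header): copying
+ * the first `count` cells of a hypothetical 1D allocation `src` into `dst`
+ * from an invokable function, i.e. outside of any kernel.
+ *
+ *   void copyPrefix(rs_allocation dst, rs_allocation src, uint32_t count) {
+ *       // Both allocations are assumed to be distinct, 1D, and to hold at
+ *       // least `count` cells; mip level 0 and offset 0 on both sides.
+ *       rsAllocationCopy1DRange(dst, 0, 0, count, src, 0, 0);
+ *   }
+ */
+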
+/*
+ * rsAllocationCopy2DRange: Copy a rectangular region of cells between allocations
+ *
+ * Copies a rectangular region of cells from one allocation to another.
+ * (width * height) cells are copied.
+ *
+ * The two allocations must be different.  Using this function to copy within
+ * the same allocation yields undefined results.
+ *
+ * The function does not validate whether the source or destination region
+ * exceeds the size of its respective allocation.  Be careful!
+ *
+ * This function should only be called between 2D allocations.  Calling it
+ * on other allocations is undefined.
+ *
+ * This function should not be called from inside a kernel, or from any function
+ * that may be called directly or indirectly from a kernel. Doing so would cause a
+ * runtime error.
+ *
+ * Parameters:
+ *   dstAlloc: Allocation to copy cells into.
+ *   dstXoff: X offset in the destination of the region to be set.
+ *   dstYoff: Y offset in the destination of the region to be set.
+ *   dstMip: Mip level in the destination allocation.  0 if mip mapping is not used.
+ *   dstFace: Cubemap face of the destination allocation.  Ignored for allocations that aren't cubemaps.
+ *   width: Width of the incoming region to update.
+ *   height: Height of the incoming region to update.
+ *   srcAlloc: Source allocation.
+ *   srcXoff: X offset in the source.
+ *   srcYoff: Y offset in the source.
+ *   srcMip: Mip level in the source allocation.  0 if mip mapping is not used.
+ *   srcFace: Cubemap face of the source allocation.  Ignored for allocations that aren't cubemaps.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+extern void __attribute__((overloadable))
+    rsAllocationCopy2DRange(rs_allocation dstAlloc, uint32_t dstXoff, uint32_t dstYoff,
+                            uint32_t dstMip, rs_allocation_cubemap_face dstFace, uint32_t width,
+                            uint32_t height, rs_allocation srcAlloc, uint32_t srcXoff,
+                            uint32_t srcYoff, uint32_t srcMip, rs_allocation_cubemap_face srcFace);
+#endif
+
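+/*
+ * Usage sketch (illustrative only, not part of the generated header): copying
+ * a width x height region from the origin of a hypothetical 2D allocation
+ * `src` into `dst`, again from invokable (non-kernel) code.
+ *
+ *   void copyRegion(rs_allocation dst, rs_allocation src,
+ *                   uint32_t width, uint32_t height) {
+ *       // Neither allocation is assumed to be a cubemap, so the face
+ *       // arguments are the default positive-X face; mip level 0 and
+ *       // offsets 0 on both sides.
+ *       rsAllocationCopy2DRange(dst, 0, 0, 0, RS_ALLOCATION_CUBEMAP_FACE_POSITIVE_X,
+ *                               width, height, src, 0, 0, 0,
+ *                               RS_ALLOCATION_CUBEMAP_FACE_POSITIVE_X);
+ *   }
+ */
+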
+/*
+ * rsAllocationVLoadX: Get a vector from an allocation of scalars
+ *
+ * This function returns a vector composed of successive cells of the allocation.
+ * It assumes that the allocation contains scalars.
+ *
+ * The "X" in the name indicates that successive values are extracted by
+ * increasing the X index.  There are currently no functions to get successive
+ * values incrementing other dimensions.  Use multiple calls to rsGetElementAt()
+ * instead.
+ *
+ * For example, when calling rsAllocationVLoadX_int4(a, 20, 30), an int4 composed
+ * of a[20, 30], a[21, 30], a[22, 30], and a[23, 30] is returned.
+ *
+ * When retrieving from a three dimensional allocation, use the x, y, z variant.
+ * Similarly, use the x, y variant for two dimensional allocations and x for the
+ * mono dimensional allocations.
+ *
+ * For efficiency, this function does not validate the inputs.  Trying to wrap
+ * the X index, exceeding the size of the allocation, or using indices incompatible
+ * with the dimensionality of the allocation yields undefined results.
+ *
+ * See also rsAllocationVStoreX().
+ *
+ * Parameters:
+ *   a: Allocation to get the data from.
+ *   x: X offset in the allocation of the first cell to be copied from.
+ *   y: Y offset in the allocation of the first cell to be copied from.
+ *   z: Z offset in the allocation of the first cell to be copied from.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern float2 __attribute__((overloadable))
+    rsAllocationVLoadX_float2(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern float3 __attribute__((overloadable))
+    rsAllocationVLoadX_float3(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern float4 __attribute__((overloadable))
+    rsAllocationVLoadX_float4(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern double2 __attribute__((overloadable))
+    rsAllocationVLoadX_double2(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern double3 __attribute__((overloadable))
+    rsAllocationVLoadX_double3(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern double4 __attribute__((overloadable))
+    rsAllocationVLoadX_double4(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern char2 __attribute__((overloadable))
+    rsAllocationVLoadX_char2(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern char3 __attribute__((overloadable))
+    rsAllocationVLoadX_char3(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern char4 __attribute__((overloadable))
+    rsAllocationVLoadX_char4(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern uchar2 __attribute__((overloadable))
+    rsAllocationVLoadX_uchar2(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern uchar3 __attribute__((overloadable))
+    rsAllocationVLoadX_uchar3(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern uchar4 __attribute__((overloadable))
+    rsAllocationVLoadX_uchar4(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern short2 __attribute__((overloadable))
+    rsAllocationVLoadX_short2(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern short3 __attribute__((overloadable))
+    rsAllocationVLoadX_short3(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern short4 __attribute__((overloadable))
+    rsAllocationVLoadX_short4(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern ushort2 __attribute__((overloadable))
+    rsAllocationVLoadX_ushort2(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern ushort3 __attribute__((overloadable))
+    rsAllocationVLoadX_ushort3(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern ushort4 __attribute__((overloadable))
+    rsAllocationVLoadX_ushort4(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern int2 __attribute__((overloadable))
+    rsAllocationVLoadX_int2(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern int3 __attribute__((overloadable))
+    rsAllocationVLoadX_int3(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern int4 __attribute__((overloadable))
+    rsAllocationVLoadX_int4(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern uint2 __attribute__((overloadable))
+    rsAllocationVLoadX_uint2(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern uint3 __attribute__((overloadable))
+    rsAllocationVLoadX_uint3(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern uint4 __attribute__((overloadable))
+    rsAllocationVLoadX_uint4(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern long2 __attribute__((overloadable))
+    rsAllocationVLoadX_long2(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern long3 __attribute__((overloadable))
+    rsAllocationVLoadX_long3(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern long4 __attribute__((overloadable))
+    rsAllocationVLoadX_long4(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern ulong2 __attribute__((overloadable))
+    rsAllocationVLoadX_ulong2(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern ulong3 __attribute__((overloadable))
+    rsAllocationVLoadX_ulong3(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern ulong4 __attribute__((overloadable))
+    rsAllocationVLoadX_ulong4(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern float2 __attribute__((overloadable))
+    rsAllocationVLoadX_float2(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern float3 __attribute__((overloadable))
+    rsAllocationVLoadX_float3(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern float4 __attribute__((overloadable))
+    rsAllocationVLoadX_float4(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern double2 __attribute__((overloadable))
+    rsAllocationVLoadX_double2(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern double3 __attribute__((overloadable))
+    rsAllocationVLoadX_double3(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern double4 __attribute__((overloadable))
+    rsAllocationVLoadX_double4(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern char2 __attribute__((overloadable))
+    rsAllocationVLoadX_char2(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern char3 __attribute__((overloadable))
+    rsAllocationVLoadX_char3(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern char4 __attribute__((overloadable))
+    rsAllocationVLoadX_char4(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern uchar2 __attribute__((overloadable))
+    rsAllocationVLoadX_uchar2(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern uchar3 __attribute__((overloadable))
+    rsAllocationVLoadX_uchar3(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern uchar4 __attribute__((overloadable))
+    rsAllocationVLoadX_uchar4(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern short2 __attribute__((overloadable))
+    rsAllocationVLoadX_short2(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern short3 __attribute__((overloadable))
+    rsAllocationVLoadX_short3(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern short4 __attribute__((overloadable))
+    rsAllocationVLoadX_short4(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern ushort2 __attribute__((overloadable))
+    rsAllocationVLoadX_ushort2(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern ushort3 __attribute__((overloadable))
+    rsAllocationVLoadX_ushort3(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern ushort4 __attribute__((overloadable))
+    rsAllocationVLoadX_ushort4(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern int2 __attribute__((overloadable))
+    rsAllocationVLoadX_int2(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern int3 __attribute__((overloadable))
+    rsAllocationVLoadX_int3(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern int4 __attribute__((overloadable))
+    rsAllocationVLoadX_int4(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern uint2 __attribute__((overloadable))
+    rsAllocationVLoadX_uint2(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern uint3 __attribute__((overloadable))
+    rsAllocationVLoadX_uint3(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern uint4 __attribute__((overloadable))
+    rsAllocationVLoadX_uint4(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern long2 __attribute__((overloadable))
+    rsAllocationVLoadX_long2(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern long3 __attribute__((overloadable))
+    rsAllocationVLoadX_long3(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern long4 __attribute__((overloadable))
+    rsAllocationVLoadX_long4(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern ulong2 __attribute__((overloadable))
+    rsAllocationVLoadX_ulong2(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern ulong3 __attribute__((overloadable))
+    rsAllocationVLoadX_ulong3(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern ulong4 __attribute__((overloadable))
+    rsAllocationVLoadX_ulong4(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern float2 __attribute__((overloadable))
+    rsAllocationVLoadX_float2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern float3 __attribute__((overloadable))
+    rsAllocationVLoadX_float3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern float4 __attribute__((overloadable))
+    rsAllocationVLoadX_float4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern double2 __attribute__((overloadable))
+    rsAllocationVLoadX_double2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern double3 __attribute__((overloadable))
+    rsAllocationVLoadX_double3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern double4 __attribute__((overloadable))
+    rsAllocationVLoadX_double4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern char2 __attribute__((overloadable))
+    rsAllocationVLoadX_char2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern char3 __attribute__((overloadable))
+    rsAllocationVLoadX_char3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern char4 __attribute__((overloadable))
+    rsAllocationVLoadX_char4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern uchar2 __attribute__((overloadable))
+    rsAllocationVLoadX_uchar2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern uchar3 __attribute__((overloadable))
+    rsAllocationVLoadX_uchar3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern uchar4 __attribute__((overloadable))
+    rsAllocationVLoadX_uchar4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern short2 __attribute__((overloadable))
+    rsAllocationVLoadX_short2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern short3 __attribute__((overloadable))
+    rsAllocationVLoadX_short3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern short4 __attribute__((overloadable))
+    rsAllocationVLoadX_short4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern ushort2 __attribute__((overloadable))
+    rsAllocationVLoadX_ushort2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern ushort3 __attribute__((overloadable))
+    rsAllocationVLoadX_ushort3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern ushort4 __attribute__((overloadable))
+    rsAllocationVLoadX_ushort4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern int2 __attribute__((overloadable))
+    rsAllocationVLoadX_int2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern int3 __attribute__((overloadable))
+    rsAllocationVLoadX_int3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern int4 __attribute__((overloadable))
+    rsAllocationVLoadX_int4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern uint2 __attribute__((overloadable))
+    rsAllocationVLoadX_uint2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern uint3 __attribute__((overloadable))
+    rsAllocationVLoadX_uint3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern uint4 __attribute__((overloadable))
+    rsAllocationVLoadX_uint4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern long2 __attribute__((overloadable))
+    rsAllocationVLoadX_long2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern long3 __attribute__((overloadable))
+    rsAllocationVLoadX_long3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern long4 __attribute__((overloadable))
+    rsAllocationVLoadX_long4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern ulong2 __attribute__((overloadable))
+    rsAllocationVLoadX_ulong2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern ulong3 __attribute__((overloadable))
+    rsAllocationVLoadX_ulong3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern ulong4 __attribute__((overloadable))
+    rsAllocationVLoadX_ulong4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
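+/*
+ * Usage sketch for the rsAllocationVLoadX_* loads declared above (illustrative
+ * only, not part of the generated header): summing four consecutive int cells
+ * of a hypothetical 1D allocation of scalars.
+ *
+ *   static int sumQuad(rs_allocation a, uint32_t x) {
+ *       // Loads a[x], a[x+1], a[x+2], a[x+3] as one int4; the caller must
+ *       // ensure x + 3 stays inside the allocation.
+ *       int4 v = rsAllocationVLoadX_int4(a, x);
+ *       return v.x + v.y + v.z + v.w;
+ *   }
+ */
+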
+/*
+ * rsAllocationVStoreX: Store a vector into an allocation of scalars
+ *
+ * This function stores the entries of a vector into successive cells of an allocation.
+ * It assumes that the allocation contains scalars.
+ *
+ * The "X" in the name indicates that successive values are stored by increasing
+ * the X index.  There are currently no functions to store successive values
+ * incrementing other dimensions.  Use multiple calls to rsSetElementAt() instead.
+ *
+ * For example, when calling rsAllocationVStoreX_int3(a, v, 20, 30), v.x is stored
+ * at a[20, 30], v.y at a[21, 30], and v.z at a[22, 30].
+ *
+ * When storing into a three dimensional allocation, use the x, y, z variant.
+ * Similarly, use the x, y variant for two dimensional allocations and x for the
+ * mono dimensional allocations.
+ *
+ * For efficiency, this function does not validate the inputs.  Trying to wrap the
+ * X index, exceeding the size of the allocation, or using indices incompatible
+ * with the dimensionality of the allocation yields undefined results.
+ *
+ * See also rsAllocationVLoadX().
+ *
+ * Parameters:
+ *   a: Allocation to store the data into.
+ *   val: Value to be stored.
+ *   x: X offset in the allocation of the first cell to be copied into.
+ *   y: Y offset in the allocation of the first cell to be copied into.
+ *   z: Z offset in the allocation of the first cell to be copied into.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_float2(rs_allocation a, float2 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_float3(rs_allocation a, float3 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_float4(rs_allocation a, float4 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_double2(rs_allocation a, double2 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_double3(rs_allocation a, double3 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_double4(rs_allocation a, double4 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_char2(rs_allocation a, char2 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_char3(rs_allocation a, char3 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_char4(rs_allocation a, char4 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_uchar2(rs_allocation a, uchar2 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_uchar3(rs_allocation a, uchar3 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_uchar4(rs_allocation a, uchar4 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_short2(rs_allocation a, short2 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_short3(rs_allocation a, short3 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_short4(rs_allocation a, short4 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_ushort2(rs_allocation a, ushort2 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_ushort3(rs_allocation a, ushort3 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_ushort4(rs_allocation a, ushort4 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_int2(rs_allocation a, int2 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_int3(rs_allocation a, int3 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_int4(rs_allocation a, int4 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_uint2(rs_allocation a, uint2 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_uint3(rs_allocation a, uint3 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_uint4(rs_allocation a, uint4 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_long2(rs_allocation a, long2 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_long3(rs_allocation a, long3 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_long4(rs_allocation a, long4 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_ulong2(rs_allocation a, ulong2 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_ulong3(rs_allocation a, ulong3 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_ulong4(rs_allocation a, ulong4 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_float2(rs_allocation a, float2 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_float3(rs_allocation a, float3 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_float4(rs_allocation a, float4 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_double2(rs_allocation a, double2 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_double3(rs_allocation a, double3 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_double4(rs_allocation a, double4 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_char2(rs_allocation a, char2 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_char3(rs_allocation a, char3 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_char4(rs_allocation a, char4 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_uchar2(rs_allocation a, uchar2 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_uchar3(rs_allocation a, uchar3 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_uchar4(rs_allocation a, uchar4 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_short2(rs_allocation a, short2 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_short3(rs_allocation a, short3 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_short4(rs_allocation a, short4 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_ushort2(rs_allocation a, ushort2 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_ushort3(rs_allocation a, ushort3 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_ushort4(rs_allocation a, ushort4 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_int2(rs_allocation a, int2 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_int3(rs_allocation a, int3 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_int4(rs_allocation a, int4 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_uint2(rs_allocation a, uint2 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_uint3(rs_allocation a, uint3 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_uint4(rs_allocation a, uint4 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_long2(rs_allocation a, long2 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_long3(rs_allocation a, long3 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_long4(rs_allocation a, long4 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_ulong2(rs_allocation a, ulong2 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_ulong3(rs_allocation a, ulong3 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_ulong4(rs_allocation a, ulong4 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_float2(rs_allocation a, float2 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_float3(rs_allocation a, float3 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_float4(rs_allocation a, float4 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_double2(rs_allocation a, double2 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_double3(rs_allocation a, double3 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_double4(rs_allocation a, double4 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_char2(rs_allocation a, char2 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_char3(rs_allocation a, char3 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_char4(rs_allocation a, char4 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_uchar2(rs_allocation a, uchar2 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_uchar3(rs_allocation a, uchar3 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_uchar4(rs_allocation a, uchar4 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_short2(rs_allocation a, short2 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_short3(rs_allocation a, short3 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_short4(rs_allocation a, short4 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_ushort2(rs_allocation a, ushort2 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_ushort3(rs_allocation a, ushort3 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_ushort4(rs_allocation a, ushort4 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_int2(rs_allocation a, int2 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_int3(rs_allocation a, int3 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_int4(rs_allocation a, int4 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_uint2(rs_allocation a, uint2 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_uint3(rs_allocation a, uint3 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_uint4(rs_allocation a, uint4 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_long2(rs_allocation a, long2 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_long3(rs_allocation a, long3 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_long4(rs_allocation a, long4 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_ulong2(rs_allocation a, ulong2 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_ulong3(rs_allocation a, ulong3 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+extern void __attribute__((overloadable))
+    rsAllocationVStoreX_ulong4(rs_allocation a, ulong4 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
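+/*
+ * Usage sketch for the rsAllocationVStoreX_* stores declared above
+ * (illustrative only, not part of the generated header): writing one float4
+ * into four consecutive cells of a hypothetical 1D allocation of floats.
+ *
+ *   static void storeQuad(rs_allocation a, float4 v, uint32_t x) {
+ *       // Stores v.x at a[x], v.y at a[x+1], v.z at a[x+2], v.w at a[x+3];
+ *       // the caller must ensure x + 3 stays inside the allocation.
+ *       rsAllocationVStoreX_float4(a, v, x);
+ *   }
+ */
+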
+/*
+ * rsGetElementAt: Return a cell from an allocation
+ *
+ * This function extracts a single cell from an allocation.
+ *
+ * When retrieving from a three dimensional allocation, use the x, y, z variant.
+ * Similarly, use the x, y variant for two dimensional allocations and x for the
+ * mono dimensional allocations.
+ *
+ * This function has two styles.  One returns the address of the value using a void*,
+ * the other returns the actual value, e.g. rsGetElementAt() vs. rsGetElementAt_int4().
+ * For primitive types, always use the latter as it is more efficient.
+ */
+extern const void* __attribute__((overloadable))
+    rsGetElementAt(rs_allocation a, uint32_t x);
+
+extern const void* __attribute__((overloadable))
+    rsGetElementAt(rs_allocation a, uint32_t x, uint32_t y);
+
+extern const void* __attribute__((overloadable))
+    rsGetElementAt(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+
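+/*
+ * Usage sketch (illustrative only, not part of the generated header): the two
+ * styles side by side on a hypothetical 2D allocation of floats.
+ *
+ *   static float readCell(rs_allocation a, uint32_t x, uint32_t y) {
+ *       // Typed form: preferred for primitive types, returns the value directly.
+ *       return rsGetElementAt_float(a, x, y);
+ *   }
+ *
+ *   static float readCellViaPointer(rs_allocation a, uint32_t x, uint32_t y) {
+ *       // Generic form: returns a pointer to the cell, so it needs a cast.
+ *       return *(const float *)rsGetElementAt(a, x, y);
+ *   }
+ */
+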
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline float __attribute__((overloadable))
+    rsGetElementAt_float(rs_allocation a, uint32_t x) {
+    return ((float *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline float2 __attribute__((overloadable))
+    rsGetElementAt_float2(rs_allocation a, uint32_t x) {
+    return ((float2 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline float3 __attribute__((overloadable))
+    rsGetElementAt_float3(rs_allocation a, uint32_t x) {
+    return ((float3 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline float4 __attribute__((overloadable))
+    rsGetElementAt_float4(rs_allocation a, uint32_t x) {
+    return ((float4 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline double __attribute__((overloadable))
+    rsGetElementAt_double(rs_allocation a, uint32_t x) {
+    return ((double *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline double2 __attribute__((overloadable))
+    rsGetElementAt_double2(rs_allocation a, uint32_t x) {
+    return ((double2 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline double3 __attribute__((overloadable))
+    rsGetElementAt_double3(rs_allocation a, uint32_t x) {
+    return ((double3 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline double4 __attribute__((overloadable))
+    rsGetElementAt_double4(rs_allocation a, uint32_t x) {
+    return ((double4 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline char __attribute__((overloadable))
+    rsGetElementAt_char(rs_allocation a, uint32_t x) {
+    return ((char *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline char2 __attribute__((overloadable))
+    rsGetElementAt_char2(rs_allocation a, uint32_t x) {
+    return ((char2 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline char3 __attribute__((overloadable))
+    rsGetElementAt_char3(rs_allocation a, uint32_t x) {
+    return ((char3 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline char4 __attribute__((overloadable))
+    rsGetElementAt_char4(rs_allocation a, uint32_t x) {
+    return ((char4 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uchar __attribute__((overloadable))
+    rsGetElementAt_uchar(rs_allocation a, uint32_t x) {
+    return ((uchar *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uchar2 __attribute__((overloadable))
+    rsGetElementAt_uchar2(rs_allocation a, uint32_t x) {
+    return ((uchar2 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uchar3 __attribute__((overloadable))
+    rsGetElementAt_uchar3(rs_allocation a, uint32_t x) {
+    return ((uchar3 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uchar4 __attribute__((overloadable))
+    rsGetElementAt_uchar4(rs_allocation a, uint32_t x) {
+    return ((uchar4 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline short __attribute__((overloadable))
+    rsGetElementAt_short(rs_allocation a, uint32_t x) {
+    return ((short *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline short2 __attribute__((overloadable))
+    rsGetElementAt_short2(rs_allocation a, uint32_t x) {
+    return ((short2 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline short3 __attribute__((overloadable))
+    rsGetElementAt_short3(rs_allocation a, uint32_t x) {
+    return ((short3 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline short4 __attribute__((overloadable))
+    rsGetElementAt_short4(rs_allocation a, uint32_t x) {
+    return ((short4 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ushort __attribute__((overloadable))
+    rsGetElementAt_ushort(rs_allocation a, uint32_t x) {
+    return ((ushort *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ushort2 __attribute__((overloadable))
+    rsGetElementAt_ushort2(rs_allocation a, uint32_t x) {
+    return ((ushort2 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ushort3 __attribute__((overloadable))
+    rsGetElementAt_ushort3(rs_allocation a, uint32_t x) {
+    return ((ushort3 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ushort4 __attribute__((overloadable))
+    rsGetElementAt_ushort4(rs_allocation a, uint32_t x) {
+    return ((ushort4 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline int __attribute__((overloadable))
+    rsGetElementAt_int(rs_allocation a, uint32_t x) {
+    return ((int *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline int2 __attribute__((overloadable))
+    rsGetElementAt_int2(rs_allocation a, uint32_t x) {
+    return ((int2 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline int3 __attribute__((overloadable))
+    rsGetElementAt_int3(rs_allocation a, uint32_t x) {
+    return ((int3 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline int4 __attribute__((overloadable))
+    rsGetElementAt_int4(rs_allocation a, uint32_t x) {
+    return ((int4 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uint __attribute__((overloadable))
+    rsGetElementAt_uint(rs_allocation a, uint32_t x) {
+    return ((uint *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uint2 __attribute__((overloadable))
+    rsGetElementAt_uint2(rs_allocation a, uint32_t x) {
+    return ((uint2 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uint3 __attribute__((overloadable))
+    rsGetElementAt_uint3(rs_allocation a, uint32_t x) {
+    return ((uint3 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uint4 __attribute__((overloadable))
+    rsGetElementAt_uint4(rs_allocation a, uint32_t x) {
+    return ((uint4 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline long __attribute__((overloadable))
+    rsGetElementAt_long(rs_allocation a, uint32_t x) {
+    return ((long *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline long2 __attribute__((overloadable))
+    rsGetElementAt_long2(rs_allocation a, uint32_t x) {
+    return ((long2 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline long3 __attribute__((overloadable))
+    rsGetElementAt_long3(rs_allocation a, uint32_t x) {
+    return ((long3 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline long4 __attribute__((overloadable))
+    rsGetElementAt_long4(rs_allocation a, uint32_t x) {
+    return ((long4 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ulong __attribute__((overloadable))
+    rsGetElementAt_ulong(rs_allocation a, uint32_t x) {
+    return ((ulong *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ulong2 __attribute__((overloadable))
+    rsGetElementAt_ulong2(rs_allocation a, uint32_t x) {
+    return ((ulong2 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ulong3 __attribute__((overloadable))
+    rsGetElementAt_ulong3(rs_allocation a, uint32_t x) {
+    return ((ulong3 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ulong4 __attribute__((overloadable))
+    rsGetElementAt_ulong4(rs_allocation a, uint32_t x) {
+    return ((ulong4 *)rsGetElementAt(a, x))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline float __attribute__((overloadable))
+    rsGetElementAt_float(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((float *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline float2 __attribute__((overloadable))
+    rsGetElementAt_float2(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((float2 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline float3 __attribute__((overloadable))
+    rsGetElementAt_float3(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((float3 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline float4 __attribute__((overloadable))
+    rsGetElementAt_float4(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((float4 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline double __attribute__((overloadable))
+    rsGetElementAt_double(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((double *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline double2 __attribute__((overloadable))
+    rsGetElementAt_double2(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((double2 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline double3 __attribute__((overloadable))
+    rsGetElementAt_double3(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((double3 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline double4 __attribute__((overloadable))
+    rsGetElementAt_double4(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((double4 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline char __attribute__((overloadable))
+    rsGetElementAt_char(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((char *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline char2 __attribute__((overloadable))
+    rsGetElementAt_char2(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((char2 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline char3 __attribute__((overloadable))
+    rsGetElementAt_char3(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((char3 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline char4 __attribute__((overloadable))
+    rsGetElementAt_char4(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((char4 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uchar __attribute__((overloadable))
+    rsGetElementAt_uchar(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((uchar *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uchar2 __attribute__((overloadable))
+    rsGetElementAt_uchar2(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((uchar2 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uchar3 __attribute__((overloadable))
+    rsGetElementAt_uchar3(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((uchar3 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uchar4 __attribute__((overloadable))
+    rsGetElementAt_uchar4(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((uchar4 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline short __attribute__((overloadable))
+    rsGetElementAt_short(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((short *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline short2 __attribute__((overloadable))
+    rsGetElementAt_short2(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((short2 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline short3 __attribute__((overloadable))
+    rsGetElementAt_short3(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((short3 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline short4 __attribute__((overloadable))
+    rsGetElementAt_short4(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((short4 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ushort __attribute__((overloadable))
+    rsGetElementAt_ushort(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((ushort *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ushort2 __attribute__((overloadable))
+    rsGetElementAt_ushort2(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((ushort2 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ushort3 __attribute__((overloadable))
+    rsGetElementAt_ushort3(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((ushort3 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ushort4 __attribute__((overloadable))
+    rsGetElementAt_ushort4(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((ushort4 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline int __attribute__((overloadable))
+    rsGetElementAt_int(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((int *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline int2 __attribute__((overloadable))
+    rsGetElementAt_int2(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((int2 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline int3 __attribute__((overloadable))
+    rsGetElementAt_int3(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((int3 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline int4 __attribute__((overloadable))
+    rsGetElementAt_int4(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((int4 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uint __attribute__((overloadable))
+    rsGetElementAt_uint(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((uint *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uint2 __attribute__((overloadable))
+    rsGetElementAt_uint2(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((uint2 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uint3 __attribute__((overloadable))
+    rsGetElementAt_uint3(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((uint3 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uint4 __attribute__((overloadable))
+    rsGetElementAt_uint4(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((uint4 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline long __attribute__((overloadable))
+    rsGetElementAt_long(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((long *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline long2 __attribute__((overloadable))
+    rsGetElementAt_long2(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((long2 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline long3 __attribute__((overloadable))
+    rsGetElementAt_long3(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((long3 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline long4 __attribute__((overloadable))
+    rsGetElementAt_long4(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((long4 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ulong __attribute__((overloadable))
+    rsGetElementAt_ulong(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((ulong *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ulong2 __attribute__((overloadable))
+    rsGetElementAt_ulong2(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((ulong2 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ulong3 __attribute__((overloadable))
+    rsGetElementAt_ulong3(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((ulong3 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ulong4 __attribute__((overloadable))
+    rsGetElementAt_ulong4(rs_allocation a, uint32_t x, uint32_t y) {
+    return ((ulong4 *)rsGetElementAt(a, x, y))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline float __attribute__((overloadable))
+    rsGetElementAt_float(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((float *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline float2 __attribute__((overloadable))
+    rsGetElementAt_float2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((float2 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline float3 __attribute__((overloadable))
+    rsGetElementAt_float3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((float3 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline float4 __attribute__((overloadable))
+    rsGetElementAt_float4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((float4 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline double __attribute__((overloadable))
+    rsGetElementAt_double(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((double *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline double2 __attribute__((overloadable))
+    rsGetElementAt_double2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((double2 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline double3 __attribute__((overloadable))
+    rsGetElementAt_double3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((double3 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline double4 __attribute__((overloadable))
+    rsGetElementAt_double4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((double4 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline char __attribute__((overloadable))
+    rsGetElementAt_char(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((char *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline char2 __attribute__((overloadable))
+    rsGetElementAt_char2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((char2 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline char3 __attribute__((overloadable))
+    rsGetElementAt_char3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((char3 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline char4 __attribute__((overloadable))
+    rsGetElementAt_char4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((char4 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uchar __attribute__((overloadable))
+    rsGetElementAt_uchar(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((uchar *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uchar2 __attribute__((overloadable))
+    rsGetElementAt_uchar2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((uchar2 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uchar3 __attribute__((overloadable))
+    rsGetElementAt_uchar3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((uchar3 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uchar4 __attribute__((overloadable))
+    rsGetElementAt_uchar4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((uchar4 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline short __attribute__((overloadable))
+    rsGetElementAt_short(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((short *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline short2 __attribute__((overloadable))
+    rsGetElementAt_short2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((short2 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline short3 __attribute__((overloadable))
+    rsGetElementAt_short3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((short3 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline short4 __attribute__((overloadable))
+    rsGetElementAt_short4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((short4 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ushort __attribute__((overloadable))
+    rsGetElementAt_ushort(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((ushort *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ushort2 __attribute__((overloadable))
+    rsGetElementAt_ushort2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((ushort2 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ushort3 __attribute__((overloadable))
+    rsGetElementAt_ushort3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((ushort3 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ushort4 __attribute__((overloadable))
+    rsGetElementAt_ushort4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((ushort4 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline int __attribute__((overloadable))
+    rsGetElementAt_int(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((int *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline int2 __attribute__((overloadable))
+    rsGetElementAt_int2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((int2 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline int3 __attribute__((overloadable))
+    rsGetElementAt_int3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((int3 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline int4 __attribute__((overloadable))
+    rsGetElementAt_int4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((int4 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uint __attribute__((overloadable))
+    rsGetElementAt_uint(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((uint *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uint2 __attribute__((overloadable))
+    rsGetElementAt_uint2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((uint2 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uint3 __attribute__((overloadable))
+    rsGetElementAt_uint3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((uint3 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline uint4 __attribute__((overloadable))
+    rsGetElementAt_uint4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((uint4 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline long __attribute__((overloadable))
+    rsGetElementAt_long(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((long *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline long2 __attribute__((overloadable))
+    rsGetElementAt_long2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((long2 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline long3 __attribute__((overloadable))
+    rsGetElementAt_long3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((long3 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline long4 __attribute__((overloadable))
+    rsGetElementAt_long4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((long4 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ulong __attribute__((overloadable))
+    rsGetElementAt_ulong(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((ulong *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ulong2 __attribute__((overloadable))
+    rsGetElementAt_ulong2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((ulong2 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ulong3 __attribute__((overloadable))
+    rsGetElementAt_ulong3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((ulong3 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 17)
+static inline ulong4 __attribute__((overloadable))
+    rsGetElementAt_ulong4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
+    return ((ulong4 *)rsGetElementAt(a, x, y, z))[0];
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float __attribute__((overloadable))
+    rsGetElementAt_float(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float2 __attribute__((overloadable))
+    rsGetElementAt_float2(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float3 __attribute__((overloadable))
+    rsGetElementAt_float3(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float4 __attribute__((overloadable))
+    rsGetElementAt_float4(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern double __attribute__((overloadable))
+    rsGetElementAt_double(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern double2 __attribute__((overloadable))
+    rsGetElementAt_double2(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern double3 __attribute__((overloadable))
+    rsGetElementAt_double3(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern double4 __attribute__((overloadable))
+    rsGetElementAt_double4(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern char __attribute__((overloadable))
+    rsGetElementAt_char(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern char2 __attribute__((overloadable))
+    rsGetElementAt_char2(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern char3 __attribute__((overloadable))
+    rsGetElementAt_char3(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern char4 __attribute__((overloadable))
+    rsGetElementAt_char4(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uchar __attribute__((overloadable))
+    rsGetElementAt_uchar(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uchar2 __attribute__((overloadable))
+    rsGetElementAt_uchar2(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uchar3 __attribute__((overloadable))
+    rsGetElementAt_uchar3(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uchar4 __attribute__((overloadable))
+    rsGetElementAt_uchar4(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern short __attribute__((overloadable))
+    rsGetElementAt_short(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern short2 __attribute__((overloadable))
+    rsGetElementAt_short2(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern short3 __attribute__((overloadable))
+    rsGetElementAt_short3(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern short4 __attribute__((overloadable))
+    rsGetElementAt_short4(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ushort __attribute__((overloadable))
+    rsGetElementAt_ushort(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ushort2 __attribute__((overloadable))
+    rsGetElementAt_ushort2(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ushort3 __attribute__((overloadable))
+    rsGetElementAt_ushort3(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ushort4 __attribute__((overloadable))
+    rsGetElementAt_ushort4(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern int __attribute__((overloadable))
+    rsGetElementAt_int(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern int2 __attribute__((overloadable))
+    rsGetElementAt_int2(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern int3 __attribute__((overloadable))
+    rsGetElementAt_int3(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern int4 __attribute__((overloadable))
+    rsGetElementAt_int4(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uint __attribute__((overloadable))
+    rsGetElementAt_uint(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uint2 __attribute__((overloadable))
+    rsGetElementAt_uint2(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uint3 __attribute__((overloadable))
+    rsGetElementAt_uint3(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uint4 __attribute__((overloadable))
+    rsGetElementAt_uint4(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern long __attribute__((overloadable))
+    rsGetElementAt_long(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern long2 __attribute__((overloadable))
+    rsGetElementAt_long2(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern long3 __attribute__((overloadable))
+    rsGetElementAt_long3(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern long4 __attribute__((overloadable))
+    rsGetElementAt_long4(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ulong __attribute__((overloadable))
+    rsGetElementAt_ulong(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ulong2 __attribute__((overloadable))
+    rsGetElementAt_ulong2(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ulong3 __attribute__((overloadable))
+    rsGetElementAt_ulong3(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ulong4 __attribute__((overloadable))
+    rsGetElementAt_ulong4(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float __attribute__((overloadable))
+    rsGetElementAt_float(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float2 __attribute__((overloadable))
+    rsGetElementAt_float2(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float3 __attribute__((overloadable))
+    rsGetElementAt_float3(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float4 __attribute__((overloadable))
+    rsGetElementAt_float4(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern double __attribute__((overloadable))
+    rsGetElementAt_double(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern double2 __attribute__((overloadable))
+    rsGetElementAt_double2(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern double3 __attribute__((overloadable))
+    rsGetElementAt_double3(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern double4 __attribute__((overloadable))
+    rsGetElementAt_double4(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern char __attribute__((overloadable))
+    rsGetElementAt_char(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern char2 __attribute__((overloadable))
+    rsGetElementAt_char2(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern char3 __attribute__((overloadable))
+    rsGetElementAt_char3(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern char4 __attribute__((overloadable))
+    rsGetElementAt_char4(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uchar __attribute__((overloadable))
+    rsGetElementAt_uchar(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uchar2 __attribute__((overloadable))
+    rsGetElementAt_uchar2(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uchar3 __attribute__((overloadable))
+    rsGetElementAt_uchar3(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uchar4 __attribute__((overloadable))
+    rsGetElementAt_uchar4(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern short __attribute__((overloadable))
+    rsGetElementAt_short(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern short2 __attribute__((overloadable))
+    rsGetElementAt_short2(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern short3 __attribute__((overloadable))
+    rsGetElementAt_short3(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern short4 __attribute__((overloadable))
+    rsGetElementAt_short4(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ushort __attribute__((overloadable))
+    rsGetElementAt_ushort(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ushort2 __attribute__((overloadable))
+    rsGetElementAt_ushort2(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ushort3 __attribute__((overloadable))
+    rsGetElementAt_ushort3(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ushort4 __attribute__((overloadable))
+    rsGetElementAt_ushort4(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern int __attribute__((overloadable))
+    rsGetElementAt_int(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern int2 __attribute__((overloadable))
+    rsGetElementAt_int2(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern int3 __attribute__((overloadable))
+    rsGetElementAt_int3(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern int4 __attribute__((overloadable))
+    rsGetElementAt_int4(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uint __attribute__((overloadable))
+    rsGetElementAt_uint(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uint2 __attribute__((overloadable))
+    rsGetElementAt_uint2(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uint3 __attribute__((overloadable))
+    rsGetElementAt_uint3(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uint4 __attribute__((overloadable))
+    rsGetElementAt_uint4(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern long __attribute__((overloadable))
+    rsGetElementAt_long(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern long2 __attribute__((overloadable))
+    rsGetElementAt_long2(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern long3 __attribute__((overloadable))
+    rsGetElementAt_long3(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern long4 __attribute__((overloadable))
+    rsGetElementAt_long4(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ulong __attribute__((overloadable))
+    rsGetElementAt_ulong(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ulong2 __attribute__((overloadable))
+    rsGetElementAt_ulong2(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ulong3 __attribute__((overloadable))
+    rsGetElementAt_ulong3(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ulong4 __attribute__((overloadable))
+    rsGetElementAt_ulong4(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float __attribute__((overloadable))
+    rsGetElementAt_float(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float2 __attribute__((overloadable))
+    rsGetElementAt_float2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float3 __attribute__((overloadable))
+    rsGetElementAt_float3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float4 __attribute__((overloadable))
+    rsGetElementAt_float4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern double __attribute__((overloadable))
+    rsGetElementAt_double(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern double2 __attribute__((overloadable))
+    rsGetElementAt_double2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern double3 __attribute__((overloadable))
+    rsGetElementAt_double3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern double4 __attribute__((overloadable))
+    rsGetElementAt_double4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern char __attribute__((overloadable))
+    rsGetElementAt_char(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern char2 __attribute__((overloadable))
+    rsGetElementAt_char2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern char3 __attribute__((overloadable))
+    rsGetElementAt_char3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern char4 __attribute__((overloadable))
+    rsGetElementAt_char4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uchar __attribute__((overloadable))
+    rsGetElementAt_uchar(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uchar2 __attribute__((overloadable))
+    rsGetElementAt_uchar2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uchar3 __attribute__((overloadable))
+    rsGetElementAt_uchar3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uchar4 __attribute__((overloadable))
+    rsGetElementAt_uchar4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern short __attribute__((overloadable))
+    rsGetElementAt_short(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern short2 __attribute__((overloadable))
+    rsGetElementAt_short2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern short3 __attribute__((overloadable))
+    rsGetElementAt_short3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern short4 __attribute__((overloadable))
+    rsGetElementAt_short4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ushort __attribute__((overloadable))
+    rsGetElementAt_ushort(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ushort2 __attribute__((overloadable))
+    rsGetElementAt_ushort2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ushort3 __attribute__((overloadable))
+    rsGetElementAt_ushort3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ushort4 __attribute__((overloadable))
+    rsGetElementAt_ushort4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern int __attribute__((overloadable))
+    rsGetElementAt_int(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern int2 __attribute__((overloadable))
+    rsGetElementAt_int2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern int3 __attribute__((overloadable))
+    rsGetElementAt_int3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern int4 __attribute__((overloadable))
+    rsGetElementAt_int4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uint __attribute__((overloadable))
+    rsGetElementAt_uint(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uint2 __attribute__((overloadable))
+    rsGetElementAt_uint2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uint3 __attribute__((overloadable))
+    rsGetElementAt_uint3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uint4 __attribute__((overloadable))
+    rsGetElementAt_uint4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern long __attribute__((overloadable))
+    rsGetElementAt_long(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern long2 __attribute__((overloadable))
+    rsGetElementAt_long2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern long3 __attribute__((overloadable))
+    rsGetElementAt_long3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern long4 __attribute__((overloadable))
+    rsGetElementAt_long4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ulong __attribute__((overloadable))
+    rsGetElementAt_ulong(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ulong2 __attribute__((overloadable))
+    rsGetElementAt_ulong2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ulong3 __attribute__((overloadable))
+    rsGetElementAt_ulong3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern ulong4 __attribute__((overloadable))
+    rsGetElementAt_ulong4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern half __attribute__((overloadable))
+    rsGetElementAt_half(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern half2 __attribute__((overloadable))
+    rsGetElementAt_half2(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern half3 __attribute__((overloadable))
+    rsGetElementAt_half3(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern half4 __attribute__((overloadable))
+    rsGetElementAt_half4(rs_allocation a, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern half __attribute__((overloadable))
+    rsGetElementAt_half(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern half2 __attribute__((overloadable))
+    rsGetElementAt_half2(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern half3 __attribute__((overloadable))
+    rsGetElementAt_half3(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern half4 __attribute__((overloadable))
+    rsGetElementAt_half4(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern half __attribute__((overloadable))
+    rsGetElementAt_half(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern half2 __attribute__((overloadable))
+    rsGetElementAt_half2(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern half3 __attribute__((overloadable))
+    rsGetElementAt_half3(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern half4 __attribute__((overloadable))
+    rsGetElementAt_half4(rs_allocation a, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+/*
+ * rsGetElementAtYuv_uchar_U: Get the U component of an allocation of YUVs
+ *
+ * Extracts the U component of a single YUV value from a 2D allocation of YUVs.
+ *
+ * Inside an allocation, Y, U, and V components may be stored in different planes
+ * and at different resolutions.  The x, y coordinates provided here are in the
+ * dimensions of the Y plane.
+ *
+ * See rsGetElementAtYuv_uchar_Y().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uchar __attribute__((overloadable))
+    rsGetElementAtYuv_uchar_U(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+/*
+ * rsGetElementAtYuv_uchar_V: Get the V component of an allocation of YUVs
+ *
+ * Extracts the V component of a single YUV value from a 2D allocation of YUVs.
+ *
+ * Inside an allocation, Y, U, and V components may be stored in different planes
+ * and at different resolutions.  The x, y coordinates provided here are in the
+ * dimensions of the Y plane.
+ *
+ * See rsGetElementAtYuv_uchar_Y().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uchar __attribute__((overloadable))
+    rsGetElementAtYuv_uchar_V(rs_allocation a, uint32_t x, uint32_t y);
+#endif
+
+/*
+ * rsGetElementAtYuv_uchar_Y: Get the Y component of an allocation of YUVs
+ *
+ * Extracts the Y component of a single YUV value from a 2D allocation of YUVs.
+ *
+ * Inside an allocation, Y, U, and V components may be stored in different planes
+ * and at different resolutions.  The x, y coordinates provided here are in the
+ * dimensions of the Y plane.
+ *
+ * See rsGetElementAtYuv_uchar_U() and rsGetElementAtYuv_uchar_V().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern uchar __attribute__((overloadable))
+    rsGetElementAtYuv_uchar_Y(rs_allocation a, uint32_t x, uint32_t y);
+#endif
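+
+/*
+ * Usage sketch (editorial addition, not part of the original header): reading the
+ * three components of one YUV pixel from a 2D YUV allocation.  The helper name and
+ * allocation are hypothetical; x and y are in the dimensions of the Y plane, as
+ * described above.
+ *
+ *   #if (defined(RS_VERSION) && (RS_VERSION >= 18))
+ *   static uchar3 readYuvPixel(rs_allocation yuvAlloc, uint32_t x, uint32_t y) {
+ *       uchar3 yuv;
+ *       yuv.x = rsGetElementAtYuv_uchar_Y(yuvAlloc, x, y);  // luma
+ *       yuv.y = rsGetElementAtYuv_uchar_U(yuvAlloc, x, y);  // chroma U
+ *       yuv.z = rsGetElementAtYuv_uchar_V(yuvAlloc, x, y);  // chroma V
+ *       return yuv;
+ *   }
+ *   #endif
+ */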
+
+/*
+ * rsSample: Sample a value from a texture allocation
+ *
+ * Fetches a value from a texture allocation in a way described by the sampler.
+ *
+ * If your allocation is 1D, use the variant with float for location.  For 2D,
+ * use the float2 variant.
+ *
+ * See android.renderscript.Sampler for more details.
+ *
+ * Parameters:
+ *   a: Allocation to sample from.
+ *   s: Sampler state.
+ *   location: Location to sample from.
+ *   lod: Mip level to sample from; for fractional values, mip levels will be interpolated if RS_SAMPLER_LINEAR_MIP_LINEAR is used.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+extern float4 __attribute__((overloadable))
+    rsSample(rs_allocation a, rs_sampler s, float location);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+extern float4 __attribute__((overloadable))
+    rsSample(rs_allocation a, rs_sampler s, float location, float lod);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+extern float4 __attribute__((overloadable))
+    rsSample(rs_allocation a, rs_sampler s, float2 location);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+extern float4 __attribute__((overloadable))
+    rsSample(rs_allocation a, rs_sampler s, float2 location, float lod);
+#endif
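+
+/*
+ * Usage sketch (editorial addition, not part of the original header): sampling a 2D
+ * texture allocation through a sampler.  The globals are hypothetical and assumed to
+ * be bound from the Java side; the float2 location is assumed to be a normalized
+ * texture coordinate.
+ *
+ *   #if (defined(RS_VERSION) && (RS_VERSION >= 16))
+ *   rs_allocation gTexture;   // 2D allocation used as a texture
+ *   rs_sampler gLinearClamp;  // e.g. created with Sampler.CLAMP_LINEAR() in Java
+ *
+ *   static float4 sampleCenter(void) {
+ *       float2 location = {0.5f, 0.5f};
+ *       return rsSample(gTexture, gLinearClamp, location);  // float2 variant for 2D
+ *   }
+ *   #endif
+ */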
+
+/*
+ * rsSetElementAt: Set a cell of an allocation
+ *
+ * This function stores a value into a single cell of an allocation.
+ *
+ * When storing into a three dimensional allocation, use the x, y, z variant.
+ * Similarly, use the x, y variant for two dimensional allocations and x for
+ * one dimensional allocations.
+ *
+ * This function has two styles.  One passes the value to be stored using a void*;
+ * the other takes the actual value as an argument, e.g. rsSetElementAt() vs.
+ * rsSetElementAt_int4().  For primitive types, always use the latter as it is
+ * more efficient.
+ *
+ * See also rsGetElementAt().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt(rs_allocation a, void* ptr, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt(rs_allocation a, void* ptr, uint32_t x, uint32_t y);
+#endif
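+
+/*
+ * Usage sketch (editorial addition, not part of the original header): the two styles
+ * described above, storing a single int4 cell.  The helper and allocation names are
+ * hypothetical; per the note above, prefer the typed variant for primitive types.
+ *
+ *   #if (defined(RS_VERSION) && (RS_VERSION >= 18))
+ *   static void storeCell(rs_allocation out, uint32_t x) {
+ *       int4 v = {1, 2, 3, 4};
+ *       rsSetElementAt_int4(out, v, x);  // typed style: more efficient
+ *       rsSetElementAt(out, &v, x);      // generic void* style: same effect here
+ *   }
+ *   #endif
+ */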
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_float(rs_allocation a, float val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_float2(rs_allocation a, float2 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_float3(rs_allocation a, float3 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_float4(rs_allocation a, float4 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_double(rs_allocation a, double val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_double2(rs_allocation a, double2 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_double3(rs_allocation a, double3 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_double4(rs_allocation a, double4 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_char(rs_allocation a, char val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_char2(rs_allocation a, char2 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_char3(rs_allocation a, char3 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_char4(rs_allocation a, char4 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uchar(rs_allocation a, uchar val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uchar2(rs_allocation a, uchar2 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uchar3(rs_allocation a, uchar3 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uchar4(rs_allocation a, uchar4 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_short(rs_allocation a, short val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_short2(rs_allocation a, short2 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_short3(rs_allocation a, short3 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_short4(rs_allocation a, short4 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ushort(rs_allocation a, ushort val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ushort2(rs_allocation a, ushort2 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ushort3(rs_allocation a, ushort3 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ushort4(rs_allocation a, ushort4 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_int(rs_allocation a, int val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_int2(rs_allocation a, int2 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_int3(rs_allocation a, int3 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_int4(rs_allocation a, int4 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uint(rs_allocation a, uint val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uint2(rs_allocation a, uint2 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uint3(rs_allocation a, uint3 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uint4(rs_allocation a, uint4 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_long(rs_allocation a, long val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_long2(rs_allocation a, long2 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_long3(rs_allocation a, long3 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_long4(rs_allocation a, long4 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ulong(rs_allocation a, ulong val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ulong2(rs_allocation a, ulong2 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ulong3(rs_allocation a, ulong3 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ulong4(rs_allocation a, ulong4 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_float(rs_allocation a, float val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_float2(rs_allocation a, float2 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_float3(rs_allocation a, float3 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_float4(rs_allocation a, float4 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_double(rs_allocation a, double val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_double2(rs_allocation a, double2 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_double3(rs_allocation a, double3 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_double4(rs_allocation a, double4 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_char(rs_allocation a, char val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_char2(rs_allocation a, char2 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_char3(rs_allocation a, char3 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_char4(rs_allocation a, char4 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uchar(rs_allocation a, uchar val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uchar2(rs_allocation a, uchar2 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uchar3(rs_allocation a, uchar3 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uchar4(rs_allocation a, uchar4 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_short(rs_allocation a, short val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_short2(rs_allocation a, short2 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_short3(rs_allocation a, short3 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_short4(rs_allocation a, short4 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ushort(rs_allocation a, ushort val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ushort2(rs_allocation a, ushort2 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ushort3(rs_allocation a, ushort3 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ushort4(rs_allocation a, ushort4 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_int(rs_allocation a, int val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_int2(rs_allocation a, int2 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_int3(rs_allocation a, int3 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_int4(rs_allocation a, int4 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uint(rs_allocation a, uint val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uint2(rs_allocation a, uint2 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uint3(rs_allocation a, uint3 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uint4(rs_allocation a, uint4 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_long(rs_allocation a, long val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_long2(rs_allocation a, long2 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_long3(rs_allocation a, long3 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_long4(rs_allocation a, long4 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ulong(rs_allocation a, ulong val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ulong2(rs_allocation a, ulong2 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ulong3(rs_allocation a, ulong3 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ulong4(rs_allocation a, ulong4 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_float(rs_allocation a, float val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_float2(rs_allocation a, float2 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_float3(rs_allocation a, float3 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_float4(rs_allocation a, float4 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_double(rs_allocation a, double val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_double2(rs_allocation a, double2 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_double3(rs_allocation a, double3 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_double4(rs_allocation a, double4 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_char(rs_allocation a, char val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_char2(rs_allocation a, char2 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_char3(rs_allocation a, char3 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_char4(rs_allocation a, char4 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uchar(rs_allocation a, uchar val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uchar2(rs_allocation a, uchar2 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uchar3(rs_allocation a, uchar3 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uchar4(rs_allocation a, uchar4 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_short(rs_allocation a, short val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_short2(rs_allocation a, short2 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_short3(rs_allocation a, short3 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_short4(rs_allocation a, short4 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ushort(rs_allocation a, ushort val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ushort2(rs_allocation a, ushort2 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ushort3(rs_allocation a, ushort3 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ushort4(rs_allocation a, ushort4 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_int(rs_allocation a, int val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_int2(rs_allocation a, int2 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_int3(rs_allocation a, int3 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_int4(rs_allocation a, int4 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uint(rs_allocation a, uint val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uint2(rs_allocation a, uint2 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uint3(rs_allocation a, uint3 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_uint4(rs_allocation a, uint4 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_long(rs_allocation a, long val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_long2(rs_allocation a, long2 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_long3(rs_allocation a, long3 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_long4(rs_allocation a, long4 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ulong(rs_allocation a, ulong val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ulong2(rs_allocation a, ulong2 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ulong3(rs_allocation a, ulong3 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern void __attribute__((overloadable))
+    rsSetElementAt_ulong4(rs_allocation a, ulong4 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern void __attribute__((overloadable))
+    rsSetElementAt_half(rs_allocation a, half val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern void __attribute__((overloadable))
+    rsSetElementAt_half2(rs_allocation a, half2 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern void __attribute__((overloadable))
+    rsSetElementAt_half3(rs_allocation a, half3 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern void __attribute__((overloadable))
+    rsSetElementAt_half4(rs_allocation a, half4 val, uint32_t x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern void __attribute__((overloadable))
+    rsSetElementAt_half(rs_allocation a, half val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern void __attribute__((overloadable))
+    rsSetElementAt_half2(rs_allocation a, half2 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern void __attribute__((overloadable))
+    rsSetElementAt_half3(rs_allocation a, half3 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern void __attribute__((overloadable))
+    rsSetElementAt_half4(rs_allocation a, half4 val, uint32_t x, uint32_t y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern void __attribute__((overloadable))
+    rsSetElementAt_half(rs_allocation a, half val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern void __attribute__((overloadable))
+    rsSetElementAt_half2(rs_allocation a, half2 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern void __attribute__((overloadable))
+    rsSetElementAt_half3(rs_allocation a, half3 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern void __attribute__((overloadable))
+    rsSetElementAt_half4(rs_allocation a, half4 val, uint32_t x, uint32_t y, uint32_t z);
+#endif
+
+#endif // RENDERSCRIPT_RS_ALLOCATION_DATA_RSH
diff --git a/25.0.2/include/rs_atomic.rsh b/25.0.2/include/rs_atomic.rsh
new file mode 100644
index 0000000..98a8784
--- /dev/null
+++ b/25.0.2/include/rs_atomic.rsh
@@ -0,0 +1,257 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Don't edit this file!  It is auto-generated by frameworks/rs/api/generate.sh.
+
+/*
+ * rs_atomic.rsh: Atomic Update Functions
+ *
+ * To update values shared between multiple threads, use the functions below.
+ * They ensure that the values are atomically updated, i.e. that the memory
+ * reads, the updates, and the memory writes are done in the right order.
+ *
+ * These functions are slower than their non-atomic equivalents, so use
+ * them only when synchronization is needed.
+ *
+ * Note that in RenderScript, your code is likely to be running in separate
+ * threads even though you did not explicitly create them.  The RenderScript
+ * runtime will very often split the execution of one kernel across multiple
+ * threads.  Updating globals should be done with atomic functions.  If possible,
+ * modify your algorithm to avoid them altogether.
+ */
+
+#ifndef RENDERSCRIPT_RS_ATOMIC_RSH
+#define RENDERSCRIPT_RS_ATOMIC_RSH
+
+/*
+ * rsAtomicAdd: Thread-safe addition
+ *
+ * Atomically adds a value to the value at addr, i.e. *addr += value.
+ *
+ * Parameters:
+ *   addr: Address of the value to modify.
+ *   value: Amount to add.
+ *
+ * Returns: Value of *addr prior to the operation.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+extern int32_t __attribute__((overloadable))
+    rsAtomicAdd(volatile int32_t* addr, int32_t value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+extern int32_t __attribute__((overloadable))
+    rsAtomicAdd(volatile uint32_t* addr, uint32_t value);
+#endif
+
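+/*
+ * A minimal usage sketch (not part of the generated API above; the global
+ * "count" and the kernel name are illustrative only).  Because the runtime
+ * may run a kernel on several threads at once, the shared global must be
+ * updated with rsAtomicAdd() rather than with "count += 1":
+ *
+ *   volatile int32_t count = 0;
+ *
+ *   uchar RS_KERNEL tally(uchar in) {
+ *       if (in > 127) {
+ *           rsAtomicAdd(&count, 1);  // returns the old value of count
+ *       }
+ *       return in;
+ *   }
+ */
+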
+/*
+ * rsAtomicAnd: Thread-safe bitwise and
+ *
+ * Atomically performs a bitwise and of two values, storing the result back at addr,
+ * i.e. *addr &= value.
+ *
+ * Parameters:
+ *   addr: Address of the value to modify.
+ *   value: Value to and with.
+ *
+ * Returns: Value of *addr prior to the operation.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+extern int32_t __attribute__((overloadable))
+    rsAtomicAnd(volatile int32_t* addr, int32_t value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+extern int32_t __attribute__((overloadable))
+    rsAtomicAnd(volatile uint32_t* addr, uint32_t value);
+#endif
+
+/*
+ * rsAtomicCas: Thread-safe compare and set
+ *
+ * If the value at addr matches compareValue then the newValue is written at addr,
+ * i.e. if (*addr == compareValue) { *addr = newValue; }.
+ *
+ * You can check that the value was written by checking that the value returned
+ * by rsAtomicCas() is compareValue.
+ *
+ * Parameters:
+ *   addr: Address of the value to compare and replace if the test passes.
+ *   compareValue: Value to test *addr against.
+ *   newValue: Value to write if the test passes.
+ *
+ * Returns: Value of *addr prior to the operation.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+extern int32_t __attribute__((overloadable))
+    rsAtomicCas(volatile int32_t* addr, int32_t compareValue, int32_t newValue);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+extern uint32_t __attribute__((overloadable))
+    rsAtomicCas(volatile uint32_t* addr, uint32_t compareValue, uint32_t newValue);
+#endif
+
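+/*
+ * A sketch of the usual retry loop built on rsAtomicCas() (illustrative only;
+ * the helper name atomicMultiply is not part of this API).  The store takes
+ * effect only when the returned value equals the value that was read, so the
+ * loop repeats until no other thread intervened:
+ *
+ *   static void atomicMultiply(volatile int32_t* addr, int32_t factor) {
+ *       int32_t old, seen;
+ *       do {
+ *           old = *addr;
+ *           seen = rsAtomicCas(addr, old, old * factor);
+ *       } while (seen != old);
+ *   }
+ */
+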
+/*
+ * rsAtomicDec: Thread-safe decrement
+ *
+ * Atomically subtracts one from the value at addr.  This is equivalent to rsAtomicSub(addr, 1).
+ *
+ * Parameters:
+ *   addr: Address of the value to decrement.
+ *
+ * Returns: Value of *addr prior to the operation.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+extern int32_t __attribute__((overloadable))
+    rsAtomicDec(volatile int32_t* addr);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+extern int32_t __attribute__((overloadable))
+    rsAtomicDec(volatile uint32_t* addr);
+#endif
+
+/*
+ * rsAtomicInc: Thread-safe increment
+ *
+ * Atomically adds one to the value at addr.  This is equivalent to rsAtomicAdd(addr, 1).
+ *
+ * Parameters:
+ *   addr: Address of the value to increment.
+ *
+ * Returns: Value of *addr prior to the operation.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+extern int32_t __attribute__((overloadable))
+    rsAtomicInc(volatile int32_t* addr);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+extern int32_t __attribute__((overloadable))
+    rsAtomicInc(volatile uint32_t* addr);
+#endif
+
+/*
+ * rsAtomicMax: Thread-safe maximum
+ *
+ * Atomically sets the value at addr to the maximum of *addr and value, i.e.
+ * *addr = max(*addr, value).
+ *
+ * Parameters:
+ *   addr: Address of the value to modify.
+ *   value: Comparison value.
+ *
+ * Returns: Value of *addr prior to the operation.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+extern uint32_t __attribute__((overloadable))
+    rsAtomicMax(volatile uint32_t* addr, uint32_t value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+extern int32_t __attribute__((overloadable))
+    rsAtomicMax(volatile int32_t* addr, int32_t value);
+#endif
+
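+/*
+ * A sketch of tracking the largest value seen across all threads (the global
+ * and kernel names are illustrative only):
+ *
+ *   volatile int32_t maxSeen = 0;
+ *
+ *   int RS_KERNEL findMax(int in) {
+ *       rsAtomicMax(&maxSeen, in);
+ *       return in;
+ *   }
+ */
+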
+/*
+ * rsAtomicMin: Thread-safe minimum
+ *
+ * Atomically sets the value at addr to the minimum of *addr and value, i.e.
+ * *addr = min(*addr, value).
+ *
+ * Parameters:
+ *   addr: Address of the value to modify.
+ *   value: Comparison value.
+ *
+ * Returns: Value of *addr prior to the operation.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+extern uint32_t __attribute__((overloadable))
+    rsAtomicMin(volatile uint32_t* addr, uint32_t value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+extern int32_t __attribute__((overloadable))
+    rsAtomicMin(volatile int32_t* addr, int32_t value);
+#endif
+
+/*
+ * rsAtomicOr: Thread-safe bitwise or
+ *
+ * Atomically performs a bitwise or of two values, storing the result at addr,
+ * i.e. *addr |= value.
+ *
+ * Parameters:
+ *   addr: Address of the value to modify.
+ *   value: Value to or with.
+ *
+ * Returns: Value of *addr prior to the operation.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+extern int32_t __attribute__((overloadable))
+    rsAtomicOr(volatile int32_t* addr, int32_t value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+extern int32_t __attribute__((overloadable))
+    rsAtomicOr(volatile uint32_t* addr, uint32_t value);
+#endif
+
+/*
+ * rsAtomicSub: Thread-safe subtraction
+ *
+ * Atomically subtracts a value from the value at addr, i.e. *addr -= value.
+ *
+ * Parameters:
+ *   addr: Address of the value to modify.
+ *   value: Amount to subtract.
+ *
+ * Returns: Value of *addr prior to the operation.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+extern int32_t __attribute__((overloadable))
+    rsAtomicSub(volatile int32_t* addr, int32_t value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+extern int32_t __attribute__((overloadable))
+    rsAtomicSub(volatile uint32_t* addr, uint32_t value);
+#endif
+
+/*
+ * rsAtomicXor: Thread-safe bitwise exclusive or
+ *
+ * Atomically performs a bitwise xor of two values, storing the result at addr,
+ * i.e. *addr ^= value.
+ *
+ * Parameters:
+ *   addr: Address of the value to modify.
+ *   value: Value to xor with.
+ *
+ * Returns: Value of *addr prior to the operation.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+extern int32_t __attribute__((overloadable))
+    rsAtomicXor(volatile int32_t* addr, int32_t value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+extern int32_t __attribute__((overloadable))
+    rsAtomicXor(volatile uint32_t* addr, uint32_t value);
+#endif
+
+#endif // RENDERSCRIPT_RS_ATOMIC_RSH
diff --git a/25.0.2/include/rs_convert.rsh b/25.0.2/include/rs_convert.rsh
new file mode 100644
index 0000000..4c318d4
--- /dev/null
+++ b/25.0.2/include/rs_convert.rsh
@@ -0,0 +1,1623 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Don't edit this file!  It is auto-generated by frameworks/rs/api/generate.sh.
+
+/*
+ * rs_convert.rsh: Conversion Functions
+ *
+ * The functions below convert from a numerical vector type to another, or from one color
+ * representation to another.
+ */
+
+#ifndef RENDERSCRIPT_RS_CONVERT_RSH
+#define RENDERSCRIPT_RS_CONVERT_RSH
+
+/*
+ * convert: Convert numerical vectors
+ *
+ * Converts a vector from one numerical type to another.  The conversions are done entry by entry.
+ *
+ * E.g., calling a = convert_short3(b); is equivalent to doing
+ * a.x = (short)b.x; a.y = (short)b.y; a.z = (short)b.z;.
+ *
+ * Converting floating point values to integer types truncates.
+ *
+ * Converting numbers too large to fit the destination type yields undefined results.
+ * For example, converting a float that contains 1.0e18 to a short is undefined.
+ * Use clamp() to avoid this.
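+ *
+ * For instance, clamping before the conversion keeps the result defined even
+ * for out-of-range values (a sketch; the variable names are illustrative):
+ *
+ *   float4 f = {0.5f, -3.0f, 1.0e18f, 200.0f};
+ *   uchar4 u = convert_uchar4(clamp(f, 0.f, 255.f));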
+ */
+extern float2 __attribute__((const, overloadable))
+    convert_float2(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    convert_float3(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    convert_float4(float4 v);
+
+extern float2 __attribute__((const, overloadable))
+    convert_float2(char2 v);
+
+extern float3 __attribute__((const, overloadable))
+    convert_float3(char3 v);
+
+extern float4 __attribute__((const, overloadable))
+    convert_float4(char4 v);
+
+extern float2 __attribute__((const, overloadable))
+    convert_float2(uchar2 v);
+
+extern float3 __attribute__((const, overloadable))
+    convert_float3(uchar3 v);
+
+extern float4 __attribute__((const, overloadable))
+    convert_float4(uchar4 v);
+
+extern float2 __attribute__((const, overloadable))
+    convert_float2(short2 v);
+
+extern float3 __attribute__((const, overloadable))
+    convert_float3(short3 v);
+
+extern float4 __attribute__((const, overloadable))
+    convert_float4(short4 v);
+
+extern float2 __attribute__((const, overloadable))
+    convert_float2(ushort2 v);
+
+extern float3 __attribute__((const, overloadable))
+    convert_float3(ushort3 v);
+
+extern float4 __attribute__((const, overloadable))
+    convert_float4(ushort4 v);
+
+extern float2 __attribute__((const, overloadable))
+    convert_float2(int2 v);
+
+extern float3 __attribute__((const, overloadable))
+    convert_float3(int3 v);
+
+extern float4 __attribute__((const, overloadable))
+    convert_float4(int4 v);
+
+extern float2 __attribute__((const, overloadable))
+    convert_float2(uint2 v);
+
+extern float3 __attribute__((const, overloadable))
+    convert_float3(uint3 v);
+
+extern float4 __attribute__((const, overloadable))
+    convert_float4(uint4 v);
+
+extern char2 __attribute__((const, overloadable))
+    convert_char2(float2 v);
+
+extern char3 __attribute__((const, overloadable))
+    convert_char3(float3 v);
+
+extern char4 __attribute__((const, overloadable))
+    convert_char4(float4 v);
+
+extern char2 __attribute__((const, overloadable))
+    convert_char2(char2 v);
+
+extern char3 __attribute__((const, overloadable))
+    convert_char3(char3 v);
+
+extern char4 __attribute__((const, overloadable))
+    convert_char4(char4 v);
+
+extern char2 __attribute__((const, overloadable))
+    convert_char2(uchar2 v);
+
+extern char3 __attribute__((const, overloadable))
+    convert_char3(uchar3 v);
+
+extern char4 __attribute__((const, overloadable))
+    convert_char4(uchar4 v);
+
+extern char2 __attribute__((const, overloadable))
+    convert_char2(short2 v);
+
+extern char3 __attribute__((const, overloadable))
+    convert_char3(short3 v);
+
+extern char4 __attribute__((const, overloadable))
+    convert_char4(short4 v);
+
+extern char2 __attribute__((const, overloadable))
+    convert_char2(ushort2 v);
+
+extern char3 __attribute__((const, overloadable))
+    convert_char3(ushort3 v);
+
+extern char4 __attribute__((const, overloadable))
+    convert_char4(ushort4 v);
+
+extern char2 __attribute__((const, overloadable))
+    convert_char2(int2 v);
+
+extern char3 __attribute__((const, overloadable))
+    convert_char3(int3 v);
+
+extern char4 __attribute__((const, overloadable))
+    convert_char4(int4 v);
+
+extern char2 __attribute__((const, overloadable))
+    convert_char2(uint2 v);
+
+extern char3 __attribute__((const, overloadable))
+    convert_char3(uint3 v);
+
+extern char4 __attribute__((const, overloadable))
+    convert_char4(uint4 v);
+
+extern uchar2 __attribute__((const, overloadable))
+    convert_uchar2(float2 v);
+
+extern uchar3 __attribute__((const, overloadable))
+    convert_uchar3(float3 v);
+
+extern uchar4 __attribute__((const, overloadable))
+    convert_uchar4(float4 v);
+
+extern uchar2 __attribute__((const, overloadable))
+    convert_uchar2(char2 v);
+
+extern uchar3 __attribute__((const, overloadable))
+    convert_uchar3(char3 v);
+
+extern uchar4 __attribute__((const, overloadable))
+    convert_uchar4(char4 v);
+
+extern uchar2 __attribute__((const, overloadable))
+    convert_uchar2(uchar2 v);
+
+extern uchar3 __attribute__((const, overloadable))
+    convert_uchar3(uchar3 v);
+
+extern uchar4 __attribute__((const, overloadable))
+    convert_uchar4(uchar4 v);
+
+extern uchar2 __attribute__((const, overloadable))
+    convert_uchar2(short2 v);
+
+extern uchar3 __attribute__((const, overloadable))
+    convert_uchar3(short3 v);
+
+extern uchar4 __attribute__((const, overloadable))
+    convert_uchar4(short4 v);
+
+extern uchar2 __attribute__((const, overloadable))
+    convert_uchar2(ushort2 v);
+
+extern uchar3 __attribute__((const, overloadable))
+    convert_uchar3(ushort3 v);
+
+extern uchar4 __attribute__((const, overloadable))
+    convert_uchar4(ushort4 v);
+
+extern uchar2 __attribute__((const, overloadable))
+    convert_uchar2(int2 v);
+
+extern uchar3 __attribute__((const, overloadable))
+    convert_uchar3(int3 v);
+
+extern uchar4 __attribute__((const, overloadable))
+    convert_uchar4(int4 v);
+
+extern uchar2 __attribute__((const, overloadable))
+    convert_uchar2(uint2 v);
+
+extern uchar3 __attribute__((const, overloadable))
+    convert_uchar3(uint3 v);
+
+extern uchar4 __attribute__((const, overloadable))
+    convert_uchar4(uint4 v);
+
+extern short2 __attribute__((const, overloadable))
+    convert_short2(float2 v);
+
+extern short3 __attribute__((const, overloadable))
+    convert_short3(float3 v);
+
+extern short4 __attribute__((const, overloadable))
+    convert_short4(float4 v);
+
+extern short2 __attribute__((const, overloadable))
+    convert_short2(char2 v);
+
+extern short3 __attribute__((const, overloadable))
+    convert_short3(char3 v);
+
+extern short4 __attribute__((const, overloadable))
+    convert_short4(char4 v);
+
+extern short2 __attribute__((const, overloadable))
+    convert_short2(uchar2 v);
+
+extern short3 __attribute__((const, overloadable))
+    convert_short3(uchar3 v);
+
+extern short4 __attribute__((const, overloadable))
+    convert_short4(uchar4 v);
+
+extern short2 __attribute__((const, overloadable))
+    convert_short2(short2 v);
+
+extern short3 __attribute__((const, overloadable))
+    convert_short3(short3 v);
+
+extern short4 __attribute__((const, overloadable))
+    convert_short4(short4 v);
+
+extern short2 __attribute__((const, overloadable))
+    convert_short2(ushort2 v);
+
+extern short3 __attribute__((const, overloadable))
+    convert_short3(ushort3 v);
+
+extern short4 __attribute__((const, overloadable))
+    convert_short4(ushort4 v);
+
+extern short2 __attribute__((const, overloadable))
+    convert_short2(int2 v);
+
+extern short3 __attribute__((const, overloadable))
+    convert_short3(int3 v);
+
+extern short4 __attribute__((const, overloadable))
+    convert_short4(int4 v);
+
+extern short2 __attribute__((const, overloadable))
+    convert_short2(uint2 v);
+
+extern short3 __attribute__((const, overloadable))
+    convert_short3(uint3 v);
+
+extern short4 __attribute__((const, overloadable))
+    convert_short4(uint4 v);
+
+extern ushort2 __attribute__((const, overloadable))
+    convert_ushort2(float2 v);
+
+extern ushort3 __attribute__((const, overloadable))
+    convert_ushort3(float3 v);
+
+extern ushort4 __attribute__((const, overloadable))
+    convert_ushort4(float4 v);
+
+extern ushort2 __attribute__((const, overloadable))
+    convert_ushort2(char2 v);
+
+extern ushort3 __attribute__((const, overloadable))
+    convert_ushort3(char3 v);
+
+extern ushort4 __attribute__((const, overloadable))
+    convert_ushort4(char4 v);
+
+extern ushort2 __attribute__((const, overloadable))
+    convert_ushort2(uchar2 v);
+
+extern ushort3 __attribute__((const, overloadable))
+    convert_ushort3(uchar3 v);
+
+extern ushort4 __attribute__((const, overloadable))
+    convert_ushort4(uchar4 v);
+
+extern ushort2 __attribute__((const, overloadable))
+    convert_ushort2(short2 v);
+
+extern ushort3 __attribute__((const, overloadable))
+    convert_ushort3(short3 v);
+
+extern ushort4 __attribute__((const, overloadable))
+    convert_ushort4(short4 v);
+
+extern ushort2 __attribute__((const, overloadable))
+    convert_ushort2(ushort2 v);
+
+extern ushort3 __attribute__((const, overloadable))
+    convert_ushort3(ushort3 v);
+
+extern ushort4 __attribute__((const, overloadable))
+    convert_ushort4(ushort4 v);
+
+extern ushort2 __attribute__((const, overloadable))
+    convert_ushort2(int2 v);
+
+extern ushort3 __attribute__((const, overloadable))
+    convert_ushort3(int3 v);
+
+extern ushort4 __attribute__((const, overloadable))
+    convert_ushort4(int4 v);
+
+extern ushort2 __attribute__((const, overloadable))
+    convert_ushort2(uint2 v);
+
+extern ushort3 __attribute__((const, overloadable))
+    convert_ushort3(uint3 v);
+
+extern ushort4 __attribute__((const, overloadable))
+    convert_ushort4(uint4 v);
+
+extern int2 __attribute__((const, overloadable))
+    convert_int2(float2 v);
+
+extern int3 __attribute__((const, overloadable))
+    convert_int3(float3 v);
+
+extern int4 __attribute__((const, overloadable))
+    convert_int4(float4 v);
+
+extern int2 __attribute__((const, overloadable))
+    convert_int2(char2 v);
+
+extern int3 __attribute__((const, overloadable))
+    convert_int3(char3 v);
+
+extern int4 __attribute__((const, overloadable))
+    convert_int4(char4 v);
+
+extern int2 __attribute__((const, overloadable))
+    convert_int2(uchar2 v);
+
+extern int3 __attribute__((const, overloadable))
+    convert_int3(uchar3 v);
+
+extern int4 __attribute__((const, overloadable))
+    convert_int4(uchar4 v);
+
+extern int2 __attribute__((const, overloadable))
+    convert_int2(short2 v);
+
+extern int3 __attribute__((const, overloadable))
+    convert_int3(short3 v);
+
+extern int4 __attribute__((const, overloadable))
+    convert_int4(short4 v);
+
+extern int2 __attribute__((const, overloadable))
+    convert_int2(ushort2 v);
+
+extern int3 __attribute__((const, overloadable))
+    convert_int3(ushort3 v);
+
+extern int4 __attribute__((const, overloadable))
+    convert_int4(ushort4 v);
+
+extern int2 __attribute__((const, overloadable))
+    convert_int2(int2 v);
+
+extern int3 __attribute__((const, overloadable))
+    convert_int3(int3 v);
+
+extern int4 __attribute__((const, overloadable))
+    convert_int4(int4 v);
+
+extern int2 __attribute__((const, overloadable))
+    convert_int2(uint2 v);
+
+extern int3 __attribute__((const, overloadable))
+    convert_int3(uint3 v);
+
+extern int4 __attribute__((const, overloadable))
+    convert_int4(uint4 v);
+
+extern uint2 __attribute__((const, overloadable))
+    convert_uint2(float2 v);
+
+extern uint3 __attribute__((const, overloadable))
+    convert_uint3(float3 v);
+
+extern uint4 __attribute__((const, overloadable))
+    convert_uint4(float4 v);
+
+extern uint2 __attribute__((const, overloadable))
+    convert_uint2(char2 v);
+
+extern uint3 __attribute__((const, overloadable))
+    convert_uint3(char3 v);
+
+extern uint4 __attribute__((const, overloadable))
+    convert_uint4(char4 v);
+
+extern uint2 __attribute__((const, overloadable))
+    convert_uint2(uchar2 v);
+
+extern uint3 __attribute__((const, overloadable))
+    convert_uint3(uchar3 v);
+
+extern uint4 __attribute__((const, overloadable))
+    convert_uint4(uchar4 v);
+
+extern uint2 __attribute__((const, overloadable))
+    convert_uint2(short2 v);
+
+extern uint3 __attribute__((const, overloadable))
+    convert_uint3(short3 v);
+
+extern uint4 __attribute__((const, overloadable))
+    convert_uint4(short4 v);
+
+extern uint2 __attribute__((const, overloadable))
+    convert_uint2(ushort2 v);
+
+extern uint3 __attribute__((const, overloadable))
+    convert_uint3(ushort3 v);
+
+extern uint4 __attribute__((const, overloadable))
+    convert_uint4(ushort4 v);
+
+extern uint2 __attribute__((const, overloadable))
+    convert_uint2(int2 v);
+
+extern uint3 __attribute__((const, overloadable))
+    convert_uint3(int3 v);
+
+extern uint4 __attribute__((const, overloadable))
+    convert_uint4(int4 v);
+
+extern uint2 __attribute__((const, overloadable))
+    convert_uint2(uint2 v);
+
+extern uint3 __attribute__((const, overloadable))
+    convert_uint3(uint3 v);
+
+extern uint4 __attribute__((const, overloadable))
+    convert_uint4(uint4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double2 __attribute__((const, overloadable))
+    convert_double2(double2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double3 __attribute__((const, overloadable))
+    convert_double3(double3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double4 __attribute__((const, overloadable))
+    convert_double4(double4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double2 __attribute__((const, overloadable))
+    convert_double2(long2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double3 __attribute__((const, overloadable))
+    convert_double3(long3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double4 __attribute__((const, overloadable))
+    convert_double4(long4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double2 __attribute__((const, overloadable))
+    convert_double2(ulong2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double3 __attribute__((const, overloadable))
+    convert_double3(ulong3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double4 __attribute__((const, overloadable))
+    convert_double4(ulong4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long2 __attribute__((const, overloadable))
+    convert_long2(double2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long3 __attribute__((const, overloadable))
+    convert_long3(double3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long4 __attribute__((const, overloadable))
+    convert_long4(double4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long2 __attribute__((const, overloadable))
+    convert_long2(long2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long3 __attribute__((const, overloadable))
+    convert_long3(long3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long4 __attribute__((const, overloadable))
+    convert_long4(long4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long2 __attribute__((const, overloadable))
+    convert_long2(ulong2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long3 __attribute__((const, overloadable))
+    convert_long3(ulong3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long4 __attribute__((const, overloadable))
+    convert_long4(ulong4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong2 __attribute__((const, overloadable))
+    convert_ulong2(double2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong3 __attribute__((const, overloadable))
+    convert_ulong3(double3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong4 __attribute__((const, overloadable))
+    convert_ulong4(double4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong2 __attribute__((const, overloadable))
+    convert_ulong2(long2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong3 __attribute__((const, overloadable))
+    convert_ulong3(long3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong4 __attribute__((const, overloadable))
+    convert_ulong4(long4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong2 __attribute__((const, overloadable))
+    convert_ulong2(ulong2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong3 __attribute__((const, overloadable))
+    convert_ulong3(ulong3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong4 __attribute__((const, overloadable))
+    convert_ulong4(ulong4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    convert_float2(double2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    convert_float3(double3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    convert_float4(double4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    convert_float2(long2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    convert_float3(long3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    convert_float4(long4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    convert_float2(ulong2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    convert_float3(ulong3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    convert_float4(ulong4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern char2 __attribute__((const, overloadable))
+    convert_char2(double2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern char3 __attribute__((const, overloadable))
+    convert_char3(double3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern char4 __attribute__((const, overloadable))
+    convert_char4(double4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern char2 __attribute__((const, overloadable))
+    convert_char2(long2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern char3 __attribute__((const, overloadable))
+    convert_char3(long3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern char4 __attribute__((const, overloadable))
+    convert_char4(long4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern char2 __attribute__((const, overloadable))
+    convert_char2(ulong2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern char3 __attribute__((const, overloadable))
+    convert_char3(ulong3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern char4 __attribute__((const, overloadable))
+    convert_char4(ulong4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uchar2 __attribute__((const, overloadable))
+    convert_uchar2(double2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uchar3 __attribute__((const, overloadable))
+    convert_uchar3(double3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uchar4 __attribute__((const, overloadable))
+    convert_uchar4(double4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uchar2 __attribute__((const, overloadable))
+    convert_uchar2(long2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uchar3 __attribute__((const, overloadable))
+    convert_uchar3(long3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uchar4 __attribute__((const, overloadable))
+    convert_uchar4(long4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uchar2 __attribute__((const, overloadable))
+    convert_uchar2(ulong2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uchar3 __attribute__((const, overloadable))
+    convert_uchar3(ulong3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uchar4 __attribute__((const, overloadable))
+    convert_uchar4(ulong4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern short2 __attribute__((const, overloadable))
+    convert_short2(double2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern short3 __attribute__((const, overloadable))
+    convert_short3(double3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern short4 __attribute__((const, overloadable))
+    convert_short4(double4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern short2 __attribute__((const, overloadable))
+    convert_short2(long2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern short3 __attribute__((const, overloadable))
+    convert_short3(long3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern short4 __attribute__((const, overloadable))
+    convert_short4(long4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern short2 __attribute__((const, overloadable))
+    convert_short2(ulong2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern short3 __attribute__((const, overloadable))
+    convert_short3(ulong3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern short4 __attribute__((const, overloadable))
+    convert_short4(ulong4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ushort2 __attribute__((const, overloadable))
+    convert_ushort2(double2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ushort3 __attribute__((const, overloadable))
+    convert_ushort3(double3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ushort4 __attribute__((const, overloadable))
+    convert_ushort4(double4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ushort2 __attribute__((const, overloadable))
+    convert_ushort2(long2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ushort3 __attribute__((const, overloadable))
+    convert_ushort3(long3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ushort4 __attribute__((const, overloadable))
+    convert_ushort4(long4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ushort2 __attribute__((const, overloadable))
+    convert_ushort2(ulong2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ushort3 __attribute__((const, overloadable))
+    convert_ushort3(ulong3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ushort4 __attribute__((const, overloadable))
+    convert_ushort4(ulong4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern int2 __attribute__((const, overloadable))
+    convert_int2(double2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern int3 __attribute__((const, overloadable))
+    convert_int3(double3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern int4 __attribute__((const, overloadable))
+    convert_int4(double4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern int2 __attribute__((const, overloadable))
+    convert_int2(long2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern int3 __attribute__((const, overloadable))
+    convert_int3(long3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern int4 __attribute__((const, overloadable))
+    convert_int4(long4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern int2 __attribute__((const, overloadable))
+    convert_int2(ulong2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern int3 __attribute__((const, overloadable))
+    convert_int3(ulong3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern int4 __attribute__((const, overloadable))
+    convert_int4(ulong4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uint2 __attribute__((const, overloadable))
+    convert_uint2(double2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uint3 __attribute__((const, overloadable))
+    convert_uint3(double3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uint4 __attribute__((const, overloadable))
+    convert_uint4(double4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uint2 __attribute__((const, overloadable))
+    convert_uint2(long2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uint3 __attribute__((const, overloadable))
+    convert_uint3(long3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uint4 __attribute__((const, overloadable))
+    convert_uint4(long4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uint2 __attribute__((const, overloadable))
+    convert_uint2(ulong2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uint3 __attribute__((const, overloadable))
+    convert_uint3(ulong3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uint4 __attribute__((const, overloadable))
+    convert_uint4(ulong4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double2 __attribute__((const, overloadable))
+    convert_double2(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double3 __attribute__((const, overloadable))
+    convert_double3(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double4 __attribute__((const, overloadable))
+    convert_double4(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double2 __attribute__((const, overloadable))
+    convert_double2(char2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double3 __attribute__((const, overloadable))
+    convert_double3(char3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double4 __attribute__((const, overloadable))
+    convert_double4(char4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double2 __attribute__((const, overloadable))
+    convert_double2(uchar2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double3 __attribute__((const, overloadable))
+    convert_double3(uchar3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double4 __attribute__((const, overloadable))
+    convert_double4(uchar4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double2 __attribute__((const, overloadable))
+    convert_double2(short2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double3 __attribute__((const, overloadable))
+    convert_double3(short3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double4 __attribute__((const, overloadable))
+    convert_double4(short4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double2 __attribute__((const, overloadable))
+    convert_double2(ushort2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double3 __attribute__((const, overloadable))
+    convert_double3(ushort3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double4 __attribute__((const, overloadable))
+    convert_double4(ushort4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double2 __attribute__((const, overloadable))
+    convert_double2(int2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double3 __attribute__((const, overloadable))
+    convert_double3(int3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double4 __attribute__((const, overloadable))
+    convert_double4(int4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double2 __attribute__((const, overloadable))
+    convert_double2(uint2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double3 __attribute__((const, overloadable))
+    convert_double3(uint3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern double4 __attribute__((const, overloadable))
+    convert_double4(uint4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long2 __attribute__((const, overloadable))
+    convert_long2(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long3 __attribute__((const, overloadable))
+    convert_long3(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long4 __attribute__((const, overloadable))
+    convert_long4(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long2 __attribute__((const, overloadable))
+    convert_long2(char2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long3 __attribute__((const, overloadable))
+    convert_long3(char3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long4 __attribute__((const, overloadable))
+    convert_long4(char4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long2 __attribute__((const, overloadable))
+    convert_long2(uchar2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long3 __attribute__((const, overloadable))
+    convert_long3(uchar3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long4 __attribute__((const, overloadable))
+    convert_long4(uchar4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long2 __attribute__((const, overloadable))
+    convert_long2(short2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long3 __attribute__((const, overloadable))
+    convert_long3(short3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long4 __attribute__((const, overloadable))
+    convert_long4(short4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long2 __attribute__((const, overloadable))
+    convert_long2(ushort2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long3 __attribute__((const, overloadable))
+    convert_long3(ushort3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long4 __attribute__((const, overloadable))
+    convert_long4(ushort4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long2 __attribute__((const, overloadable))
+    convert_long2(int2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long3 __attribute__((const, overloadable))
+    convert_long3(int3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long4 __attribute__((const, overloadable))
+    convert_long4(int4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long2 __attribute__((const, overloadable))
+    convert_long2(uint2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long3 __attribute__((const, overloadable))
+    convert_long3(uint3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long4 __attribute__((const, overloadable))
+    convert_long4(uint4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong2 __attribute__((const, overloadable))
+    convert_ulong2(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong3 __attribute__((const, overloadable))
+    convert_ulong3(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong4 __attribute__((const, overloadable))
+    convert_ulong4(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong2 __attribute__((const, overloadable))
+    convert_ulong2(char2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong3 __attribute__((const, overloadable))
+    convert_ulong3(char3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong4 __attribute__((const, overloadable))
+    convert_ulong4(char4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong2 __attribute__((const, overloadable))
+    convert_ulong2(uchar2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong3 __attribute__((const, overloadable))
+    convert_ulong3(uchar3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong4 __attribute__((const, overloadable))
+    convert_ulong4(uchar4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong2 __attribute__((const, overloadable))
+    convert_ulong2(short2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong3 __attribute__((const, overloadable))
+    convert_ulong3(short3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong4 __attribute__((const, overloadable))
+    convert_ulong4(short4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong2 __attribute__((const, overloadable))
+    convert_ulong2(ushort2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong3 __attribute__((const, overloadable))
+    convert_ulong3(ushort3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong4 __attribute__((const, overloadable))
+    convert_ulong4(ushort4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong2 __attribute__((const, overloadable))
+    convert_ulong2(int2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong3 __attribute__((const, overloadable))
+    convert_ulong3(int3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong4 __attribute__((const, overloadable))
+    convert_ulong4(int4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong2 __attribute__((const, overloadable))
+    convert_ulong2(uint2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong3 __attribute__((const, overloadable))
+    convert_ulong3(uint3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong4 __attribute__((const, overloadable))
+    convert_ulong4(uint4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    convert_half2(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    convert_half3(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    convert_half4(half4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern float2 __attribute__((const, overloadable))
+    convert_float2(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern float3 __attribute__((const, overloadable))
+    convert_float3(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern float4 __attribute__((const, overloadable))
+    convert_float4(half4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern double2 __attribute__((const, overloadable))
+    convert_double2(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern double3 __attribute__((const, overloadable))
+    convert_double3(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern double4 __attribute__((const, overloadable))
+    convert_double4(half4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern char2 __attribute__((const, overloadable))
+    convert_char2(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern char3 __attribute__((const, overloadable))
+    convert_char3(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern char4 __attribute__((const, overloadable))
+    convert_char4(half4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern uchar2 __attribute__((const, overloadable))
+    convert_uchar2(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern uchar3 __attribute__((const, overloadable))
+    convert_uchar3(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern uchar4 __attribute__((const, overloadable))
+    convert_uchar4(half4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern short2 __attribute__((const, overloadable))
+    convert_short2(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern short3 __attribute__((const, overloadable))
+    convert_short3(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern short4 __attribute__((const, overloadable))
+    convert_short4(half4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern ushort2 __attribute__((const, overloadable))
+    convert_ushort2(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern ushort3 __attribute__((const, overloadable))
+    convert_ushort3(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern ushort4 __attribute__((const, overloadable))
+    convert_ushort4(half4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern int2 __attribute__((const, overloadable))
+    convert_int2(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern int3 __attribute__((const, overloadable))
+    convert_int3(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern int4 __attribute__((const, overloadable))
+    convert_int4(half4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern uint2 __attribute__((const, overloadable))
+    convert_uint2(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern uint3 __attribute__((const, overloadable))
+    convert_uint3(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern uint4 __attribute__((const, overloadable))
+    convert_uint4(half4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern long2 __attribute__((const, overloadable))
+    convert_long2(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern long3 __attribute__((const, overloadable))
+    convert_long3(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern long4 __attribute__((const, overloadable))
+    convert_long4(half4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern ulong2 __attribute__((const, overloadable))
+    convert_ulong2(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern ulong3 __attribute__((const, overloadable))
+    convert_ulong3(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern ulong4 __attribute__((const, overloadable))
+    convert_ulong4(half4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    convert_half2(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    convert_half3(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    convert_half4(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    convert_half2(double2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    convert_half3(double3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    convert_half4(double4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    convert_half2(char2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    convert_half3(char3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    convert_half4(char4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    convert_half2(uchar2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    convert_half3(uchar3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    convert_half4(uchar4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    convert_half2(short2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    convert_half3(short3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    convert_half4(short4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    convert_half2(ushort2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    convert_half3(ushort3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    convert_half4(ushort4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    convert_half2(int2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    convert_half3(int3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    convert_half4(int4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    convert_half2(uint2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    convert_half3(uint3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    convert_half4(uint4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    convert_half2(long2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    convert_half3(long3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    convert_half4(long4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    convert_half2(ulong2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    convert_half3(ulong3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    convert_half4(ulong4 v);
+#endif
+
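+/*
+ * Editor's note, not part of the generated header: a hedged sketch of how the
+ * convert_half* overloads above are used; 'px' is a placeholder variable and,
+ * as the guards indicate, the call is only available when RS_VERSION >= 24.
+ *
+ *   int4 px = {1, 2, 3, 4};
+ *   half4 h = convert_half4(px);  // per-component conversion of int4 to half4
+ */
+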
+/*
+ * rsPackColorTo8888: Create a uchar4 RGBA from floats
+ *
+ * Packs three or four floating point RGBA values into a uchar4.
+ *
+ * The input values are typically between 0.0f and 1.0f inclusive.  For input values outside
+ * of this range, the resulting outputs will be clamped to be between 0 and 255.  As this
+ * clamping may be done after the input is multiplied by 255.f and converted to an integer,
+ * input numbers greater than INT_MAX/255.f or less than INT_MIN/255.f result in
+ * undefined behavior.
+ *
+ * If the alpha component is not specified, it is assumed to be 1.0, i.e. the result will
+ * have an alpha set to 255.
+ *
+ * Parameters:
+ *   r: Red component.
+ *   g: Green component.
+ *   b: Blue component.
+ *   a: Alpha component.
+ *   color: Vector of 3 or 4 floats containing the R, G, B, and A values.
+ */
+extern uchar4 __attribute__((const, overloadable))
+    rsPackColorTo8888(float r, float g, float b);
+
+extern uchar4 __attribute__((const, overloadable))
+    rsPackColorTo8888(float r, float g, float b, float a);
+
+extern uchar4 __attribute__((const, overloadable))
+    rsPackColorTo8888(float3 color);
+
+extern uchar4 __attribute__((const, overloadable))
+    rsPackColorTo8888(float4 color);
+
+/*
+ * rsUnpackColor8888: Create a float4 RGBA from uchar4
+ *
+ * Unpacks a uchar4 color to float4.  The resulting floats will be between 0.0 and 1.0 inclusive.
+ */
+extern float4 __attribute__((const))
+    rsUnpackColor8888(uchar4 c);
+
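+/*
+ * Editor's note, not part of the generated header: a minimal sketch of the
+ * intended round trip between rsPackColorTo8888() and rsUnpackColor8888(),
+ * assuming the inputs already lie in [0.0f, 1.0f]; values are approximate.
+ *
+ *   float4 in = {0.25f, 0.5f, 0.75f, 1.f};
+ *   uchar4 packed = rsPackColorTo8888(in);    // roughly (64, 128, 191, 255)
+ *   float4 back = rsUnpackColor8888(packed);  // each component within 1/255 of 'in'
+ */
+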
+/*
+ * rsYuvToRGBA: Convert a YUV value to RGBA
+ *
+ * Converts a color from a YUV representation to RGBA.
+ *
+ * We currently don't provide a function to do the reverse conversion.
+ *
+ * Parameters:
+ *   y: Luminance component.
+ *   u: U chrominance component.
+ *   v: V chrominance component.
+ */
+extern float4 __attribute__((const, overloadable))
+    rsYuvToRGBA_float4(uchar y, uchar u, uchar v);
+
+extern uchar4 __attribute__((const, overloadable))
+    rsYuvToRGBA_uchar4(uchar y, uchar u, uchar v);
+
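+/*
+ * Editor's note, not part of the generated header: a hedged usage sketch.
+ * yIn, uIn and vIn are placeholder names for values already read from a YUV
+ * allocation (for example with the rsGetElementAtYuv_uchar_* accessors).
+ *
+ *   uchar4 rgba  = rsYuvToRGBA_uchar4(yIn, uIn, vIn);  // 8-bit RGBA result
+ *   float4 rgbaF = rsYuvToRGBA_float4(yIn, uIn, vIn);  // normalized result
+ */
+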
+#endif // RENDERSCRIPT_RS_CONVERT_RSH
diff --git a/25.0.2/include/rs_core.rsh b/25.0.2/include/rs_core.rsh
new file mode 100644
index 0000000..ae93d60
--- /dev/null
+++ b/25.0.2/include/rs_core.rsh
@@ -0,0 +1,61 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Don't edit this file!  It is auto-generated by frameworks/rs/api/generate.sh.
+
+/*
+ * rs_core.rsh: Overview
+ *
+ * RenderScript is a high-performance runtime that provides compute operations at the native level.
+ * RenderScript code is compiled on devices at runtime to allow platform-independence as well.
+ *
+ * This reference documentation describes the RenderScript runtime APIs, which you can utilize
+ * to write RenderScript code in C99. The RenderScript compute header files are automatically
+ * included for you.
+ *
+ * To use RenderScript, you need to utilize the RenderScript runtime APIs documented here as well
+ * as the Android framework APIs for RenderScript.  For documentation on the Android framework
+ * APIs, see the android.renderscript package reference.
+ *
+ * For more information on how to develop with RenderScript and how the runtime and Android
+ * framework APIs interact, see the RenderScript developer guide and the RenderScript samples.
+ */
+
+#ifndef RENDERSCRIPT_RS_CORE_RSH
+#define RENDERSCRIPT_RS_CORE_RSH
+
+#define RS_KERNEL __attribute__((kernel))
+
+#include "stdbool.h"
+
+#include "rs_value_types.rsh"
+#include "rs_object_types.rsh"
+
+#include "rs_allocation_create.rsh"
+#include "rs_allocation_data.rsh"
+#include "rs_atomic.rsh"
+#include "rs_convert.rsh"
+#include "rs_debug.rsh"
+#include "rs_for_each.rsh"
+#include "rs_io.rsh"
+#include "rs_math.rsh"
+#include "rs_matrix.rsh"
+#include "rs_object_info.rsh"
+#include "rs_quaternion.rsh"
+#include "rs_time.rsh"
+#include "rs_vector_math.rsh"
+
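+/*
+ * Editor's note, not part of the generated header: RS_KERNEL above is just
+ * shorthand for __attribute__((kernel)).  A minimal sketch of a script built
+ * against this header; the package name is a placeholder.
+ *
+ *   #pragma version(1)
+ *   #pragma rs java_package_name(com.example.placeholder)
+ *
+ *   uchar4 RS_KERNEL invert(uchar4 in) {
+ *     uchar4 out = in;
+ *     out.r = 255 - in.r;
+ *     out.g = 255 - in.g;
+ *     out.b = 255 - in.b;
+ *     return out;
+ *   }
+ */
+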
+#endif // RENDERSCRIPT_RS_CORE_RSH
diff --git a/25.0.2/include/rs_debug.rsh b/25.0.2/include/rs_debug.rsh
new file mode 100644
index 0000000..13c5faa
--- /dev/null
+++ b/25.0.2/include/rs_debug.rsh
@@ -0,0 +1,269 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Don't edit this file!  It is auto-generated by frameworks/rs/api/generate.sh.
+
+/*
+ * rs_debug.rsh: Debugging Functions
+ *
+ * The functions below are intended to be used during application development.
+ * They should not be used in shipping applications.
+ */
+
+#ifndef RENDERSCRIPT_RS_DEBUG_RSH
+#define RENDERSCRIPT_RS_DEBUG_RSH
+
+#define RS_DEBUG(a) rsDebug(#a, a)
+#define RS_DEBUG_MARKER rsDebug(__FILE__, __LINE__)
+
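+/*
+ * Editor's note, not part of the generated header: the macros above forward
+ * to the rsDebug() overloads declared below; 'radius' is only an example
+ * variable name.
+ *
+ *   RS_DEBUG(radius);   // expands to rsDebug("radius", radius)
+ *   RS_DEBUG_MARKER;    // expands to rsDebug(__FILE__, __LINE__)
+ */
+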
+/*
+ * rsDebug: Log a message and values
+ *
+ * This function prints a message to the standard log, followed by the provided values.
+ *
+ * This function is intended for debugging only and should not be used in shipping
+ * applications.
+ */
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, double a);
+
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, int a);
+
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, uint a);
+
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, long a);
+
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, ulong a);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, int2 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, int3 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, int4 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, uint2 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, uint3 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, uint4 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, long2 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, long3 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, long4 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, ulong2 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, ulong3 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, ulong4 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, double2 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, double3 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, double4 a);
+#endif
+
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, float a);
+
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, float2 a);
+
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, float3 a);
+
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, float4 a);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, half a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, half2 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, half3 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, half4 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, char a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, char2 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, char3 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, char4 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, uchar a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, uchar2 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, uchar3 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, uchar4 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, short a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, short2 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, short3 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, short4 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, ushort a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, ushort2 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, ushort3 a);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, ushort4 a);
+#endif
+
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, float a, float b);
+
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, float a, float b, float c);
+
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, float a, float b, float c, float d);
+
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, long long a);
+
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, unsigned long long a);
+
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, const void* a);
+
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, const rs_matrix4x4* a);
+
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, const rs_matrix3x3* a);
+
+extern void __attribute__((overloadable))
+    rsDebug(const char* message, const rs_matrix2x2* a);
+
+#endif // RENDERSCRIPT_RS_DEBUG_RSH
diff --git a/25.0.2/include/rs_for_each.rsh b/25.0.2/include/rs_for_each.rsh
new file mode 100644
index 0000000..bcc5db9
--- /dev/null
+++ b/25.0.2/include/rs_for_each.rsh
@@ -0,0 +1,434 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Don't edit this file!  It is auto-generated by frameworks/rs/api/generate.sh.
+
+/*
+ * rs_for_each.rsh: Kernel Invocation Functions and Types
+ *
+ * The rsForEach() function can be used to invoke the root kernel of a script.
+ *
+ * The other functions are used to get the characteristics of the invocation of
+ * an executing kernel, like dimensions and current indices.  These functions take
+ * a rs_kernel_context as argument.
+ */
+
+#ifndef RENDERSCRIPT_RS_FOR_EACH_RSH
+#define RENDERSCRIPT_RS_FOR_EACH_RSH
+
+/*
+ * rs_for_each_strategy_t: Suggested cell processing order
+ *
+ * This type is used to suggest how the invoked kernel should iterate over the cells of the
+ * allocations.  This is a hint only.  Implementations may not follow the suggestion.
+ *
+ * This specification can help the caching behavior of the running kernel, e.g. the cache
+ * locality when the processing is distributed over multiple cores.
+ */
+typedef enum rs_for_each_strategy {
+    RS_FOR_EACH_STRATEGY_SERIAL = 0, // Prefer contiguous memory regions.
+    RS_FOR_EACH_STRATEGY_DONT_CARE = 1, // No preferences.
+    RS_FOR_EACH_STRATEGY_DST_LINEAR = 2, // Prefer DST.
+    RS_FOR_EACH_STRATEGY_TILE_SMALL = 3, // Prefer processing small rectangular regions.
+    RS_FOR_EACH_STRATEGY_TILE_MEDIUM = 4, // Prefer processing medium rectangular regions.
+    RS_FOR_EACH_STRATEGY_TILE_LARGE = 5 // Prefer processing large rectangular regions.
+} rs_for_each_strategy_t;
+
+/*
+ * rs_kernel_context: Handle to a kernel invocation context
+ *
+ * The kernel context contains common characteristics of the allocations being iterated
+ * over, like dimensions.  It also contains rarely used indices of the currently processed
+ * cell, like the Array0 index or the current level of detail.
+ *
+ * You can access the kernel context by adding a special parameter named "context" of type
+ * rs_kernel_context to your kernel function.  See rsGetDimX() and rsGetArray0() for examples.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+typedef const struct rs_kernel_context_t * rs_kernel_context;
+#endif
+
+/*
+ * rs_script_call_t: Cell iteration information
+ *
+ * This structure is used to provide iteration information to a rsForEach call.
+ * It is currently used to restrict processing to a subset of cells.  In future
+ * versions, it will also be used to provide hints on how to best iterate over
+ * the cells.
+ *
+ * The Start fields are inclusive and the End fields are exclusive.  E.g. to iterate
+ * over cells 4, 5, 6, and 7 in the X dimension, set xStart to 4 and xEnd to 8.
+ */
+typedef struct rs_script_call {
+    rs_for_each_strategy_t strategy; // Currently ignored.  In the future, will be suggested cell iteration strategy.
+    uint32_t xStart; // Starting index in the X dimension.
+    uint32_t xEnd; // Ending index (exclusive) in the X dimension.
+    uint32_t yStart; // Starting index in the Y dimension.
+    uint32_t yEnd; // Ending index (exclusive) in the Y dimension.
+    uint32_t zStart; // Starting index in the Z dimension.
+    uint32_t zEnd; // Ending index (exclusive) in the Z dimension.
+    uint32_t arrayStart; // Starting index in the Array0 dimension.
+    uint32_t arrayEnd; // Ending index (exclusive) in the Array0 dimension.
+    uint32_t array1Start; // Starting index in the Array1 dimension.
+    uint32_t array1End; // Ending index (exclusive) in the Array1 dimension.
+    uint32_t array2Start; // Starting index in the Array2 dimension.
+    uint32_t array2End; // Ending index (exclusive) in the Array2 dimension.
+    uint32_t array3Start; // Starting index in the Array3 dimension.
+    uint32_t array3End; // Ending index (exclusive) in the Array3 dimension.
+} rs_script_call_t;
+
+/*
+ * rs_kernel: Handle to a kernel function
+ *
+ *  An opaque type for a function that is defined with the kernel attribute.  A value
+ *  of this type can be used in a rsForEach call to launch a kernel.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+typedef void* rs_kernel;
+#endif
+
+/*
+ * rsForEach: Launches a kernel
+ *
+ * Runs the kernel over zero or more input allocations. They are passed after the
+ * rs_kernel argument. If the specified kernel returns a value, an output allocation
+ * must be specified as the last argument. All input allocations,
+ * and the output allocation if it exists, must have the same dimensions.
+ *
+ * This is a synchronous function. A call to this function only returns after all
+ * the work has completed for all cells of the input allocations. If the kernel
+ * function returns any value, the call waits until all results have been written
+ * to the output allocation.
+ *
+ * Up to API level 23, the kernel is implicitly specified as the kernel named
+ * "root" in the specified script, and only a single input allocation can be used.
+ * Starting in API level 24, an arbitrary kernel function can be used,
+ * as specified by the kernel argument. The script argument is removed.
+ * The kernel must be defined in the current script. In addition, more than one
+ * input can be used.
+ *
+ * E.g.
+ * float __attribute__((kernel)) square(float a) {
+ *   return a * a;
+ * }
+ *
+ * void compute(rs_allocation ain, rs_allocation aout) {
+ *   rsForEach(square, ain, aout);
+ * }
+ *
+ *
+ * Parameters:
+ *   script: Script to call.
+ *   input: Allocation to source data from.
+ *   output: Allocation to write data into.
+ *   usrData: User defined data to pass to the script.  May be NULL.
+ *   sc: Extra control information used to select a sub-region of the allocation to be processed or suggest a walking strategy.  May be NULL.
+ *   usrDataLen: Size of the userData structure.  This will be used to perform a shallow copy of the data if necessary.
+ *   kernel: Function designator to a function that is defined with the kernel attribute.
+ *   ...: Input and output allocations
+ */
+#if !defined(RS_VERSION) || (RS_VERSION <= 13)
+extern void __attribute__((overloadable))
+    rsForEach(rs_script script, rs_allocation input, rs_allocation output, const void* usrData,
+              const rs_script_call_t* sc);
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 13)
+extern void __attribute__((overloadable))
+    rsForEach(rs_script script, rs_allocation input, rs_allocation output, const void* usrData);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 14) && (RS_VERSION <= 20))
+extern void __attribute__((overloadable))
+    rsForEach(rs_script script, rs_allocation input, rs_allocation output, const void* usrData,
+              size_t usrDataLen, const rs_script_call_t* sc);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 14) && (RS_VERSION <= 20))
+extern void __attribute__((overloadable))
+    rsForEach(rs_script script, rs_allocation input, rs_allocation output, const void* usrData,
+              size_t usrDataLen);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 14) && (RS_VERSION <= 23))
+extern void __attribute__((overloadable))
+    rsForEach(rs_script script, rs_allocation input, rs_allocation output);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern void
+    rsForEach(rs_kernel kernel, ...);
+#endif
+
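+/*
+ * Editor's note, not part of the generated header: a hedged sketch of the
+ * pre-API-24 calling convention described above, where the launched kernel is
+ * the target script's "root" function; gScript, aIn and aOut are placeholder
+ * names assumed to be set up by the host code.
+ *
+ *   rs_script gScript;
+ *
+ *   void launch(rs_allocation aIn, rs_allocation aOut) {
+ *     rsForEach(gScript, aIn, aOut);  // RS_VERSION 14..23 overload above
+ *   }
+ */
+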
+/*
+ * rsForEachWithOptions: Launches a kernel with options
+ *
+ * Launches a kernel in a way similar to rsForEach. However, instead of processing
+ * all cells in the input, this function only processes cells in the subspace of
+ * the index space specified in options. With the index space explicitly specified
+ * by options, no input or output allocation is required for a kernel launch using
+ * this API. If allocations are passed in, they must match the number of arguments
+ * and return value expected by the kernel function. The output allocation is
+ * present if and only if the kernel has a non-void return value.
+ *
+ * E.g.,
+ *    rs_script_call_t opts = {0};
+ *    opts.xStart = 0;
+ *    opts.xEnd = dimX;
+ *    opts.yStart = 0;
+ *    opts.yEnd = dimY / 2;
+ *    rsForEachWithOptions(foo, &opts, out, out);
+ *
+ *
+ * Parameters:
+ *   kernel: Function designator to a function that is defined with the kernel attribute.
+ *   options: Launch options
+ *   ...: Input and output allocations
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern void
+    rsForEachWithOptions(rs_kernel kernel, rs_script_call_t* options, ...);
+#endif
+
+/*
+ * rsGetArray0: Index in the Array0 dimension for the specified kernel context
+ *
+ * Returns the index in the Array0 dimension of the cell being processed, as specified
+ * by the supplied kernel context.
+ *
+ * The kernel context contains common characteristics of the allocations being iterated
+ * over and rarely used indices, like the Array0 index.
+ *
+ * You can access the kernel context by adding a special parameter named "context" of
+ * type rs_kernel_context to your kernel function.  E.g.
+ * short RS_KERNEL myKernel(short value, uint32_t x, rs_kernel_context context) {
+ *   // The current index in the common x, y, z dimensions are accessed by
+ *   // adding these variables as arguments.  For the more rarely used indices
+ *   // to the other dimensions, extract them from the kernel context:
+ *   uint32_t index_a0 = rsGetArray0(context);
+ *   //...
+ * }
+ *
+ * This function returns 0 if the Array0 dimension is not present.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern uint32_t __attribute__((overloadable))
+    rsGetArray0(rs_kernel_context context);
+#endif
+
+/*
+ * rsGetArray1: Index in the Array1 dimension for the specified kernel context
+ *
+ * Returns the index in the Array1 dimension of the cell being processed, as specified
+ * by the supplied kernel context.  See rsGetArray0() for an explanation of the context.
+ *
+ * Returns 0 if the Array1 dimension is not present.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern uint32_t __attribute__((overloadable))
+    rsGetArray1(rs_kernel_context context);
+#endif
+
+/*
+ * rsGetArray2: Index in the Array2 dimension for the specified kernel context
+ *
+ * Returns the index in the Array2 dimension of the cell being processed,
+ * as specified by the supplied kernel context.  See rsGetArray0() for an explanation
+ * of the context.
+ *
+ * Returns 0 if the Array2 dimension is not present.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern uint32_t __attribute__((overloadable))
+    rsGetArray2(rs_kernel_context context);
+#endif
+
+/*
+ * rsGetArray3: Index in the Array3 dimension for the specified kernel context
+ *
+ * Returns the index in the Array3 dimension of the cell being processed, as specified
+ * by the supplied kernel context.  See rsGetArray0() for an explanation of the context.
+ *
+ * Returns 0 if the Array3 dimension is not present.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern uint32_t __attribute__((overloadable))
+    rsGetArray3(rs_kernel_context context);
+#endif
+
+/*
+ * rsGetDimArray0: Size of the Array0 dimension for the specified kernel context
+ *
+ * Returns the size of the Array0 dimension for the specified kernel context.
+ * See rsGetDimX() for an explanation of the context.
+ *
+ * Returns 0 if the Array0 dimension is not present.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern uint32_t __attribute__((overloadable))
+    rsGetDimArray0(rs_kernel_context context);
+#endif
+
+/*
+ * rsGetDimArray1: Size of the Array1 dimension for the specified kernel context
+ *
+ * Returns the size of the Array1 dimension for the specified kernel context.
+ * See rsGetDimX() for an explanation of the context.
+ *
+ * Returns 0 if the Array1 dimension is not present.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern uint32_t __attribute__((overloadable))
+    rsGetDimArray1(rs_kernel_context context);
+#endif
+
+/*
+ * rsGetDimArray2: Size of the Array2 dimension for the specified kernel context
+ *
+ * Returns the size of the Array2 dimension for the specified kernel context.
+ * See rsGetDimX() for an explanation of the context.
+ *
+ * Returns 0 if the Array2 dimension is not present.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern uint32_t __attribute__((overloadable))
+    rsGetDimArray2(rs_kernel_context context);
+#endif
+
+/*
+ * rsGetDimArray3: Size of the Array3 dimension for the specified kernel context
+ *
+ * Returns the size of the Array3 dimension for the specified kernel context.
+ * See rsGetDimX() for an explanation of the context.
+ *
+ * Returns 0 if the Array3 dimension is not present.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern uint32_t __attribute__((overloadable))
+    rsGetDimArray3(rs_kernel_context context);
+#endif
+
+/*
+ * rsGetDimHasFaces: Presence of more than one face for the specified kernel context
+ *
+ * If the kernel is iterating over a cubemap, this function returns true if there's more
+ * than one face present.  In all other cases, it returns false.  See rsGetDimX() for an
+ * explanation of the context.
+ *
+ * rsAllocationGetDimFaces() is similar but returns 0 or 1 instead of a bool.
+ *
+ * Returns: Returns true if more than one face is present, false otherwise.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern bool __attribute__((overloadable))
+    rsGetDimHasFaces(rs_kernel_context context);
+#endif
+
+/*
+ * rsGetDimLod: Number of levels of detail for the specified kernel context
+ *
+ * Returns the number of levels of detail for the specified kernel context.  This is useful
+ * for mipmaps.  See rsGetDimX() for an explanation of the context.
+ *
+ * Returns 0 if Level of Detail is not used.
+ *
+ * rsAllocationGetDimLOD() is similar but returns 0 or 1 instead of the actual
+ * number of levels.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern uint32_t __attribute__((overloadable))
+    rsGetDimLod(rs_kernel_context context);
+#endif
+
+/*
+ * rsGetDimX: Size of the X dimension for the specified kernel context
+ *
+ * Returns the size of the X dimension for the specified kernel context.
+ *
+ * The kernel context contains common characteristics of the allocations being iterated
+ * over and rarely used indices, like the Array0 index.
+ *
+ * You can access it by adding a special parameter named "context" of
+ * type rs_kernel_context to your kernel function.  E.g.
+ * int4 RS_KERNEL myKernel(int4 value, rs_kernel_context context) {
+ *   uint32_t size = rsGetDimX(context); //...
+ *
+ * To get the dimension of a specific allocation, use rsAllocationGetDimX().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern uint32_t __attribute__((overloadable))
+    rsGetDimX(rs_kernel_context context);
+#endif
+
+/*
+ * rsGetDimY: Size of the Y dimension for the specified kernel context
+ *
+ * Returns the size of the Y dimension for the specified kernel context.
+ * See rsGetDimX() for an explanation of the context.
+ *
+ * Returns 0 if the Y dimension is not present.
+ *
+ * To get the dimension of a specific allocation, use rsAllocationGetDimY().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern uint32_t __attribute__((overloadable))
+    rsGetDimY(rs_kernel_context context);
+#endif
+
+/*
+ * rsGetDimZ: Size of the Z dimension for the specified kernel context
+ *
+ * Returns the size of the Z dimension for the specified kernel context.
+ * See rsGetDimX() for an explanation of the context.
+ *
+ * Returns 0 if the Z dimension is not present.
+ *
+ * To get the dimension of a specific allocation, use rsAllocationGetDimZ().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern uint32_t __attribute__((overloadable))
+    rsGetDimZ(rs_kernel_context context);
+#endif
+
+/*
+ * rsGetFace: Coordinate of the Face for the specified kernel context
+ *
+ * Returns the face on which the cell being processed is found, as specified by the
+ * supplied kernel context.  See rsGetArray0() for an explanation of the context.
+ *
+ * Returns RS_ALLOCATION_CUBEMAP_FACE_POSITIVE_X if the face dimension is not
+ * present.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern rs_allocation_cubemap_face __attribute__((overloadable))
+    rsGetFace(rs_kernel_context context);
+#endif
+
+/*
+ * rsGetLod: Index in the Levels of Detail dimension for the specified kernel context
+ *
+ * Returns the index in the Levels of Detail dimension of the cell being processed,
+ * as specified by the supplied kernel context.  See rsGetArray0() for an explanation of
+ * the context.
+ *
+ * Returns 0 if the Levels of Detail dimension is not present.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+extern uint32_t __attribute__((overloadable))
+    rsGetLod(rs_kernel_context context);
+#endif
+
+#endif // RENDERSCRIPT_RS_FOR_EACH_RSH
diff --git a/25.0.2/include/rs_graphics.rsh b/25.0.2/include/rs_graphics.rsh
new file mode 100644
index 0000000..10ec640
--- /dev/null
+++ b/25.0.2/include/rs_graphics.rsh
@@ -0,0 +1,1522 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Don't edit this file!  It is auto-generated by frameworks/rs/api/generate.sh.
+
+/*
+ * rs_graphics.rsh: Graphics Functions and Types
+ *
+ * The graphics subsystem of RenderScript was removed at API level 23.
+ */
+
+#ifndef RENDERSCRIPT_RS_GRAPHICS_RSH
+#define RENDERSCRIPT_RS_GRAPHICS_RSH
+
+#ifdef __LP64__
+// TODO We need to fix some of the builds before enabling this error:
+// #error "RenderScript graphics is deprecated and not supported in 64bit mode."
+#endif
+
+// TODO we seem to assume order for the other headers too.
+#include "rs_object_types.rsh"
+
+/*
+ * rs_blend_src_func: Blend source function
+ *
+ * DEPRECATED.  Do not use.
+ *
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 16) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+typedef enum __attribute__((
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+deprecated
+#endif
+)) {
+    RS_BLEND_SRC_ZERO = 0,
+    RS_BLEND_SRC_ONE = 1,
+    RS_BLEND_SRC_DST_COLOR = 2,
+    RS_BLEND_SRC_ONE_MINUS_DST_COLOR = 3,
+    RS_BLEND_SRC_SRC_ALPHA = 4,
+    RS_BLEND_SRC_ONE_MINUS_SRC_ALPHA = 5,
+    RS_BLEND_SRC_DST_ALPHA = 6,
+    RS_BLEND_SRC_ONE_MINUS_DST_ALPHA = 7,
+    RS_BLEND_SRC_SRC_ALPHA_SATURATE = 8,
+    RS_BLEND_SRC_INVALID = 100
+} rs_blend_src_func;
+#endif
+#endif
+
+/*
+ * rs_blend_dst_func: Blend destination function
+ *
+ * DEPRECATED.  Do not use.
+ *
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 16) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+typedef enum __attribute__((
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+deprecated
+#endif
+)) {
+    RS_BLEND_DST_ZERO = 0,
+    RS_BLEND_DST_ONE = 1,
+    RS_BLEND_DST_SRC_COLOR = 2,
+    RS_BLEND_DST_ONE_MINUS_SRC_COLOR = 3,
+    RS_BLEND_DST_SRC_ALPHA = 4,
+    RS_BLEND_DST_ONE_MINUS_SRC_ALPHA = 5,
+    RS_BLEND_DST_DST_ALPHA = 6,
+    RS_BLEND_DST_ONE_MINUS_DST_ALPHA = 7,
+    RS_BLEND_DST_INVALID = 100
+} rs_blend_dst_func;
+#endif
+#endif
+
+/*
+ * rs_cull_mode: Culling mode
+ *
+ * DEPRECATED.  Do not use.
+ *
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 16) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+typedef enum __attribute__((
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+deprecated
+#endif
+)) {
+    RS_CULL_BACK = 0,
+    RS_CULL_FRONT = 1,
+    RS_CULL_NONE = 2,
+    RS_CULL_INVALID = 100
+} rs_cull_mode;
+#endif
+#endif
+
+/*
+ * rs_depth_func: Depth function
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Specifies conditional drawing depending on the comparison of the incoming
+ * depth to that found in the depth buffer.
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 16) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+typedef enum __attribute__((
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+deprecated
+#endif
+)) {
+    RS_DEPTH_FUNC_ALWAYS = 0, // Always drawn
+    RS_DEPTH_FUNC_LESS = 1, // Drawn if the incoming depth value is less than that in the depth buffer
+    RS_DEPTH_FUNC_LEQUAL = 2, // Drawn if the incoming depth value is less or equal to that in the depth buffer
+    RS_DEPTH_FUNC_GREATER = 3, // Drawn if the incoming depth value is greater than that in the depth buffer
+    RS_DEPTH_FUNC_GEQUAL = 4, // Drawn if the incoming depth value is greater or equal to that in the depth buffer
+    RS_DEPTH_FUNC_EQUAL = 5, // Drawn if the incoming depth value is equal to that in the depth buffer
+    RS_DEPTH_FUNC_NOTEQUAL = 6, // Drawn if the incoming depth value is not equal to that in the depth buffer
+    RS_DEPTH_FUNC_INVALID = 100 // Invalid depth function
+} rs_depth_func;
+#endif
+#endif
+
+/*
+ * rs_primitive: How to interpret mesh vertex data
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Describes the way mesh vertex data is interpreted when rendering
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 16) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+typedef enum __attribute__((
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+deprecated
+#endif
+)) {
+    RS_PRIMITIVE_POINT = 0, // Vertex data will be rendered as a series of points
+    RS_PRIMITIVE_LINE = 1, // Vertex pairs will be rendered as lines
+    RS_PRIMITIVE_LINE_STRIP = 2, // Vertex data will be rendered as a connected line strip
+    RS_PRIMITIVE_TRIANGLE = 3, // Vertices will be rendered as individual triangles
+    RS_PRIMITIVE_TRIANGLE_STRIP = 4, // Vertices will be rendered as a connected triangle strip defined by the first three vertices with each additional triangle defined by a new vertex
+    RS_PRIMITIVE_TRIANGLE_FAN = 5, // Vertices will be rendered as a sequence of triangles that all share the first vertex as the origin
+    RS_PRIMITIVE_INVALID = 100 // Invalid primitive
+} rs_primitive;
+#endif
+#endif
+
+/*
+ * rs_font: Handle to a Font
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Opaque handle to a RenderScript font object.
+ * See: android.renderscript.Font
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+typedef struct rs_font _RS_OBJECT_DECL __attribute__((
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+deprecated
+#endif
+)) rs_font;
+#endif
+#endif
+
+/*
+ * rs_mesh: Handle to a Mesh
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Opaque handle to a RenderScript mesh object.
+ * See: android.renderscript.Mesh
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+typedef struct rs_mesh _RS_OBJECT_DECL __attribute__((
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+deprecated
+#endif
+)) rs_mesh;
+#endif
+#endif
+
+/*
+ * rs_program_fragment: Handle to a ProgramFragment
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Opaque handle to a RenderScript ProgramFragment object.
+ * See: android.renderscript.ProgramFragment
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+typedef struct rs_program_fragment _RS_OBJECT_DECL __attribute__((
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+deprecated
+#endif
+)) rs_program_fragment;
+#endif
+#endif
+
+/*
+ * rs_program_vertex: Handle to a ProgramVertex
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Opaque handle to a RenderScript ProgramVertex object.
+ * See: android.renderscript.ProgramVertex
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+typedef struct rs_program_vertex _RS_OBJECT_DECL __attribute__((
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+deprecated
+#endif
+)) rs_program_vertex;
+#endif
+#endif
+
+/*
+ * rs_program_raster: Handle to a ProgramRaster
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Opaque handle to a RenderScript ProgramRaster object.
+ * See: android.renderscript.ProgramRaster
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+typedef struct rs_program_raster _RS_OBJECT_DECL __attribute__((
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+deprecated
+#endif
+)) rs_program_raster;
+#endif
+#endif
+
+/*
+ * rs_program_store: Handle to a ProgramStore
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Opaque handle to a RenderScript ProgramStore object.
+ * See: android.renderscript.ProgramStore
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+typedef struct rs_program_store _RS_OBJECT_DECL __attribute__((
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+deprecated
+#endif
+)) rs_program_store;
+#endif
+#endif
+
+/*
+ * rsClearObject: Release an object
+ *
+ * Tells the run time that this handle will no longer be used to access the related
+ * object.  If this was the last handle to that object, resource recovery may happen.
+ *
+ * After calling this function, *dst will be set to an empty handle.  See rsIsObject().
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (RS_VERSION <= 22)
+extern void __attribute__((overloadable))
+    rsClearObject(rs_mesh* dst);
+#endif
+#endif
+
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (RS_VERSION <= 22)
+extern void __attribute__((overloadable))
+    rsClearObject(rs_program_fragment* dst);
+#endif
+#endif
+
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (RS_VERSION <= 22)
+extern void __attribute__((overloadable))
+    rsClearObject(rs_program_vertex* dst);
+#endif
+#endif
+
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (RS_VERSION <= 22)
+extern void __attribute__((overloadable))
+    rsClearObject(rs_program_raster* dst);
+#endif
+#endif
+
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (RS_VERSION <= 22)
+extern void __attribute__((overloadable))
+    rsClearObject(rs_program_store* dst);
+#endif
+#endif
+
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (RS_VERSION <= 22)
+extern void __attribute__((overloadable))
+    rsClearObject(rs_font* dst);
+#endif
+#endif
+
+/*
+ * rsIsObject: Check for an empty handle
+ *
+ * Returns true if the handle contains a non-null reference.
+ *
+ * This function does not validate that the internal pointer used in the handle
+ * points to an actual valid object; it only checks for null.
+ *
+ * This function can be used to check the Element returned by rsElementGetSubElement()
+ * or see if rsClearObject() has been called on a handle.
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (RS_VERSION <= 22)
+extern bool __attribute__((overloadable))
+    rsIsObject(rs_mesh v);
+#endif
+#endif
+
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (RS_VERSION <= 22)
+extern bool __attribute__((overloadable))
+    rsIsObject(rs_program_fragment v);
+#endif
+#endif
+
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (RS_VERSION <= 22)
+extern bool __attribute__((overloadable))
+    rsIsObject(rs_program_vertex v);
+#endif
+#endif
+
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (RS_VERSION <= 22)
+extern bool __attribute__((overloadable))
+    rsIsObject(rs_program_raster v);
+#endif
+#endif
+
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (RS_VERSION <= 22)
+extern bool __attribute__((overloadable))
+    rsIsObject(rs_program_store v);
+#endif
+#endif
+
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (RS_VERSION <= 22)
+extern bool __attribute__((overloadable))
+    rsIsObject(rs_font v);
+#endif
+#endif
+
+/*
+ * rsSetObject: For internal use.
+ *
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (RS_VERSION <= 22)
+extern void __attribute__((overloadable))
+    rsSetObject(rs_mesh* dst, rs_mesh src);
+#endif
+#endif
+
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (RS_VERSION <= 22)
+extern void __attribute__((overloadable))
+    rsSetObject(rs_program_fragment* dst, rs_program_fragment src);
+#endif
+#endif
+
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (RS_VERSION <= 22)
+extern void __attribute__((overloadable))
+    rsSetObject(rs_program_vertex* dst, rs_program_vertex src);
+#endif
+#endif
+
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (RS_VERSION <= 22)
+extern void __attribute__((overloadable))
+    rsSetObject(rs_program_raster* dst, rs_program_raster src);
+#endif
+#endif
+
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (RS_VERSION <= 22)
+extern void __attribute__((overloadable))
+    rsSetObject(rs_program_store* dst, rs_program_store src);
+#endif
+#endif
+
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (RS_VERSION <= 22)
+extern void __attribute__((overloadable))
+    rsSetObject(rs_font* dst, rs_font src);
+#endif
+#endif
+
+/*
+ * rsgAllocationSyncAll: Sync the contents of an allocation
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Sync the contents of an allocation.
+ *
+ * If the source is specified, sync from memory space specified by source.
+ *
+ * If the source is not specified, sync from its SCRIPT memory space to its HW
+ * memory spaces.
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgAllocationSyncAll(rs_allocation alloc);
+#endif
+#endif
+
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 14) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgAllocationSyncAll(rs_allocation alloc, rs_allocation_usage_type source);
+#endif
+#endif
+
+/*
+ * rsgBindColorTarget: Set the color target
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Set the color target used for all subsequent rendering calls
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 14) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgBindColorTarget(rs_allocation colorTarget, uint slot);
+#endif
+#endif
+
+/*
+ * rsgBindConstant: Bind a constant allocation
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Bind a new Allocation object to a ProgramFragment or ProgramVertex.
+ * The Allocation must be a valid constant input for the Program.
+ *
+ * Parameters:
+ *   ps: program fragment object
+ *   slot: index of the constant buffer on the program
+ *   c: constants to bind
+ *   pv: program vertex object
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgBindConstant(rs_program_fragment ps, uint slot, rs_allocation c);
+#endif
+#endif
+
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgBindConstant(rs_program_vertex pv, uint slot, rs_allocation c);
+#endif
+#endif
+
+/*
+ * rsgBindDepthTarget: Set the depth target
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Set the depth target used for all subsequent rendering calls
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 14) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgBindDepthTarget(rs_allocation depthTarget);
+#endif
+#endif
+
+/*
+ * rsgBindFont: Bind a font object
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Binds the font object to be used for all subsequent font rendering calls
+ *
+ * Parameters:
+ *   font: object to bind
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgBindFont(rs_font font);
+#endif
+#endif
+
+/*
+ * rsgBindProgramFragment: Bind a ProgramFragment
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Bind a new ProgramFragment to the rendering context.
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgBindProgramFragment(rs_program_fragment pf);
+#endif
+#endif
+
+/*
+ * rsgBindProgramRaster: Bind a ProgramRaster
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Bind a new ProgramRaster to the rendering context.
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgBindProgramRaster(rs_program_raster pr);
+#endif
+#endif
+
+/*
+ * rsgBindProgramStore: Bind a ProgramStore
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Bind a new ProgramStore to the rendering context.
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgBindProgramStore(rs_program_store ps);
+#endif
+#endif
+
+/*
+ * rsgBindProgramVertex: Bind a ProgramVertex
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Bind a new ProgramVertex to the rendering context.
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgBindProgramVertex(rs_program_vertex pv);
+#endif
+#endif
+
+/*
+ * rsgBindSampler: Bind a sampler
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Bind a new Sampler object to a ProgramFragment.  The sampler will
+ * operate on the texture bound at the matching slot.
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgBindSampler(rs_program_fragment fragment, uint slot, rs_sampler sampler);
+#endif
+#endif
+
+/*
+ * rsgBindTexture: Bind a texture allocation
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Bind a new Allocation object to a ProgramFragment.  The
+ * Allocation must be a valid texture for the Program.  The sampling
+ * of the texture will be controlled by the Sampler bound at the
+ * matching slot.
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgBindTexture(rs_program_fragment v, uint slot, rs_allocation alloc);
+#endif
+#endif
+
+/*
+ * rsgClearAllRenderTargets: Clear all color and depth targets
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Clear all color and depth targets and resume rendering into
+ * the framebuffer
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 14) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgClearAllRenderTargets(void);
+#endif
+#endif
+
+/*
+ * rsgClearColor: Clear the specified color from the surface
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Clears the rendering surface to the specified color.
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgClearColor(float r, float g, float b, float a);
+#endif
+#endif
+
+/*
+ * rsgClearColorTarget: Clear the color target
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Clear the previously set color target
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 14) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgClearColorTarget(uint slot);
+#endif
+#endif
+
+/*
+ * rsgClearDepth: Clear the depth surface
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Clears the depth surface to the specified value.
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgClearDepth(float value);
+#endif
+#endif
+
+/*
+ * rsgClearDepthTarget: Clear the depth target
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Clear the previously set depth target
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 14) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgClearDepthTarget(void);
+#endif
+#endif
+
+/*
+ * rsgDrawMesh: Draw a mesh
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Draw a mesh using the current context state.
+ *
+ * If primitiveIndex is specified, draw part of a mesh using the current context state.
+ *
+ * If start and len are also specified, draw specified index range of part of a mesh using the current context state.
+ *
+ * Otherwise the whole mesh is rendered.
+ *
+ * Parameters:
+ *   ism: mesh object to render
+ *   primitiveIndex: for meshes that contain multiple primitive groups this parameter specifies the index of the group to draw.
+ *   start: starting index in the range
+ *   len: number of indices to draw
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgDrawMesh(rs_mesh ism);
+#endif
+#endif
+
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgDrawMesh(rs_mesh ism, uint primitiveIndex);
+#endif
+#endif
+
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgDrawMesh(rs_mesh ism, uint primitiveIndex, uint start, uint len);
+#endif
+#endif
+
+/*
+ * rsgDrawQuad: Draw a quad
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Low performance utility function for drawing a simple quad.  Not intended for
+ * drawing large quantities of geometry.
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgDrawQuad(float x1, float y1, float z1, float x2, float y2, float z2, float x3, float y3,
+                float z3, float x4, float y4, float z4);
+#endif
+#endif
+
+/*
+ * rsgDrawQuadTexCoords: Draw a textured quad
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Low performance utility function for drawing a textured quad.  Not intended
+ * for drawing large quantities of geometry.
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgDrawQuadTexCoords(float x1, float y1, float z1, float u1, float v1, float x2, float y2,
+                         float z2, float u2, float v2, float x3, float y3, float z3, float u3,
+                         float v3, float x4, float y4, float z4, float u4, float v4);
+#endif
+#endif
+
+/*
+ * rsgDrawRect: Draw a rectangle
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Low performance utility function for drawing a simple rectangle.  Not
+ * intended for drawing large quantities of geometry.
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgDrawRect(float x1, float y1, float x2, float y2, float z);
+#endif
+#endif
+
+/*
+ * rsgDrawSpriteScreenspace: Draw rectangles in screenspace
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Low performance function for drawing rectangles in screenspace.  This
+ * function uses the default passthrough ProgramVertex.  Any bound ProgramVertex
+ * is ignored.  This function has considerable overhead and should not be used
+ * for drawing in shipping applications.
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgDrawSpriteScreenspace(float x, float y, float z, float w, float h);
+#endif
+#endif
+
+/*
+ * rsgDrawText: Draw a text string
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Draws text given a string and location
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgDrawText(const char* text, int x, int y);
+#endif
+#endif
+
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgDrawText(rs_allocation alloc, int x, int y);
+#endif
+#endif
+
+/*
+ * rsgFinish: End rendering commands
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Force RenderScript to finish all rendering commands
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 14) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern uint __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgFinish(void);
+#endif
+#endif
+
+/*
+ * rsgFontColor: Set the font color
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Sets the font color for all subsequent rendering calls
+ *
+ * Parameters:
+ *   r: red component
+ *   g: green component
+ *   b: blue component
+ *   a: alpha component
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgFontColor(float r, float g, float b, float a);
+#endif
+#endif
+
+/*
+ * rsgGetHeight: Get the surface height
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Get the height of the current rendering surface.
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern uint __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgGetHeight(void);
+#endif
+#endif
+
+/*
+ * rsgGetWidth: Get the surface width
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Get the width of the current rendering surface.
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern uint __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgGetWidth(void);
+#endif
+#endif
+
+/*
+ * rsgMeasureText: Get the bounding box for a text string
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Returns the bounding box of the text relative to (0, 0).
+ * Any of left, right, top, or bottom may be NULL.
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgMeasureText(const char* text, int* left, int* right, int* top, int* bottom);
+#endif
+#endif
+
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgMeasureText(rs_allocation alloc, int* left, int* right, int* top, int* bottom);
+#endif
+#endif
+
+/*
+ * rsgMeshComputeBoundingBox: Compute a bounding box
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Computes an axis aligned bounding box of a mesh object
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgMeshComputeBoundingBox(rs_mesh mesh, float* minX, float* minY, float* minZ, float* maxX,
+                              float* maxY, float* maxZ);
+#endif
+#endif
+
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+static inline void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgMeshComputeBoundingBox(rs_mesh mesh, float3* bBoxMin, float3* bBoxMax) {
+    float x1, y1, z1, x2, y2, z2;
+    rsgMeshComputeBoundingBox(mesh, &x1, &y1, &z1, &x2, &y2, &z2);
+    bBoxMin->x = x1;
+    bBoxMin->y = y1;
+    bBoxMin->z = z1;
+    bBoxMax->x = x2;
+    bBoxMax->y = y2;
+    bBoxMax->z = z2;
+}
+#endif
+#endif
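+
+/*
+ * Illustrative sketch (not part of the generated header): assuming a script
+ * global "gMesh" bound from Java, the float3 convenience overload above could
+ * be used from an invokable function as follows; the names are assumptions.
+ *
+ *   rs_mesh gMesh;
+ *   void computeBounds() {
+ *       float3 bbMin, bbMax;
+ *       rsgMeshComputeBoundingBox(gMesh, &bbMin, &bbMax);
+ *       rsDebug("bbMin", bbMin);   // print the lower corner
+ *       rsDebug("bbMax", bbMax);   // print the upper corner
+ *   }
+ */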
+
+/*
+ * rsgMeshGetIndexAllocation: Return an allocation containing index data
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Returns an allocation containing index data or a null
+ * allocation if only the primitive is specified
+ *
+ * Parameters:
+ *   m: mesh to get data from
+ *   index: index of the index allocation
+ *
+ * Returns: allocation containing index data
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 16) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern rs_allocation __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgMeshGetIndexAllocation(rs_mesh m, uint32_t index);
+#endif
+#endif
+
+/*
+ * rsgMeshGetPrimitive: Return the primitive
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Returns the primitive describing how a part of the mesh is
+ * rendered
+ *
+ * Parameters:
+ *   m: mesh to get data from
+ *   index: index of the primitive
+ *
+ * Returns: primitive describing how the mesh is rendered
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 16) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern rs_primitive __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgMeshGetPrimitive(rs_mesh m, uint32_t index);
+#endif
+#endif
+
+/*
+ * rsgMeshGetPrimitiveCount: Return the number of index sets
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Meshes can have multiple index sets; this function returns
+ * their count.
+ *
+ * Parameters:
+ *   m: mesh to get data from
+ *
+ * Returns: number of primitive groups in the mesh. This would include simple primitives as well as allocations containing index data
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 16) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern uint32_t __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgMeshGetPrimitiveCount(rs_mesh m);
+#endif
+#endif
+
+/*
+ * rsgMeshGetVertexAllocation: Return a vertex allocation
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Returns an allocation that is part of the mesh and contains
+ * vertex data, e.g. positions, normals, texcoords
+ *
+ * Parameters:
+ *   m: mesh to get data from
+ *   index: index of the vertex allocation
+ *
+ * Returns: allocation containing vertex data
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 16) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern rs_allocation __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgMeshGetVertexAllocation(rs_mesh m, uint32_t index);
+#endif
+#endif
+
+/*
+ * rsgMeshGetVertexAllocationCount: Return the number of vertex allocations
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Returns the number of allocations in the mesh that contain
+ * vertex data
+ *
+ * Parameters:
+ *   m: mesh to get data from
+ *
+ * Returns: number of allocations in the mesh that contain vertex data
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 16) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern uint32_t __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgMeshGetVertexAllocationCount(rs_mesh m);
+#endif
+#endif
+
+/*
+ * rsgProgramFragmentConstantColor: Set the constant color for a fixed function emulation program
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Set the constant color for a fixed function emulation program.
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgProgramFragmentConstantColor(rs_program_fragment pf, float r, float g, float b, float a);
+#endif
+#endif
+
+/*
+ * rsgProgramVertexGetProjectionMatrix: Get the projection matrix for a fixed function vertex program
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Get the projection matrix for a currently bound fixed function
+ * vertex program. Calling this function with a custom vertex shader
+ * would result in an error.
+ *
+ * Parameters:
+ *   proj: matrix to store the current projection matrix into
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgProgramVertexGetProjectionMatrix(rs_matrix4x4* proj);
+#endif
+#endif
+
+/*
+ * rsgProgramVertexLoadModelMatrix: Load the model matrix for a bound fixed function vertex program
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Load the model matrix for a currently bound fixed function
+ * vertex program. Calling this function with a custom vertex shader
+ * would result in an error.
+ *
+ * Parameters:
+ *   model: model matrix
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgProgramVertexLoadModelMatrix(const rs_matrix4x4* model);
+#endif
+#endif
+
+/*
+ * rsgProgramVertexLoadProjectionMatrix: Load the projection matrix for a bound fixed function vertex program
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Load the projection matrix for a currently bound fixed function
+ * vertex program. Calling this function with a custom vertex shader
+ * would result in an error.
+ *
+ * Parameters:
+ *   proj: projection matrix
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgProgramVertexLoadProjectionMatrix(const rs_matrix4x4* proj);
+#endif
+#endif
+
+/*
+ * rsgProgramVertexLoadTextureMatrix: Load the texture matrix for a bound fixed function vertex program
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Load the texture matrix for a currently bound fixed function
+ * vertex program. Calling this function with a custom vertex shader
+ * would result in an error.
+ *
+ * Parameters:
+ *   tex: texture matrix
+ */
+#ifndef __LP64__
+#if !defined(RS_VERSION) || (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22)
+extern void __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgProgramVertexLoadTextureMatrix(const rs_matrix4x4* tex);
+#endif
+#endif
+
+/*
+ * rsgProgramRasterGetCullMode: Get program raster cull mode
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Get program raster cull mode
+ *
+ * Parameters:
+ *   pr: program raster to query
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 16) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern rs_cull_mode __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgProgramRasterGetCullMode(rs_program_raster pr);
+#endif
+#endif
+
+/*
+ * rsgProgramRasterIsPointSpriteEnabled: Get program raster point sprite state
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Get program raster point sprite state
+ *
+ * Parameters:
+ *   pr: program raster to query
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 16) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern bool __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgProgramRasterIsPointSpriteEnabled(rs_program_raster pr);
+#endif
+#endif
+
+/*
+ * rsgProgramStoreGetBlendDstFunc: Get program store blend destination function
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Get program store blend destination function
+ *
+ * Parameters:
+ *   ps: program store to query
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 16) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern rs_blend_dst_func __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgProgramStoreGetBlendDstFunc(rs_program_store ps);
+#endif
+#endif
+
+/*
+ * rsgProgramStoreGetBlendSrcFunc: Get program store blend source function
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Get program store blend source function
+ *
+ * Parameters:
+ *   ps: program store to query
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 16) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern rs_blend_src_func __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgProgramStoreGetBlendSrcFunc(rs_program_store ps);
+#endif
+#endif
+
+/*
+ * rsgProgramStoreGetDepthFunc: Get program store depth function
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Get program store depth function
+ *
+ * Parameters:
+ *   ps: program store to query
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 16) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern rs_depth_func __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgProgramStoreGetDepthFunc(rs_program_store ps);
+#endif
+#endif
+
+/*
+ * rsgProgramStoreIsColorMaskAlphaEnabled: Get program store alpha component color mask
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Get program store alpha component color mask
+ *
+ * Parameters:
+ *   ps: program store to query
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 16) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern bool __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgProgramStoreIsColorMaskAlphaEnabled(rs_program_store ps);
+#endif
+#endif
+
+/*
+ * rsgProgramStoreIsColorMaskBlueEnabled: Get program store blue component color mask
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Get program store blue component color mask
+ *
+ * Parameters:
+ *   ps: program store to query
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 16) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern bool __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgProgramStoreIsColorMaskBlueEnabled(rs_program_store ps);
+#endif
+#endif
+
+/*
+ * rsgProgramStoreIsColorMaskGreenEnabled: Get program store green component color mask
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Get program store green component color mask
+ *
+ * Parameters:
+ *   ps: program store to query
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 16) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern bool __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgProgramStoreIsColorMaskGreenEnabled(rs_program_store ps);
+#endif
+#endif
+
+/*
+ * rsgProgramStoreIsColorMaskRedEnabled: Get program store red component color mask
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Get program store red component color mask
+ *
+ * Parameters:
+ *   ps: program store to query
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 16) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern bool __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgProgramStoreIsColorMaskRedEnabled(rs_program_store ps);
+#endif
+#endif
+
+/*
+ * rsgProgramStoreIsDepthMaskEnabled: Get program store depth mask
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Get program store depth mask
+ *
+ * Parameters:
+ *   ps: program store to query
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 16) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern bool __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgProgramStoreIsDepthMaskEnabled(rs_program_store ps);
+#endif
+#endif
+
+/*
+ * rsgProgramStoreIsDitherEnabled: Get program store dither state
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Get program store dither state
+ *
+ * Parameters:
+ *   ps: program store to query
+ */
+#ifndef __LP64__
+#if (defined(RS_VERSION) && (RS_VERSION >= 16) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 22))
+extern bool __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated
+#endif
+))
+    rsgProgramStoreIsDitherEnabled(rs_program_store ps);
+#endif
+#endif
+
+#endif // RENDERSCRIPT_RS_GRAPHICS_RSH
diff --git a/25.0.2/include/rs_io.rsh b/25.0.2/include/rs_io.rsh
new file mode 100644
index 0000000..2ffbe4b
--- /dev/null
+++ b/25.0.2/include/rs_io.rsh
@@ -0,0 +1,107 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Don't edit this file!  It is auto-generated by frameworks/rs/api/generate.sh.
+
+/*
+ * rs_io.rsh: Input/Output Functions
+ *
+ * These functions are used to:
+ * - Send information to the Java client, and
+ * - Send the processed allocation or receive the next allocation to process.
+ */
+
+#ifndef RENDERSCRIPT_RS_IO_RSH
+#define RENDERSCRIPT_RS_IO_RSH
+
+/*
+ * rsAllocationIoReceive: Receive new content from the queue
+ *
+ * Receive a new set of contents from the queue.
+ *
+ * This function should not be called from inside a kernel, or from any function
+ * that may be called directly or indirectly from a kernel. Doing so would cause a
+ * runtime error.
+ *
+ * Parameters:
+ *   a: Allocation to work on.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+extern void __attribute__((overloadable))
+    rsAllocationIoReceive(rs_allocation a);
+#endif
+
+/*
+ * rsAllocationIoSend: Send new content to the queue
+ *
+ * Send the contents of the Allocation to the queue.
+ *
+ * This function should not be called from inside a kernel, or from any function
+ * that may be called directly or indirectly from a kernel. Doing so would cause a
+ * runtime error.
+ *
+ * Parameters:
+ *   a: Allocation to work on.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+extern void __attribute__((overloadable))
+    rsAllocationIoSend(rs_allocation a);
+#endif
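+
+/*
+ * Illustrative sketch (not part of the generated header): a typical
+ * receive/process/send cycle driven from an invoked function (never from a
+ * kernel).  The globals "gInput" and "gOutput" are assumptions and would be
+ * USAGE_IO allocations configured from the Java side.
+ *
+ *   rs_allocation gInput;
+ *   rs_allocation gOutput;
+ *   void processFrame() {
+ *       rsAllocationIoReceive(gInput);   // latch the newest queued buffer
+ *       // ... run kernels reading gInput and writing gOutput here ...
+ *       rsAllocationIoSend(gOutput);     // publish the processed buffer
+ *   }
+ */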
+
+/*
+ * rsSendToClient: Send a message to the client, non-blocking
+ *
+ * Sends a message back to the client.  This call does not block.
+ * It returns true if the message was sent and false if the
+ * message queue is full.
+ *
+ * A message ID is required.  The data payload is optional.
+ *
+ * See RenderScript.RSMessageHandler.
+ *
+ * Parameters:
+ *   data: Application specific data.
+ *   len: Length of the data, in bytes.
+ */
+extern bool __attribute__((overloadable))
+    rsSendToClient(int cmdID);
+
+extern bool __attribute__((overloadable))
+    rsSendToClient(int cmdID, const void* data, uint len);
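+
+/*
+ * Illustrative sketch (not part of the generated header): posting a
+ * hypothetical progress message with a small payload.  The message ID and
+ * function name are assumptions and must match what the Java-side
+ * RSMessageHandler expects.
+ *
+ *   #define MSG_PROGRESS 100
+ *   void reportProgress(int percent) {
+ *       if (!rsSendToClient(MSG_PROGRESS, &percent, sizeof(percent))) {
+ *           // non-blocking variant: the queue was full and the message was dropped
+ *       }
+ *   }
+ */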
+
+/*
+ * rsSendToClientBlocking: Send a message to the client, blocking
+ *
+ * Sends a message back to the client.  This function will block
+ * until there is room on the message queue for this message.
+ * This function may return before the message has been delivered and
+ * processed by the client.
+ *
+ * A message ID is required.  The data payload is optional.
+ *
+ * See RenderScript.RSMessageHandler.
+ *
+ * Parameters:
+ *   data: Application specific data.
+ *   len: Length of the data, in bytes.
+ */
+extern void __attribute__((overloadable))
+    rsSendToClientBlocking(int cmdID);
+
+extern void __attribute__((overloadable))
+    rsSendToClientBlocking(int cmdID, const void* data, uint len);
+
+#endif // RENDERSCRIPT_RS_IO_RSH
diff --git a/25.0.2/include/rs_math.rsh b/25.0.2/include/rs_math.rsh
new file mode 100644
index 0000000..3d034d0
--- /dev/null
+++ b/25.0.2/include/rs_math.rsh
@@ -0,0 +1,6550 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Don't edit this file!  It is auto-generated by frameworks/rs/api/generate.sh.
+
+/*
+ * rs_math.rsh: Mathematical Constants and Functions
+ *
+ * The mathematical functions below can be applied to scalars and vectors.   When applied
+ * to vectors, the returned value is a vector of the function applied to each entry of the input.
+ *
+ * For example:
+ * float3 a, b;
+ * // The following call sets
+ * //   a.x to sin(b.x),
+ * //   a.y to sin(b.y), and
+ * //   a.z to sin(b.z).
+ * a = sin(b);
+ *
+ *
+ * See Vector Math Functions for functions like distance() and length() that instead
+ * interpret the input as a single vector in n-dimensional space.
+ *
+ * The precision of the mathematical operations on 32 bit floats is affected by the pragmas
+ * rs_fp_relaxed and rs_fp_full.  Under rs_fp_relaxed, subnormal values may be flushed to zero and
+ * rounding may be done towards zero.  In comparison, rs_fp_full requires correct handling of
+ * subnormal values, i.e. smaller than 1.17549435e-38f.  rs_fp_full also requires round to nearest
+ * with ties to even.
+ *
+ * Different precision/speed tradeoffs can be achieved by using variants of the common math
+ * functions.  Functions with a name starting with
+ * - native_: May have custom hardware implementations with weaker precision.  Additionally,
+ *   subnormal values may be flushed to zero, rounding towards zero may be used, and NaN and
+ *   infinity input may not be handled correctly.
+ * - half_: May perform internal computations using 16 bit floats.  Additionally, subnormal
+ *   values may be flushed to zero, and rounding towards zero may be used.
+ *
+ */
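+
+/*
+ * Illustrative sketch (not part of the generated header): choosing between the
+ * precision/speed variants described above.  Exact accuracy differences are
+ * device dependent; sqrt(), half_sqrt(), and native_sqrt() are the ordinary,
+ * half_, and native_ forms of the same operation declared later in this file.
+ *
+ *   float chooseSqrt(float v) {
+ *       float precise = sqrt(v);         // honors rs_fp_full / rs_fp_relaxed
+ *       float faster  = half_sqrt(v);    // may use 16 bit internal math
+ *       float fastest = native_sqrt(v);  // may use weaker hardware paths
+ *       return precise + faster + fastest;
+ *   }
+ */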
+
+#ifndef RENDERSCRIPT_RS_MATH_RSH
+#define RENDERSCRIPT_RS_MATH_RSH
+
+/*
+ * M_1_PI: 1 / pi, as a 32 bit float
+ *
+ * The inverse of pi, as a 32 bit float.
+ */
+#define M_1_PI 0.318309886183790671537767526745028724f
+
+/*
+ * M_2_PI: 2 / pi, as a 32 bit float
+ *
+ * 2 divided by pi, as a 32 bit float.
+ */
+#define M_2_PI 0.636619772367581343075535053490057448f
+
+/*
+ * M_2_PIl: 2 / pi, as a 32 bit float
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * 2 divided by pi, as a 32 bit float.
+ */
+#define M_2_PIl 0.636619772367581343075535053490057448f
+
+/*
+ * M_2_SQRTPI: 2 / sqrt(pi), as a 32 bit float
+ *
+ * 2 divided by the square root of pi, as a 32 bit float.
+ */
+#define M_2_SQRTPI 1.128379167095512573896158903121545172f
+
+/*
+ * M_E: e, as a 32 bit float
+ *
+ * The number e, the base of the natural logarithm, as a 32 bit float.
+ */
+#define M_E 2.718281828459045235360287471352662498f
+
+/*
+ * M_LN10: log_e(10), as a 32 bit float
+ *
+ * The natural logarithm of 10, as a 32 bit float.
+ */
+#define M_LN10 2.302585092994045684017991454684364208f
+
+/*
+ * M_LN2: log_e(2), as a 32 bit float
+ *
+ * The natural logarithm of 2, as a 32 bit float.
+ */
+#define M_LN2 0.693147180559945309417232121458176568f
+
+/*
+ * M_LOG10E: log_10(e), as a 32 bit float
+ *
+ * The logarithm base 10 of e, as a 32 bit float.
+ */
+#define M_LOG10E 0.434294481903251827651128918916605082f
+
+/*
+ * M_LOG2E: log_2(e), as a 32 bit float
+ *
+ * The logarithm base 2 of e, as a 32 bit float.
+ */
+#define M_LOG2E 1.442695040888963407359924681001892137f
+
+/*
+ * M_PI: pi, as a 32 bit float
+ *
+ * The constant pi, as a 32 bit float.
+ */
+#define M_PI 3.141592653589793238462643383279502884f
+
+/*
+ * M_PI_2: pi / 2, as a 32 bit float
+ *
+ * Pi divided by 2, as a 32 bit float.
+ */
+#define M_PI_2 1.570796326794896619231321691639751442f
+
+/*
+ * M_PI_4: pi / 4, as a 32 bit float
+ *
+ * Pi divided by 4, as a 32 bit float.
+ */
+#define M_PI_4 0.785398163397448309615660845819875721f
+
+/*
+ * M_SQRT1_2: 1 / sqrt(2), as a 32 bit float
+ *
+ * The inverse of the square root of 2, as a 32 bit float.
+ */
+#define M_SQRT1_2 0.707106781186547524400844362104849039f
+
+/*
+ * M_SQRT2: sqrt(2), as a 32 bit float
+ *
+ * The square root of 2, as a 32 bit float.
+ */
+#define M_SQRT2 1.414213562373095048801688724209698079f
+
+/*
+ * abs: Absolute value of an integer
+ *
+ * Returns the absolute value of an integer.
+ *
+ * For floats, use fabs().
+ */
+extern uchar __attribute__((const, overloadable))
+    abs(char v);
+
+extern uchar2 __attribute__((const, overloadable))
+    abs(char2 v);
+
+extern uchar3 __attribute__((const, overloadable))
+    abs(char3 v);
+
+extern uchar4 __attribute__((const, overloadable))
+    abs(char4 v);
+
+extern ushort __attribute__((const, overloadable))
+    abs(short v);
+
+extern ushort2 __attribute__((const, overloadable))
+    abs(short2 v);
+
+extern ushort3 __attribute__((const, overloadable))
+    abs(short3 v);
+
+extern ushort4 __attribute__((const, overloadable))
+    abs(short4 v);
+
+extern uint __attribute__((const, overloadable))
+    abs(int v);
+
+extern uint2 __attribute__((const, overloadable))
+    abs(int2 v);
+
+extern uint3 __attribute__((const, overloadable))
+    abs(int3 v);
+
+extern uint4 __attribute__((const, overloadable))
+    abs(int4 v);
+
+/*
+ * acos: Inverse cosine
+ *
+ * Returns the inverse cosine, in radians.
+ *
+ * See also native_acos().
+ */
+extern float __attribute__((const, overloadable))
+    acos(float v);
+
+extern float2 __attribute__((const, overloadable))
+    acos(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    acos(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    acos(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    acos(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    acos(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    acos(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    acos(half4 v);
+#endif
+
+/*
+ * acosh: Inverse hyperbolic cosine
+ *
+ * Returns the inverse hyperbolic cosine, in radians.
+ *
+ * See also native_acosh().
+ */
+extern float __attribute__((const, overloadable))
+    acosh(float v);
+
+extern float2 __attribute__((const, overloadable))
+    acosh(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    acosh(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    acosh(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    acosh(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    acosh(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    acosh(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    acosh(half4 v);
+#endif
+
+/*
+ * acospi: Inverse cosine divided by pi
+ *
+ * Returns the inverse cosine in radians, divided by pi.
+ *
+ * To get an inverse cosine measured in degrees, use acospi(a) * 180.f.
+ *
+ * See also native_acospi().
+ */
+extern float __attribute__((const, overloadable))
+    acospi(float v);
+
+extern float2 __attribute__((const, overloadable))
+    acospi(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    acospi(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    acospi(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    acospi(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    acospi(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    acospi(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    acospi(half4 v);
+#endif
+
+/*
+ * asin: Inverse sine
+ *
+ * Returns the inverse sine, in radians.
+ *
+ * See also native_asin().
+ */
+extern float __attribute__((const, overloadable))
+    asin(float v);
+
+extern float2 __attribute__((const, overloadable))
+    asin(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    asin(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    asin(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    asin(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    asin(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    asin(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    asin(half4 v);
+#endif
+
+/*
+ * asinh: Inverse hyperbolic sine
+ *
+ * Returns the inverse hyperbolic sine, in radians.
+ *
+ * See also native_asinh().
+ */
+extern float __attribute__((const, overloadable))
+    asinh(float v);
+
+extern float2 __attribute__((const, overloadable))
+    asinh(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    asinh(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    asinh(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    asinh(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    asinh(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    asinh(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    asinh(half4 v);
+#endif
+
+/*
+ * asinpi: Inverse sine divided by pi
+ *
+ * Returns the inverse sine in radians, divided by pi.
+ *
+ * To get an inverse sine measured in degrees, use asinpi(a) * 180.f.
+ *
+ * See also native_asinpi().
+ */
+extern float __attribute__((const, overloadable))
+    asinpi(float v);
+
+extern float2 __attribute__((const, overloadable))
+    asinpi(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    asinpi(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    asinpi(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    asinpi(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    asinpi(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    asinpi(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    asinpi(half4 v);
+#endif
+
+/*
+ * atan: Inverse tangent
+ *
+ * Returns the inverse tangent, in radians.
+ *
+ * See also native_atan().
+ */
+extern float __attribute__((const, overloadable))
+    atan(float v);
+
+extern float2 __attribute__((const, overloadable))
+    atan(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    atan(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    atan(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    atan(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    atan(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    atan(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    atan(half4 v);
+#endif
+
+/*
+ * atan2: Inverse tangent of a ratio
+ *
+ * Returns the inverse tangent of (numerator / denominator), in radians.
+ *
+ * See also native_atan2().
+ *
+ * Parameters:
+ *   numerator: Numerator.
+ *   denominator: Denominator.  Can be 0.
+ */
+extern float __attribute__((const, overloadable))
+    atan2(float numerator, float denominator);
+
+extern float2 __attribute__((const, overloadable))
+    atan2(float2 numerator, float2 denominator);
+
+extern float3 __attribute__((const, overloadable))
+    atan2(float3 numerator, float3 denominator);
+
+extern float4 __attribute__((const, overloadable))
+    atan2(float4 numerator, float4 denominator);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    atan2(half numerator, half denominator);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    atan2(half2 numerator, half2 denominator);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    atan2(half3 numerator, half3 denominator);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    atan2(half4 numerator, half4 denominator);
+#endif
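+
+/*
+ * Illustrative sketch (not part of the generated header): recovering the angle
+ * of a 2D direction vector.  The kernel name is an assumption; note that
+ * atan2() handles a zero denominator, unlike atan(dir.y / dir.x).
+ *
+ *   float __attribute__((kernel)) directionAngle(float2 dir) {
+ *       return atan2(dir.y, dir.x);   // radians in [-pi, pi]
+ *   }
+ */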
+
+/*
+ * atan2pi: Inverse tangent of a ratio, divided by pi
+ *
+ * Returns the inverse tangent of (numerator / denominator), in radians, divided by pi.
+ *
+ * To get an inverse tangent measured in degrees, use atan2pi(n, d) * 180.f.
+ *
+ * See also native_atan2pi().
+ *
+ * Parameters:
+ *   numerator: Numerator.
+ *   denominator: Denominator.  Can be 0.
+ */
+extern float __attribute__((const, overloadable))
+    atan2pi(float numerator, float denominator);
+
+extern float2 __attribute__((const, overloadable))
+    atan2pi(float2 numerator, float2 denominator);
+
+extern float3 __attribute__((const, overloadable))
+    atan2pi(float3 numerator, float3 denominator);
+
+extern float4 __attribute__((const, overloadable))
+    atan2pi(float4 numerator, float4 denominator);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    atan2pi(half numerator, half denominator);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    atan2pi(half2 numerator, half2 denominator);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    atan2pi(half3 numerator, half3 denominator);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    atan2pi(half4 numerator, half4 denominator);
+#endif
+
+/*
+ * atanh: Inverse hyperbolic tangent
+ *
+ * Returns the inverse hyperbolic tangent, in radians.
+ *
+ * See also native_atanh().
+ */
+extern float __attribute__((const, overloadable))
+    atanh(float v);
+
+extern float2 __attribute__((const, overloadable))
+    atanh(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    atanh(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    atanh(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    atanh(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    atanh(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    atanh(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    atanh(half4 v);
+#endif
+
+/*
+ * atanpi: Inverse tangent divided by pi
+ *
+ * Returns the inverse tangent in radians, divided by pi.
+ *
+ * To get an inverse tangent measured in degrees, use atanpi(a) * 180.f.
+ *
+ * See also native_atanpi().
+ */
+extern float __attribute__((const, overloadable))
+    atanpi(float v);
+
+extern float2 __attribute__((const, overloadable))
+    atanpi(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    atanpi(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    atanpi(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    atanpi(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    atanpi(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    atanpi(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    atanpi(half4 v);
+#endif
+
+/*
+ * cbrt: Cube root
+ *
+ * Returns the cube root.
+ *
+ * See also native_cbrt().
+ */
+extern float __attribute__((const, overloadable))
+    cbrt(float v);
+
+extern float2 __attribute__((const, overloadable))
+    cbrt(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    cbrt(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    cbrt(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    cbrt(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    cbrt(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    cbrt(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    cbrt(half4 v);
+#endif
+
+/*
+ * ceil: Smallest integer not less than a value
+ *
+ * Returns the smallest integer not less than a value.
+ *
+ * For example, ceil(1.2f) returns 2.f, and ceil(-1.2f) returns -1.f.
+ *
+ * See also floor().
+ */
+extern float __attribute__((const, overloadable))
+    ceil(float v);
+
+extern float2 __attribute__((const, overloadable))
+    ceil(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    ceil(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    ceil(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    ceil(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    ceil(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    ceil(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    ceil(half4 v);
+#endif
+
+/*
+ * clamp: Restrain a value to a range
+ *
+ * Clamps a value to a specified high and low bound.  clamp() returns min_value
+ * if value < min_value, max_value if value > max_value, otherwise value.
+ *
+ * There are two variants of clamp: one where the min and max are scalars applied
+ * to all entries of the value, the other where the min and max are also vectors.
+ *
+ * If min_value is greater than max_value, the results are undefined.
+ *
+ * Parameters:
+ *   value: Value to be clamped.
+ *   min_value: Lower bound, a scalar or matching vector.
+ *   max_value: Upper bound, must match the type of min_value.
+ */
+extern float __attribute__((const, overloadable))
+    clamp(float value, float min_value, float max_value);
+
+extern float2 __attribute__((const, overloadable))
+    clamp(float2 value, float2 min_value, float2 max_value);
+
+extern float3 __attribute__((const, overloadable))
+    clamp(float3 value, float3 min_value, float3 max_value);
+
+extern float4 __attribute__((const, overloadable))
+    clamp(float4 value, float4 min_value, float4 max_value);
+
+extern float2 __attribute__((const, overloadable))
+    clamp(float2 value, float min_value, float max_value);
+
+extern float3 __attribute__((const, overloadable))
+    clamp(float3 value, float min_value, float max_value);
+
+extern float4 __attribute__((const, overloadable))
+    clamp(float4 value, float min_value, float max_value);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern char __attribute__((const, overloadable))
+    clamp(char value, char min_value, char max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern char2 __attribute__((const, overloadable))
+    clamp(char2 value, char2 min_value, char2 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern char3 __attribute__((const, overloadable))
+    clamp(char3 value, char3 min_value, char3 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern char4 __attribute__((const, overloadable))
+    clamp(char4 value, char4 min_value, char4 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern uchar __attribute__((const, overloadable))
+    clamp(uchar value, uchar min_value, uchar max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern uchar2 __attribute__((const, overloadable))
+    clamp(uchar2 value, uchar2 min_value, uchar2 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern uchar3 __attribute__((const, overloadable))
+    clamp(uchar3 value, uchar3 min_value, uchar3 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern uchar4 __attribute__((const, overloadable))
+    clamp(uchar4 value, uchar4 min_value, uchar4 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern short __attribute__((const, overloadable))
+    clamp(short value, short min_value, short max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern short2 __attribute__((const, overloadable))
+    clamp(short2 value, short2 min_value, short2 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern short3 __attribute__((const, overloadable))
+    clamp(short3 value, short3 min_value, short3 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern short4 __attribute__((const, overloadable))
+    clamp(short4 value, short4 min_value, short4 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern ushort __attribute__((const, overloadable))
+    clamp(ushort value, ushort min_value, ushort max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern ushort2 __attribute__((const, overloadable))
+    clamp(ushort2 value, ushort2 min_value, ushort2 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern ushort3 __attribute__((const, overloadable))
+    clamp(ushort3 value, ushort3 min_value, ushort3 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern ushort4 __attribute__((const, overloadable))
+    clamp(ushort4 value, ushort4 min_value, ushort4 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern int __attribute__((const, overloadable))
+    clamp(int value, int min_value, int max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern int2 __attribute__((const, overloadable))
+    clamp(int2 value, int2 min_value, int2 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern int3 __attribute__((const, overloadable))
+    clamp(int3 value, int3 min_value, int3 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern int4 __attribute__((const, overloadable))
+    clamp(int4 value, int4 min_value, int4 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern uint __attribute__((const, overloadable))
+    clamp(uint value, uint min_value, uint max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern uint2 __attribute__((const, overloadable))
+    clamp(uint2 value, uint2 min_value, uint2 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern uint3 __attribute__((const, overloadable))
+    clamp(uint3 value, uint3 min_value, uint3 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern uint4 __attribute__((const, overloadable))
+    clamp(uint4 value, uint4 min_value, uint4 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern long __attribute__((const, overloadable))
+    clamp(long value, long min_value, long max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern long2 __attribute__((const, overloadable))
+    clamp(long2 value, long2 min_value, long2 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern long3 __attribute__((const, overloadable))
+    clamp(long3 value, long3 min_value, long3 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern long4 __attribute__((const, overloadable))
+    clamp(long4 value, long4 min_value, long4 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern ulong __attribute__((const, overloadable))
+    clamp(ulong value, ulong min_value, ulong max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern ulong2 __attribute__((const, overloadable))
+    clamp(ulong2 value, ulong2 min_value, ulong2 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern ulong3 __attribute__((const, overloadable))
+    clamp(ulong3 value, ulong3 min_value, ulong3 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern ulong4 __attribute__((const, overloadable))
+    clamp(ulong4 value, ulong4 min_value, ulong4 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern char2 __attribute__((const, overloadable))
+    clamp(char2 value, char min_value, char max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern char3 __attribute__((const, overloadable))
+    clamp(char3 value, char min_value, char max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern char4 __attribute__((const, overloadable))
+    clamp(char4 value, char min_value, char max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern uchar2 __attribute__((const, overloadable))
+    clamp(uchar2 value, uchar min_value, uchar max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern uchar3 __attribute__((const, overloadable))
+    clamp(uchar3 value, uchar min_value, uchar max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern uchar4 __attribute__((const, overloadable))
+    clamp(uchar4 value, uchar min_value, uchar max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern short2 __attribute__((const, overloadable))
+    clamp(short2 value, short min_value, short max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern short3 __attribute__((const, overloadable))
+    clamp(short3 value, short min_value, short max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern short4 __attribute__((const, overloadable))
+    clamp(short4 value, short min_value, short max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern ushort2 __attribute__((const, overloadable))
+    clamp(ushort2 value, ushort min_value, ushort max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern ushort3 __attribute__((const, overloadable))
+    clamp(ushort3 value, ushort min_value, ushort max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern ushort4 __attribute__((const, overloadable))
+    clamp(ushort4 value, ushort min_value, ushort max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern int2 __attribute__((const, overloadable))
+    clamp(int2 value, int min_value, int max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern int3 __attribute__((const, overloadable))
+    clamp(int3 value, int min_value, int max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern int4 __attribute__((const, overloadable))
+    clamp(int4 value, int min_value, int max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern uint2 __attribute__((const, overloadable))
+    clamp(uint2 value, uint min_value, uint max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern uint3 __attribute__((const, overloadable))
+    clamp(uint3 value, uint min_value, uint max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern uint4 __attribute__((const, overloadable))
+    clamp(uint4 value, uint min_value, uint max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern long2 __attribute__((const, overloadable))
+    clamp(long2 value, long min_value, long max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern long3 __attribute__((const, overloadable))
+    clamp(long3 value, long min_value, long max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern long4 __attribute__((const, overloadable))
+    clamp(long4 value, long min_value, long max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern ulong2 __attribute__((const, overloadable))
+    clamp(ulong2 value, ulong min_value, ulong max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern ulong3 __attribute__((const, overloadable))
+    clamp(ulong3 value, ulong min_value, ulong max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 19))
+extern ulong4 __attribute__((const, overloadable))
+    clamp(ulong4 value, ulong min_value, ulong max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    clamp(half value, half min_value, half max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    clamp(half2 value, half2 min_value, half2 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    clamp(half3 value, half3 min_value, half3 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    clamp(half4 value, half4 min_value, half4 max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    clamp(half2 value, half min_value, half max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    clamp(half3 value, half min_value, half max_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    clamp(half4 value, half min_value, half max_value);
+#endif
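+
+/*
+ * Illustrative sketch (not part of the generated header): the two clamp
+ * variants described above, one with per-channel vector bounds and one with a
+ * shared scalar bound.  The function name is an assumption.
+ *
+ *   float4 clampColor(float4 color) {
+ *       float4 lo = {0.f, 0.f, 0.f, 1.f};     // per-channel lower bound
+ *       float4 hi = {1.f, 1.f, 1.f, 1.f};     // per-channel upper bound
+ *       float4 a  = clamp(color, lo, hi);     // vector-bound variant
+ *       float4 b  = clamp(color, 0.f, 1.f);   // scalar bound applied to all entries
+ *       return a * b;
+ *   }
+ */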
+
+/*
+ * clz: Number of leading 0 bits
+ *
+ * Returns the number of leading 0-bits in a value.
+ *
+ * For example, clz((char)0x03) returns 6.
+ */
+extern char __attribute__((const, overloadable))
+    clz(char value);
+
+extern char2 __attribute__((const, overloadable))
+    clz(char2 value);
+
+extern char3 __attribute__((const, overloadable))
+    clz(char3 value);
+
+extern char4 __attribute__((const, overloadable))
+    clz(char4 value);
+
+extern uchar __attribute__((const, overloadable))
+    clz(uchar value);
+
+extern uchar2 __attribute__((const, overloadable))
+    clz(uchar2 value);
+
+extern uchar3 __attribute__((const, overloadable))
+    clz(uchar3 value);
+
+extern uchar4 __attribute__((const, overloadable))
+    clz(uchar4 value);
+
+extern short __attribute__((const, overloadable))
+    clz(short value);
+
+extern short2 __attribute__((const, overloadable))
+    clz(short2 value);
+
+extern short3 __attribute__((const, overloadable))
+    clz(short3 value);
+
+extern short4 __attribute__((const, overloadable))
+    clz(short4 value);
+
+extern ushort __attribute__((const, overloadable))
+    clz(ushort value);
+
+extern ushort2 __attribute__((const, overloadable))
+    clz(ushort2 value);
+
+extern ushort3 __attribute__((const, overloadable))
+    clz(ushort3 value);
+
+extern ushort4 __attribute__((const, overloadable))
+    clz(ushort4 value);
+
+extern int __attribute__((const, overloadable))
+    clz(int value);
+
+extern int2 __attribute__((const, overloadable))
+    clz(int2 value);
+
+extern int3 __attribute__((const, overloadable))
+    clz(int3 value);
+
+extern int4 __attribute__((const, overloadable))
+    clz(int4 value);
+
+extern uint __attribute__((const, overloadable))
+    clz(uint value);
+
+extern uint2 __attribute__((const, overloadable))
+    clz(uint2 value);
+
+extern uint3 __attribute__((const, overloadable))
+    clz(uint3 value);
+
+extern uint4 __attribute__((const, overloadable))
+    clz(uint4 value);
+
+/*
+ * copysign: Copies the sign of a number to another
+ *
+ * Copies the sign from sign_value to magnitude_value.
+ *
+ * The value returned is either magnitude_value or -magnitude_value.
+ *
+ * For example, copysign(4.0f, -2.7f) returns -4.0f and copysign(-4.0f, 2.7f) returns 4.0f.
+ */
+extern float __attribute__((const, overloadable))
+    copysign(float magnitude_value, float sign_value);
+
+extern float2 __attribute__((const, overloadable))
+    copysign(float2 magnitude_value, float2 sign_value);
+
+extern float3 __attribute__((const, overloadable))
+    copysign(float3 magnitude_value, float3 sign_value);
+
+extern float4 __attribute__((const, overloadable))
+    copysign(float4 magnitude_value, float4 sign_value);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    copysign(half magnitude_value, half sign_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    copysign(half2 magnitude_value, half2 sign_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    copysign(half3 magnitude_value, half3 sign_value);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    copysign(half4 magnitude_value, half4 sign_value);
+#endif
+
+/*
+ * cos: Cosine
+ *
+ * Returns the cosine of an angle measured in radians.
+ *
+ * See also native_cos().
+ */
+extern float __attribute__((const, overloadable))
+    cos(float v);
+
+extern float2 __attribute__((const, overloadable))
+    cos(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    cos(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    cos(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    cos(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    cos(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    cos(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    cos(half4 v);
+#endif
+
+/*
+ * cosh: Hyperbolic cosine
+ *
+ * Returns the hyperbolic cosine of v, where v is measured in radians.
+ *
+ * See also native_cosh().
+ */
+extern float __attribute__((const, overloadable))
+    cosh(float v);
+
+extern float2 __attribute__((const, overloadable))
+    cosh(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    cosh(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    cosh(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    cosh(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    cosh(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    cosh(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    cosh(half4 v);
+#endif
+
+/*
+ * cospi: Cosine of a number multiplied by pi
+ *
+ * Returns the cosine of (v * pi), where (v * pi) is measured in radians.
+ *
+ * To get the cosine of a value measured in degrees, call cospi(v / 180.f).
+ *
+ * See also native_cospi().
+ */
+extern float __attribute__((const, overloadable))
+    cospi(float v);
+
+extern float2 __attribute__((const, overloadable))
+    cospi(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    cospi(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    cospi(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    cospi(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    cospi(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    cospi(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    cospi(half4 v);
+#endif
+
+/*
+ * degrees: Converts radians into degrees
+ *
+ * Converts from radians to degrees.
+ */
+extern float __attribute__((const, overloadable))
+    degrees(float v);
+
+extern float2 __attribute__((const, overloadable))
+    degrees(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    degrees(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    degrees(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    degrees(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    degrees(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    degrees(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    degrees(half4 v);
+#endif
+
+/*
+ * erf: Mathematical error function
+ *
+ * Returns the error function.
+ */
+extern float __attribute__((const, overloadable))
+    erf(float v);
+
+extern float2 __attribute__((const, overloadable))
+    erf(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    erf(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    erf(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    erf(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    erf(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    erf(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    erf(half4 v);
+#endif
+
+/*
+ * erfc: Mathematical complementary error function
+ *
+ * Returns the complementary error function.
+ */
+extern float __attribute__((const, overloadable))
+    erfc(float v);
+
+extern float2 __attribute__((const, overloadable))
+    erfc(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    erfc(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    erfc(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    erfc(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    erfc(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    erfc(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    erfc(half4 v);
+#endif
+
+/*
+ * exp: e raised to a number
+ *
+ * Returns e raised to v, i.e. e ^ v.
+ *
+ * See also native_exp().
+ */
+extern float __attribute__((const, overloadable))
+    exp(float v);
+
+extern float2 __attribute__((const, overloadable))
+    exp(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    exp(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    exp(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    exp(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    exp(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    exp(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    exp(half4 v);
+#endif
+
+/*
+ * exp10: 10 raised to a number
+ *
+ * Returns 10 raised to v, i.e. 10.f ^ v.
+ *
+ * See also native_exp10().
+ */
+extern float __attribute__((const, overloadable))
+    exp10(float v);
+
+extern float2 __attribute__((const, overloadable))
+    exp10(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    exp10(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    exp10(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    exp10(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    exp10(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    exp10(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    exp10(half4 v);
+#endif
+
+/*
+ * exp2: 2 raised to a number
+ *
+ * Returns 2 raised to v, i.e. 2.f ^ v.
+ *
+ * See also native_exp2().
+ */
+extern float __attribute__((const, overloadable))
+    exp2(float v);
+
+extern float2 __attribute__((const, overloadable))
+    exp2(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    exp2(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    exp2(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    exp2(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    exp2(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    exp2(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    exp2(half4 v);
+#endif
+
+/*
+ * expm1: e raised to a number minus one
+ *
+ * Returns e raised to v minus 1, i.e. (e ^ v) - 1.
+ *
+ * See also native_expm1().
+ */
+extern float __attribute__((const, overloadable))
+    expm1(float v);
+
+extern float2 __attribute__((const, overloadable))
+    expm1(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    expm1(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    expm1(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    expm1(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    expm1(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    expm1(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    expm1(half4 v);
+#endif
+
+/*
+ * fabs: Absolute value of a float
+ *
+ * Returns the absolute value of the float v.
+ *
+ * For integers, use abs().
+ */
+extern float __attribute__((const, overloadable))
+    fabs(float v);
+
+extern float2 __attribute__((const, overloadable))
+    fabs(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    fabs(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    fabs(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    fabs(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    fabs(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    fabs(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    fabs(half4 v);
+#endif
+
+/*
+ * fdim: Positive difference between two values
+ *
+ * Returns the positive difference between two values.
+ *
+ * If a > b, returns (a - b); otherwise returns 0.f.
+ */
+extern float __attribute__((const, overloadable))
+    fdim(float a, float b);
+
+extern float2 __attribute__((const, overloadable))
+    fdim(float2 a, float2 b);
+
+extern float3 __attribute__((const, overloadable))
+    fdim(float3 a, float3 b);
+
+extern float4 __attribute__((const, overloadable))
+    fdim(float4 a, float4 b);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    fdim(half a, half b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    fdim(half2 a, half2 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    fdim(half3 a, half3 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    fdim(half4 a, half4 b);
+#endif
+
+/*
+ * floor: Largest integer not greater than a value
+ *
+ * Returns the largest integer not greater than a value.
+ *
+ * For example, floor(1.2f) returns 1.f, and floor(-1.2f) returns -2.f.
+ *
+ * See also ceil().
+ */
+extern float __attribute__((const, overloadable))
+    floor(float v);
+
+extern float2 __attribute__((const, overloadable))
+    floor(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    floor(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    floor(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    floor(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    floor(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    floor(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    floor(half4 v);
+#endif
+
+/*
+ * fma: Multiply and add
+ *
+ * Multiply and add.  Returns (multiplicand1 * multiplicand2) + offset.
+ *
+ * This function is similar to mad().  fma() retains full precision of the multiplied result
+ * and rounds only after the addition.  mad() rounds after the multiplication and the addition.
+ * This extra precision is not guaranteed in rs_fp_relaxed mode.
+ */
+extern float __attribute__((const, overloadable))
+    fma(float multiplicand1, float multiplicand2, float offset);
+
+extern float2 __attribute__((const, overloadable))
+    fma(float2 multiplicand1, float2 multiplicand2, float2 offset);
+
+extern float3 __attribute__((const, overloadable))
+    fma(float3 multiplicand1, float3 multiplicand2, float3 offset);
+
+extern float4 __attribute__((const, overloadable))
+    fma(float4 multiplicand1, float4 multiplicand2, float4 offset);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    fma(half multiplicand1, half multiplicand2, half offset);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    fma(half2 multiplicand1, half2 multiplicand2, half2 offset);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    fma(half3 multiplicand1, half3 multiplicand2, half3 offset);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    fma(half4 multiplicand1, half4 multiplicand2, half4 offset);
+#endif
+
+/*
+ * fmax: Maximum of two floats
+ *
+ * Returns the maximum of a and b, i.e. (a < b ? b : a).
+ *
+ * The max() function returns identical results but can be applied to more data types.
+ */
+extern float __attribute__((const, overloadable))
+    fmax(float a, float b);
+
+extern float2 __attribute__((const, overloadable))
+    fmax(float2 a, float2 b);
+
+extern float3 __attribute__((const, overloadable))
+    fmax(float3 a, float3 b);
+
+extern float4 __attribute__((const, overloadable))
+    fmax(float4 a, float4 b);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    fmax(half a, half b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    fmax(half2 a, half2 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    fmax(half3 a, half3 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    fmax(half4 a, half4 b);
+#endif
+
+extern float2 __attribute__((const, overloadable))
+    fmax(float2 a, float b);
+
+extern float3 __attribute__((const, overloadable))
+    fmax(float3 a, float b);
+
+extern float4 __attribute__((const, overloadable))
+    fmax(float4 a, float b);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    fmax(half2 a, half b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    fmax(half3 a, half b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    fmax(half4 a, half b);
+#endif
+
+/*
+ * fmin: Minimum of two floats
+ *
+ * Returns the minimum of a and b, i.e. (a > b ? b : a).
+ *
+ * The min() function returns identical results but can be applied to more data types.
+ */
+extern float __attribute__((const, overloadable))
+    fmin(float a, float b);
+
+extern float2 __attribute__((const, overloadable))
+    fmin(float2 a, float2 b);
+
+extern float3 __attribute__((const, overloadable))
+    fmin(float3 a, float3 b);
+
+extern float4 __attribute__((const, overloadable))
+    fmin(float4 a, float4 b);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    fmin(half a, half b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    fmin(half2 a, half2 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    fmin(half3 a, half3 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    fmin(half4 a, half4 b);
+#endif
+
+extern float2 __attribute__((const, overloadable))
+    fmin(float2 a, float b);
+
+extern float3 __attribute__((const, overloadable))
+    fmin(float3 a, float b);
+
+extern float4 __attribute__((const, overloadable))
+    fmin(float4 a, float b);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    fmin(half2 a, half b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    fmin(half3 a, half b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    fmin(half4 a, half b);
+#endif
+
+/*
+ * fmod: Modulo
+ *
+ * Returns the remainder of (numerator / denominator), where the quotient is rounded towards zero.
+ *
+ * The function remainder() is similar but rounds toward the closest integer.
+ * For example, fmod(-3.8f, 2.f) returns -1.8f (-3.8f - -1.f * 2.f)
+ * while remainder(-3.8f, 2.f) returns 0.2f (-3.8f - -2.f * 2.f).
+ */
+extern float __attribute__((const, overloadable))
+    fmod(float numerator, float denominator);
+
+extern float2 __attribute__((const, overloadable))
+    fmod(float2 numerator, float2 denominator);
+
+extern float3 __attribute__((const, overloadable))
+    fmod(float3 numerator, float3 denominator);
+
+extern float4 __attribute__((const, overloadable))
+    fmod(float4 numerator, float4 denominator);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    fmod(half numerator, half denominator);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    fmod(half2 numerator, half2 denominator);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    fmod(half3 numerator, half3 denominator);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    fmod(half4 numerator, half4 denominator);
+#endif
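+
+/* Illustrative sketch (hypothetical helper): fmod() with the values from the
+ * comment above.  The quotient -3.8f / 2.f is rounded toward zero to -1.f,
+ * so the result is -3.8f - (-1.f * 2.f) = -1.8f.
+ */
+static inline float fmod_example(void) {
+    return fmod(-3.8f, 2.f);  // -1.8f; remainder() would give 0.2f instead
+}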
+
+/*
+ * fract: Positive fractional part
+ *
+ * Returns the positive fractional part of v, i.e. v - floor(v).
+ *
+ * For example, fract(1.3f, &val) returns 0.3f and sets val to 1.f.
+ * fract(-1.3f, &val) returns 0.7f and sets val to -2.f.
+ *
+ * Parameters:
+ *   v: Input value.
+ *   floor: If floor is not null, *floor will be set to the floor of v.
+ */
+extern float __attribute__((overloadable))
+    fract(float v, float* floor);
+
+extern float2 __attribute__((overloadable))
+    fract(float2 v, float2* floor);
+
+extern float3 __attribute__((overloadable))
+    fract(float3 v, float3* floor);
+
+extern float4 __attribute__((overloadable))
+    fract(float4 v, float4* floor);
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 23)
+static inline float __attribute__((const, overloadable))
+    fract(float v) {
+    float unused;
+    return fract(v, &unused);
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 23)
+static inline float2 __attribute__((const, overloadable))
+    fract(float2 v) {
+    float2 unused;
+    return fract(v, &unused);
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 23)
+static inline float3 __attribute__((const, overloadable))
+    fract(float3 v) {
+    float3 unused;
+    return fract(v, &unused);
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 23)
+static inline float4 __attribute__((const, overloadable))
+    fract(float4 v) {
+    float4 unused;
+    return fract(v, &unused);
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern float __attribute__((overloadable))
+    fract(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern float2 __attribute__((overloadable))
+    fract(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern float3 __attribute__((overloadable))
+    fract(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern float4 __attribute__((overloadable))
+    fract(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((overloadable))
+    fract(half v, half* floor);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((overloadable))
+    fract(half2 v, half2* floor);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((overloadable))
+    fract(half3 v, half3* floor);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((overloadable))
+    fract(half4 v, half4* floor);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((overloadable))
+    fract(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((overloadable))
+    fract(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((overloadable))
+    fract(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((overloadable))
+    fract(half4 v);
+#endif
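+
+/* Illustrative sketch (hypothetical helper): splitting a value into its floor
+ * and its positive fractional part with fract(), matching the example above.
+ */
+static inline float fract_example(float v, float* whole) {
+    // For v == -1.3f this returns 0.7f and stores -2.f in *whole.
+    return fract(v, whole);
+}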
+
+/*
+ * frexp: Binary mantissa and exponent
+ *
+ * Returns the binary mantissa and exponent of v, i.e. v == mantissa * 2 ^ exponent.
+ *
+ * The mantissa is always between 0.5 (inclusive) and 1.0 (exclusive).
+ *
+ * See ldexp() for the reverse operation.  See also logb() and ilogb().
+ *
+ * Parameters:
+ *   v: Input value.
+ *   exponent: If exponent is not null, *exponent will be set to the exponent of v.
+ */
+extern float __attribute__((overloadable))
+    frexp(float v, int* exponent);
+
+extern float2 __attribute__((overloadable))
+    frexp(float2 v, int2* exponent);
+
+extern float3 __attribute__((overloadable))
+    frexp(float3 v, int3* exponent);
+
+extern float4 __attribute__((overloadable))
+    frexp(float4 v, int4* exponent);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((overloadable))
+    frexp(half v, int* exponent);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((overloadable))
+    frexp(half2 v, int2* exponent);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((overloadable))
+    frexp(half3 v, int3* exponent);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((overloadable))
+    frexp(half4 v, int4* exponent);
+#endif
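+
+/* Illustrative sketch (hypothetical helper): extracting the binary mantissa
+ * and exponent of a value.  For v == 8.5f this returns 0.53125f and stores 4
+ * in *exponent, since 8.5f == 0.53125f * 2^4.
+ */
+static inline float frexp_example(float v, int* exponent) {
+    return frexp(v, exponent);
+}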
+
+/*
+ * half_recip: Reciprocal computed to 16 bit precision
+ *
+ * Returns the approximate reciprocal of a value.
+ *
+ * The precision is that of a 16 bit floating point value.
+ *
+ * See also native_recip().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float __attribute__((const, overloadable))
+    half_recip(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float2 __attribute__((const, overloadable))
+    half_recip(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float3 __attribute__((const, overloadable))
+    half_recip(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float4 __attribute__((const, overloadable))
+    half_recip(float4 v);
+#endif
+
+/*
+ * half_rsqrt: Reciprocal of a square root computed to 16 bit precision
+ *
+ * Returns the approximate value of (1.f / sqrt(value)).
+ *
+ * The precision is that of a 16 bit floating point value.
+ *
+ * See also rsqrt(), native_rsqrt().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float __attribute__((const, overloadable))
+    half_rsqrt(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float2 __attribute__((const, overloadable))
+    half_rsqrt(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float3 __attribute__((const, overloadable))
+    half_rsqrt(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float4 __attribute__((const, overloadable))
+    half_rsqrt(float4 v);
+#endif
+
+/*
+ * half_sqrt: Square root computed to 16 bit precision
+ *
+ * Returns the approximate square root of a value.
+ *
+ * The precision is that of a 16 bit floating point value.
+ *
+ * See also sqrt(), native_sqrt().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float __attribute__((const, overloadable))
+    half_sqrt(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float2 __attribute__((const, overloadable))
+    half_sqrt(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float3 __attribute__((const, overloadable))
+    half_sqrt(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float4 __attribute__((const, overloadable))
+    half_sqrt(float4 v);
+#endif
+
+/*
+ * hypot: Hypotenuse
+ *
+ * Returns the hypotenuse, i.e. sqrt(a * a + b * b).
+ *
+ * See also native_hypot().
+ */
+extern float __attribute__((const, overloadable))
+    hypot(float a, float b);
+
+extern float2 __attribute__((const, overloadable))
+    hypot(float2 a, float2 b);
+
+extern float3 __attribute__((const, overloadable))
+    hypot(float3 a, float3 b);
+
+extern float4 __attribute__((const, overloadable))
+    hypot(float4 a, float4 b);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    hypot(half a, half b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    hypot(half2 a, half2 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    hypot(half3 a, half3 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    hypot(half4 a, half4 b);
+#endif
+
+/*
+ * ilogb: Base two exponent
+ *
+ * Returns the base two exponent of a value, where the mantissa is between
+ * 1.f (inclusive) and 2.f (exclusive).
+ *
+ * For example, ilogb(8.5f) returns 3.
+ *
+ * Because of the difference in mantissa, this number is one less than is returned by frexp().
+ *
+ * logb() is similar but returns a float.
+ */
+extern int __attribute__((const, overloadable))
+    ilogb(float v);
+
+extern int2 __attribute__((const, overloadable))
+    ilogb(float2 v);
+
+extern int3 __attribute__((const, overloadable))
+    ilogb(float3 v);
+
+extern int4 __attribute__((const, overloadable))
+    ilogb(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern int __attribute__((const, overloadable))
+    ilogb(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern int2 __attribute__((const, overloadable))
+    ilogb(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern int3 __attribute__((const, overloadable))
+    ilogb(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern int4 __attribute__((const, overloadable))
+    ilogb(half4 v);
+#endif
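+
+/* Illustrative sketch (hypothetical helper): the base two exponent of 8.5f is
+ * 3, since 8.5f == 1.0625f * 2^3 with the mantissa in [1.f, 2.f).
+ */
+static inline int ilogb_example(void) {
+    return ilogb(8.5f);  // 3; frexp() would report an exponent of 4
+}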
+
+/*
+ * ldexp: Creates a floating point value from mantissa and exponent
+ *
+ * Returns the floating point value created from the mantissa and exponent,
+ * i.e. (mantissa * 2 ^ exponent).
+ *
+ * See frexp() for the reverse operation.
+ *
+ * Parameters:
+ *   mantissa: Mantissa.
+ *   exponent: Exponent, a single component or matching vector.
+ */
+extern float __attribute__((const, overloadable))
+    ldexp(float mantissa, int exponent);
+
+extern float2 __attribute__((const, overloadable))
+    ldexp(float2 mantissa, int2 exponent);
+
+extern float3 __attribute__((const, overloadable))
+    ldexp(float3 mantissa, int3 exponent);
+
+extern float4 __attribute__((const, overloadable))
+    ldexp(float4 mantissa, int4 exponent);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    ldexp(half mantissa, int exponent);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    ldexp(half2 mantissa, int2 exponent);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    ldexp(half3 mantissa, int3 exponent);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    ldexp(half4 mantissa, int4 exponent);
+#endif
+
+extern float2 __attribute__((const, overloadable))
+    ldexp(float2 mantissa, int exponent);
+
+extern float3 __attribute__((const, overloadable))
+    ldexp(float3 mantissa, int exponent);
+
+extern float4 __attribute__((const, overloadable))
+    ldexp(float4 mantissa, int exponent);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    ldexp(half2 mantissa, int exponent);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    ldexp(half3 mantissa, int exponent);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    ldexp(half4 mantissa, int exponent);
+#endif
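+
+/* Illustrative sketch (hypothetical helper): ldexp() reverses frexp(), so
+ * decomposing a value and recombining the parts reproduces the original.
+ */
+static inline float ldexp_roundtrip_example(float v) {
+    int exponent;
+    float mantissa = frexp(v, &exponent);
+    return ldexp(mantissa, exponent);  // equal to v
+}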
+
+/*
+ * lgamma: Natural logarithm of the gamma function
+ *
+ * Returns the natural logarithm of the absolute value of the gamma function,
+ * i.e. log(fabs(tgamma(v))).
+ *
+ * See also tgamma().
+ *
+ * Parameters:
+ *   sign_of_gamma: If sign_of_gamma is not null, *sign_of_gamma will be set to -1 if the gamma of v is negative, otherwise to 1.
+ */
+extern float __attribute__((const, overloadable))
+    lgamma(float v);
+
+extern float2 __attribute__((const, overloadable))
+    lgamma(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    lgamma(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    lgamma(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    lgamma(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    lgamma(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    lgamma(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    lgamma(half4 v);
+#endif
+
+extern float __attribute__((overloadable))
+    lgamma(float v, int* sign_of_gamma);
+
+extern float2 __attribute__((overloadable))
+    lgamma(float2 v, int2* sign_of_gamma);
+
+extern float3 __attribute__((overloadable))
+    lgamma(float3 v, int3* sign_of_gamma);
+
+extern float4 __attribute__((overloadable))
+    lgamma(float4 v, int4* sign_of_gamma);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((overloadable))
+    lgamma(half v, int* sign_of_gamma);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((overloadable))
+    lgamma(half2 v, int2* sign_of_gamma);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((overloadable))
+    lgamma(half3 v, int3* sign_of_gamma);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((overloadable))
+    lgamma(half4 v, int4* sign_of_gamma);
+#endif
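+
+/* Illustrative sketch (hypothetical helper): lgamma() with the optional sign
+ * output described above; *sign receives the sign of gamma(v).
+ */
+static inline float lgamma_with_sign_example(float v, int* sign) {
+    return lgamma(v, sign);  // log(fabs(tgamma(v))), with the sign in *sign
+}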
+
+/*
+ * log: Natural logarithm
+ *
+ * Returns the natural logarithm.
+ *
+ * See also native_log().
+ */
+extern float __attribute__((const, overloadable))
+    log(float v);
+
+extern float2 __attribute__((const, overloadable))
+    log(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    log(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    log(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    log(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    log(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    log(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    log(half4 v);
+#endif
+
+/*
+ * log10: Base 10 logarithm
+ *
+ * Returns the base 10 logarithm.
+ *
+ * See also native_log10().
+ */
+extern float __attribute__((const, overloadable))
+    log10(float v);
+
+extern float2 __attribute__((const, overloadable))
+    log10(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    log10(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    log10(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    log10(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    log10(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    log10(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    log10(half4 v);
+#endif
+
+/*
+ * log1p: Natural logarithm of a value plus 1
+ *
+ * Returns the natural logarithm of (v + 1.f).
+ *
+ * See also native_log1p().
+ */
+extern float __attribute__((const, overloadable))
+    log1p(float v);
+
+extern float2 __attribute__((const, overloadable))
+    log1p(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    log1p(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    log1p(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    log1p(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    log1p(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    log1p(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    log1p(half4 v);
+#endif
+
+/*
+ * log2: Base 2 logarithm
+ *
+ * Returns the base 2 logarithm.
+ *
+ * See also native_log2().
+ */
+extern float __attribute__((const, overloadable))
+    log2(float v);
+
+extern float2 __attribute__((const, overloadable))
+    log2(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    log2(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    log2(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    log2(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    log2(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    log2(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    log2(half4 v);
+#endif
+
+/*
+ * logb: Base two exponent
+ *
+ * Returns the base two exponent of a value, where the mantissa is between
+ * 1.f (inclusive) and 2.f (exclusive).
+ *
+ * For example, logb(8.5f) returns 3.f.
+ *
+ * Because of the difference in mantissa, this number is one less than is returned by frexp().
+ *
+ * ilogb() is similar but returns an integer.
+ */
+extern float __attribute__((const, overloadable))
+    logb(float v);
+
+extern float2 __attribute__((const, overloadable))
+    logb(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    logb(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    logb(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    logb(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    logb(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    logb(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    logb(half4 v);
+#endif
+
+/*
+ * mad: Multiply and add
+ *
+ * Multiply and add.  Returns (multiplicand1 * multiplicand2) + offset.
+ *
+ * This function is similar to fma().  fma() retains full precision of the multiplied result
+ * and rounds only after the addition.  mad() rounds after the multiplication and the addition.
+ * In rs_fp_relaxed mode, mad() may not do the rounding after multiplication.
+ */
+extern float __attribute__((const, overloadable))
+    mad(float multiplicand1, float multiplicand2, float offset);
+
+extern float2 __attribute__((const, overloadable))
+    mad(float2 multiplicand1, float2 multiplicand2, float2 offset);
+
+extern float3 __attribute__((const, overloadable))
+    mad(float3 multiplicand1, float3 multiplicand2, float3 offset);
+
+extern float4 __attribute__((const, overloadable))
+    mad(float4 multiplicand1, float4 multiplicand2, float4 offset);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    mad(half multiplicand1, half multiplicand2, half offset);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    mad(half2 multiplicand1, half2 multiplicand2, half2 offset);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    mad(half3 multiplicand1, half3 multiplicand2, half3 offset);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    mad(half4 multiplicand1, half4 multiplicand2, half4 offset);
+#endif
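+
+/* Illustrative sketch (hypothetical helper): the same expression written with
+ * fma() and with mad().  Per the notes above they differ only in whether the
+ * intermediate product may be rounded before the addition.
+ */
+static inline float2 fma_vs_mad_example(float a, float b, float c) {
+    float2 r;
+    r.x = fma(a, b, c);  // rounds once, after the addition
+    r.y = mad(a, b, c);  // may also round after the multiplication
+    return r;
+}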
+
+/*
+ * max: Maximum
+ *
+ * Returns the maximum value of two arguments.
+ */
+extern float __attribute__((const, overloadable))
+    max(float a, float b);
+
+extern float2 __attribute__((const, overloadable))
+    max(float2 a, float2 b);
+
+extern float3 __attribute__((const, overloadable))
+    max(float3 a, float3 b);
+
+extern float4 __attribute__((const, overloadable))
+    max(float4 a, float4 b);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    max(half a, half b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    max(half2 a, half2 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    max(half3 a, half3 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    max(half4 a, half4 b);
+#endif
+
+extern float2 __attribute__((const, overloadable))
+    max(float2 a, float b);
+
+extern float3 __attribute__((const, overloadable))
+    max(float3 a, float b);
+
+extern float4 __attribute__((const, overloadable))
+    max(float4 a, float b);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    max(half2 a, half b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    max(half3 a, half b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    max(half4 a, half b);
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline char __attribute__((const, overloadable))
+    max(char a, char b) {
+    return (a > b ? a : b);
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline uchar __attribute__((const, overloadable))
+    max(uchar a, uchar b) {
+    return (a > b ? a : b);
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline short __attribute__((const, overloadable))
+    max(short a, short b) {
+    return (a > b ? a : b);
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline ushort __attribute__((const, overloadable))
+    max(ushort a, ushort b) {
+    return (a > b ? a : b);
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline int __attribute__((const, overloadable))
+    max(int a, int b) {
+    return (a > b ? a : b);
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline uint __attribute__((const, overloadable))
+    max(uint a, uint b) {
+    return (a > b ? a : b);
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline char2 __attribute__((const, overloadable))
+    max(char2 a, char2 b) {
+    char2 tmp;
+    tmp.x = (a.x > b.x ? a.x : b.x);
+    tmp.y = (a.y > b.y ? a.y : b.y);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline uchar2 __attribute__((const, overloadable))
+    max(uchar2 a, uchar2 b) {
+    uchar2 tmp;
+    tmp.x = (a.x > b.x ? a.x : b.x);
+    tmp.y = (a.y > b.y ? a.y : b.y);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline short2 __attribute__((const, overloadable))
+    max(short2 a, short2 b) {
+    short2 tmp;
+    tmp.x = (a.x > b.x ? a.x : b.x);
+    tmp.y = (a.y > b.y ? a.y : b.y);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline ushort2 __attribute__((const, overloadable))
+    max(ushort2 a, ushort2 b) {
+    ushort2 tmp;
+    tmp.x = (a.x > b.x ? a.x : b.x);
+    tmp.y = (a.y > b.y ? a.y : b.y);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline int2 __attribute__((const, overloadable))
+    max(int2 a, int2 b) {
+    int2 tmp;
+    tmp.x = (a.x > b.x ? a.x : b.x);
+    tmp.y = (a.y > b.y ? a.y : b.y);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline uint2 __attribute__((const, overloadable))
+    max(uint2 a, uint2 b) {
+    uint2 tmp;
+    tmp.x = (a.x > b.x ? a.x : b.x);
+    tmp.y = (a.y > b.y ? a.y : b.y);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline char3 __attribute__((const, overloadable))
+    max(char3 a, char3 b) {
+    char3 tmp;
+    tmp.x = (a.x > b.x ? a.x : b.x);
+    tmp.y = (a.y > b.y ? a.y : b.y);
+    tmp.z = (a.z > b.z ? a.z : b.z);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline uchar3 __attribute__((const, overloadable))
+    max(uchar3 a, uchar3 b) {
+    uchar3 tmp;
+    tmp.x = (a.x > b.x ? a.x : b.x);
+    tmp.y = (a.y > b.y ? a.y : b.y);
+    tmp.z = (a.z > b.z ? a.z : b.z);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline short3 __attribute__((const, overloadable))
+    max(short3 a, short3 b) {
+    short3 tmp;
+    tmp.x = (a.x > b.x ? a.x : b.x);
+    tmp.y = (a.y > b.y ? a.y : b.y);
+    tmp.z = (a.z > b.z ? a.z : b.z);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline ushort3 __attribute__((const, overloadable))
+    max(ushort3 a, ushort3 b) {
+    ushort3 tmp;
+    tmp.x = (a.x > b.x ? a.x : b.x);
+    tmp.y = (a.y > b.y ? a.y : b.y);
+    tmp.z = (a.z > b.z ? a.z : b.z);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline int3 __attribute__((const, overloadable))
+    max(int3 a, int3 b) {
+    int3 tmp;
+    tmp.x = (a.x > b.x ? a.x : b.x);
+    tmp.y = (a.y > b.y ? a.y : b.y);
+    tmp.z = (a.z > b.z ? a.z : b.z);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline uint3 __attribute__((const, overloadable))
+    max(uint3 a, uint3 b) {
+    uint3 tmp;
+    tmp.x = (a.x > b.x ? a.x : b.x);
+    tmp.y = (a.y > b.y ? a.y : b.y);
+    tmp.z = (a.z > b.z ? a.z : b.z);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline char4 __attribute__((const, overloadable))
+    max(char4 a, char4 b) {
+    char4 tmp;
+    tmp.x = (a.x > b.x ? a.x : b.x);
+    tmp.y = (a.y > b.y ? a.y : b.y);
+    tmp.z = (a.z > b.z ? a.z : b.z);
+    tmp.w = (a.w > b.w ? a.w : b.w);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline uchar4 __attribute__((const, overloadable))
+    max(uchar4 a, uchar4 b) {
+    uchar4 tmp;
+    tmp.x = (a.x > b.x ? a.x : b.x);
+    tmp.y = (a.y > b.y ? a.y : b.y);
+    tmp.z = (a.z > b.z ? a.z : b.z);
+    tmp.w = (a.w > b.w ? a.w : b.w);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline short4 __attribute__((const, overloadable))
+    max(short4 a, short4 b) {
+    short4 tmp;
+    tmp.x = (a.x > b.x ? a.x : b.x);
+    tmp.y = (a.y > b.y ? a.y : b.y);
+    tmp.z = (a.z > b.z ? a.z : b.z);
+    tmp.w = (a.w > b.w ? a.w : b.w);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline ushort4 __attribute__((const, overloadable))
+    max(ushort4 a, ushort4 b) {
+    ushort4 tmp;
+    tmp.x = (a.x > b.x ? a.x : b.x);
+    tmp.y = (a.y > b.y ? a.y : b.y);
+    tmp.z = (a.z > b.z ? a.z : b.z);
+    tmp.w = (a.w > b.w ? a.w : b.w);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline int4 __attribute__((const, overloadable))
+    max(int4 a, int4 b) {
+    int4 tmp;
+    tmp.x = (a.x > b.x ? a.x : b.x);
+    tmp.y = (a.y > b.y ? a.y : b.y);
+    tmp.z = (a.z > b.z ? a.z : b.z);
+    tmp.w = (a.w > b.w ? a.w : b.w);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline uint4 __attribute__((const, overloadable))
+    max(uint4 a, uint4 b) {
+    uint4 tmp;
+    tmp.x = (a.x > b.x ? a.x : b.x);
+    tmp.y = (a.y > b.y ? a.y : b.y);
+    tmp.z = (a.z > b.z ? a.z : b.z);
+    tmp.w = (a.w > b.w ? a.w : b.w);
+    return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern char __attribute__((const, overloadable))
+    max(char a, char b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern char2 __attribute__((const, overloadable))
+    max(char2 a, char2 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern char3 __attribute__((const, overloadable))
+    max(char3 a, char3 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern char4 __attribute__((const, overloadable))
+    max(char4 a, char4 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uchar __attribute__((const, overloadable))
+    max(uchar a, uchar b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uchar2 __attribute__((const, overloadable))
+    max(uchar2 a, uchar2 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uchar3 __attribute__((const, overloadable))
+    max(uchar3 a, uchar3 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uchar4 __attribute__((const, overloadable))
+    max(uchar4 a, uchar4 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern short __attribute__((const, overloadable))
+    max(short a, short b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern short2 __attribute__((const, overloadable))
+    max(short2 a, short2 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern short3 __attribute__((const, overloadable))
+    max(short3 a, short3 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern short4 __attribute__((const, overloadable))
+    max(short4 a, short4 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ushort __attribute__((const, overloadable))
+    max(ushort a, ushort b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ushort2 __attribute__((const, overloadable))
+    max(ushort2 a, ushort2 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ushort3 __attribute__((const, overloadable))
+    max(ushort3 a, ushort3 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ushort4 __attribute__((const, overloadable))
+    max(ushort4 a, ushort4 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern int __attribute__((const, overloadable))
+    max(int a, int b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern int2 __attribute__((const, overloadable))
+    max(int2 a, int2 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern int3 __attribute__((const, overloadable))
+    max(int3 a, int3 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern int4 __attribute__((const, overloadable))
+    max(int4 a, int4 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uint __attribute__((const, overloadable))
+    max(uint a, uint b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uint2 __attribute__((const, overloadable))
+    max(uint2 a, uint2 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uint3 __attribute__((const, overloadable))
+    max(uint3 a, uint3 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uint4 __attribute__((const, overloadable))
+    max(uint4 a, uint4 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long __attribute__((const, overloadable))
+    max(long a, long b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long2 __attribute__((const, overloadable))
+    max(long2 a, long2 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long3 __attribute__((const, overloadable))
+    max(long3 a, long3 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long4 __attribute__((const, overloadable))
+    max(long4 a, long4 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong __attribute__((const, overloadable))
+    max(ulong a, ulong b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong2 __attribute__((const, overloadable))
+    max(ulong2 a, ulong2 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong3 __attribute__((const, overloadable))
+    max(ulong3 a, ulong3 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong4 __attribute__((const, overloadable))
+    max(ulong4 a, ulong4 b);
+#endif
+
+/*
+ * min: Minimum
+ *
+ * Returns the minimum value of two arguments.
+ */
+extern float __attribute__((const, overloadable))
+    min(float a, float b);
+
+extern float2 __attribute__((const, overloadable))
+    min(float2 a, float2 b);
+
+extern float3 __attribute__((const, overloadable))
+    min(float3 a, float3 b);
+
+extern float4 __attribute__((const, overloadable))
+    min(float4 a, float4 b);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    min(half a, half b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    min(half2 a, half2 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    min(half3 a, half3 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    min(half4 a, half4 b);
+#endif
+
+extern float2 __attribute__((const, overloadable))
+    min(float2 a, float b);
+
+extern float3 __attribute__((const, overloadable))
+    min(float3 a, float b);
+
+extern float4 __attribute__((const, overloadable))
+    min(float4 a, float b);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    min(half2 a, half b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    min(half3 a, half b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    min(half4 a, half b);
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline char __attribute__((const, overloadable))
+    min(char a, char b) {
+    return (a < b ? a : b);
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline uchar __attribute__((const, overloadable))
+    min(uchar a, uchar b) {
+    return (a < b ? a : b);
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline short __attribute__((const, overloadable))
+    min(short a, short b) {
+    return (a < b ? a : b);
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline ushort __attribute__((const, overloadable))
+    min(ushort a, ushort b) {
+    return (a < b ? a : b);
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline int __attribute__((const, overloadable))
+    min(int a, int b) {
+    return (a < b ? a : b);
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline uint __attribute__((const, overloadable))
+    min(uint a, uint b) {
+    return (a < b ? a : b);
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline char2 __attribute__((const, overloadable))
+    min(char2 a, char2 b) {
+    char2 tmp;
+    tmp.x = (a.x < b.x ? a.x : b.x);
+    tmp.y = (a.y < b.y ? a.y : b.y);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline uchar2 __attribute__((const, overloadable))
+    min(uchar2 a, uchar2 b) {
+    uchar2 tmp;
+    tmp.x = (a.x < b.x ? a.x : b.x);
+    tmp.y = (a.y < b.y ? a.y : b.y);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline short2 __attribute__((const, overloadable))
+    min(short2 a, short2 b) {
+    short2 tmp;
+    tmp.x = (a.x < b.x ? a.x : b.x);
+    tmp.y = (a.y < b.y ? a.y : b.y);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline ushort2 __attribute__((const, overloadable))
+    min(ushort2 a, ushort2 b) {
+    ushort2 tmp;
+    tmp.x = (a.x < b.x ? a.x : b.x);
+    tmp.y = (a.y < b.y ? a.y : b.y);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline int2 __attribute__((const, overloadable))
+    min(int2 a, int2 b) {
+    int2 tmp;
+    tmp.x = (a.x < b.x ? a.x : b.x);
+    tmp.y = (a.y < b.y ? a.y : b.y);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline uint2 __attribute__((const, overloadable))
+    min(uint2 a, uint2 b) {
+    uint2 tmp;
+    tmp.x = (a.x < b.x ? a.x : b.x);
+    tmp.y = (a.y < b.y ? a.y : b.y);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline char3 __attribute__((const, overloadable))
+    min(char3 a, char3 b) {
+    char3 tmp;
+    tmp.x = (a.x < b.x ? a.x : b.x);
+    tmp.y = (a.y < b.y ? a.y : b.y);
+    tmp.z = (a.z < b.z ? a.z : b.z);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline uchar3 __attribute__((const, overloadable))
+    min(uchar3 a, uchar3 b) {
+    uchar3 tmp;
+    tmp.x = (a.x < b.x ? a.x : b.x);
+    tmp.y = (a.y < b.y ? a.y : b.y);
+    tmp.z = (a.z < b.z ? a.z : b.z);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline short3 __attribute__((const, overloadable))
+    min(short3 a, short3 b) {
+    short3 tmp;
+    tmp.x = (a.x < b.x ? a.x : b.x);
+    tmp.y = (a.y < b.y ? a.y : b.y);
+    tmp.z = (a.z < b.z ? a.z : b.z);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline ushort3 __attribute__((const, overloadable))
+    min(ushort3 a, ushort3 b) {
+    ushort3 tmp;
+    tmp.x = (a.x < b.x ? a.x : b.x);
+    tmp.y = (a.y < b.y ? a.y : b.y);
+    tmp.z = (a.z < b.z ? a.z : b.z);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline int3 __attribute__((const, overloadable))
+    min(int3 a, int3 b) {
+    int3 tmp;
+    tmp.x = (a.x < b.x ? a.x : b.x);
+    tmp.y = (a.y < b.y ? a.y : b.y);
+    tmp.z = (a.z < b.z ? a.z : b.z);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline uint3 __attribute__((const, overloadable))
+    min(uint3 a, uint3 b) {
+    uint3 tmp;
+    tmp.x = (a.x < b.x ? a.x : b.x);
+    tmp.y = (a.y < b.y ? a.y : b.y);
+    tmp.z = (a.z < b.z ? a.z : b.z);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline char4 __attribute__((const, overloadable))
+    min(char4 a, char4 b) {
+    char4 tmp;
+    tmp.x = (a.x < b.x ? a.x : b.x);
+    tmp.y = (a.y < b.y ? a.y : b.y);
+    tmp.z = (a.z < b.z ? a.z : b.z);
+    tmp.w = (a.w < b.w ? a.w : b.w);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline uchar4 __attribute__((const, overloadable))
+    min(uchar4 a, uchar4 b) {
+    uchar4 tmp;
+    tmp.x = (a.x < b.x ? a.x : b.x);
+    tmp.y = (a.y < b.y ? a.y : b.y);
+    tmp.z = (a.z < b.z ? a.z : b.z);
+    tmp.w = (a.w < b.w ? a.w : b.w);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline short4 __attribute__((const, overloadable))
+    min(short4 a, short4 b) {
+    short4 tmp;
+    tmp.x = (a.x < b.x ? a.x : b.x);
+    tmp.y = (a.y < b.y ? a.y : b.y);
+    tmp.z = (a.z < b.z ? a.z : b.z);
+    tmp.w = (a.w < b.w ? a.w : b.w);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline ushort4 __attribute__((const, overloadable))
+    min(ushort4 a, ushort4 b) {
+    ushort4 tmp;
+    tmp.x = (a.x < b.x ? a.x : b.x);
+    tmp.y = (a.y < b.y ? a.y : b.y);
+    tmp.z = (a.z < b.z ? a.z : b.z);
+    tmp.w = (a.w < b.w ? a.w : b.w);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline int4 __attribute__((const, overloadable))
+    min(int4 a, int4 b) {
+    int4 tmp;
+    tmp.x = (a.x < b.x ? a.x : b.x);
+    tmp.y = (a.y < b.y ? a.y : b.y);
+    tmp.z = (a.z < b.z ? a.z : b.z);
+    tmp.w = (a.w < b.w ? a.w : b.w);
+    return tmp;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+static inline uint4 __attribute__((const, overloadable))
+    min(uint4 a, uint4 b) {
+    uint4 tmp;
+    tmp.x = (a.x < b.x ? a.x : b.x);
+    tmp.y = (a.y < b.y ? a.y : b.y);
+    tmp.z = (a.z < b.z ? a.z : b.z);
+    tmp.w = (a.w < b.w ? a.w : b.w);
+    return tmp;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern char __attribute__((const, overloadable))
+    min(char a, char b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern char2 __attribute__((const, overloadable))
+    min(char2 a, char2 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern char3 __attribute__((const, overloadable))
+    min(char3 a, char3 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern char4 __attribute__((const, overloadable))
+    min(char4 a, char4 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uchar __attribute__((const, overloadable))
+    min(uchar a, uchar b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uchar2 __attribute__((const, overloadable))
+    min(uchar2 a, uchar2 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uchar3 __attribute__((const, overloadable))
+    min(uchar3 a, uchar3 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uchar4 __attribute__((const, overloadable))
+    min(uchar4 a, uchar4 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern short __attribute__((const, overloadable))
+    min(short a, short b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern short2 __attribute__((const, overloadable))
+    min(short2 a, short2 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern short3 __attribute__((const, overloadable))
+    min(short3 a, short3 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern short4 __attribute__((const, overloadable))
+    min(short4 a, short4 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ushort __attribute__((const, overloadable))
+    min(ushort a, ushort b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ushort2 __attribute__((const, overloadable))
+    min(ushort2 a, ushort2 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ushort3 __attribute__((const, overloadable))
+    min(ushort3 a, ushort3 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ushort4 __attribute__((const, overloadable))
+    min(ushort4 a, ushort4 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern int __attribute__((const, overloadable))
+    min(int a, int b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern int2 __attribute__((const, overloadable))
+    min(int2 a, int2 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern int3 __attribute__((const, overloadable))
+    min(int3 a, int3 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern int4 __attribute__((const, overloadable))
+    min(int4 a, int4 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uint __attribute__((const, overloadable))
+    min(uint a, uint b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uint2 __attribute__((const, overloadable))
+    min(uint2 a, uint2 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uint3 __attribute__((const, overloadable))
+    min(uint3 a, uint3 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern uint4 __attribute__((const, overloadable))
+    min(uint4 a, uint4 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long __attribute__((const, overloadable))
+    min(long a, long b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long2 __attribute__((const, overloadable))
+    min(long2 a, long2 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long3 __attribute__((const, overloadable))
+    min(long3 a, long3 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern long4 __attribute__((const, overloadable))
+    min(long4 a, long4 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong __attribute__((const, overloadable))
+    min(ulong a, ulong b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong2 __attribute__((const, overloadable))
+    min(ulong2 a, ulong2 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong3 __attribute__((const, overloadable))
+    min(ulong3 a, ulong3 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern ulong4 __attribute__((const, overloadable))
+    min(ulong4 a, ulong4 b);
+#endif
+
+/*
+ * mix: Mixes two values
+ *
+ * Returns start + ((stop - start) * fraction).
+ *
+ * This can be useful for mixing two values.  For example, to create a new color that is
+ * 40% color1 and 60% color2, use mix(color1, color2, 0.6f).
+ */
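+/* Worked example of the formula above: mix(0.f, 10.f, 0.6f) evaluates to
+ * 0.f + ((10.f - 0.f) * 0.6f) = 6.f.
+ */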
+extern float __attribute__((const, overloadable))
+    mix(float start, float stop, float fraction);
+
+extern float2 __attribute__((const, overloadable))
+    mix(float2 start, float2 stop, float2 fraction);
+
+extern float3 __attribute__((const, overloadable))
+    mix(float3 start, float3 stop, float3 fraction);
+
+extern float4 __attribute__((const, overloadable))
+    mix(float4 start, float4 stop, float4 fraction);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    mix(half start, half stop, half fraction);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    mix(half2 start, half2 stop, half2 fraction);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    mix(half3 start, half3 stop, half3 fraction);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    mix(half4 start, half4 stop, half4 fraction);
+#endif
+
+extern float2 __attribute__((const, overloadable))
+    mix(float2 start, float2 stop, float fraction);
+
+extern float3 __attribute__((const, overloadable))
+    mix(float3 start, float3 stop, float fraction);
+
+extern float4 __attribute__((const, overloadable))
+    mix(float4 start, float4 stop, float fraction);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    mix(half2 start, half2 stop, half fraction);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    mix(half3 start, half3 stop, half fraction);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    mix(half4 start, half4 stop, half fraction);
+#endif
+
+/*
+ * modf: Integral and fractional components
+ *
+ * Returns the integral and fractional components of a number.
+ *
+ * Both components will have the same sign as x.  For example, for an input of -3.72f,
+ * *integral_part will be set to -3.f and -0.72f will be returned.
+ *
+ * Parameters:
+ *   v: Source value.
+ *   integral_part: *integral_part will be set to the integral portion of the number.
+ *
+ * Returns: Floating point portion of the value.
+ */
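+/* Typical usage sketch: float ipart; float frac = modf(3.72f, &ipart);
+ * afterwards ipart == 3.f and frac is approximately 0.72f.
+ */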
+extern float __attribute__((overloadable))
+    modf(float v, float* integral_part);
+
+extern float2 __attribute__((overloadable))
+    modf(float2 v, float2* integral_part);
+
+extern float3 __attribute__((overloadable))
+    modf(float3 v, float3* integral_part);
+
+extern float4 __attribute__((overloadable))
+    modf(float4 v, float4* integral_part);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((overloadable))
+    modf(half v, half* integral_part);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((overloadable))
+    modf(half2 v, half2* integral_part);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((overloadable))
+    modf(half3 v, half3* integral_part);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((overloadable))
+    modf(half4 v, half4* integral_part);
+#endif
+
+/*
+ * nan: Not a Number
+ *
+ * Returns a NaN value (Not a Number).
+ *
+ * Parameters:
+ *   v: Not used.
+ */
+extern float __attribute__((const, overloadable))
+    nan(uint v);
+
+/*
+ * nan_half: Not a Number
+ *
+ *  Returns a half-precision floating point NaN value (Not a Number).
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    nan_half(void);
+#endif
+
+/*
+ * native_acos: Approximate inverse cosine
+ *
+ * Returns the approximate inverse cosine, in radians.
+ *
+ * This function yields undefined results from input values less than -1 or greater than 1.
+ *
+ * See also acos().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_acos(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_acos(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_acos(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_acos(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_acos(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_acos(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_acos(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_acos(half4 v);
+#endif
+
+/*
+ * native_acosh: Approximate inverse hyperbolic cosine
+ *
+ * Returns the approximate inverse hyperbolic cosine, in radians.
+ *
+ * See also acosh().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_acosh(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_acosh(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_acosh(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_acosh(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_acosh(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_acosh(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_acosh(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_acosh(half4 v);
+#endif
+
+/*
+ * native_acospi: Approximate inverse cosine divided by pi
+ *
+ * Returns the approximate inverse cosine in radians, divided by pi.
+ *
+ * To get an inverse cosine measured in degrees, use acospi(a) * 180.f.
+ *
+ * This function yields undefined results from input values less than -1 or greater than 1.
+ *
+ * See also acospi().
+ */
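+/* For example, native_acospi(0.f) returns approximately 0.5f (acos(0) is pi / 2,
+ * and pi / 2 divided by pi is 0.5), i.e. about 90.f degrees after multiplying by 180.f.
+ */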
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_acospi(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_acospi(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_acospi(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_acospi(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_acospi(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_acospi(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_acospi(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_acospi(half4 v);
+#endif
+
+/*
+ * native_asin: Approximate inverse sine
+ *
+ * Returns the approximate inverse sine, in radians.
+ *
+ * This function yields undefined results from input values less than -1 or greater than 1.
+ *
+ * See also asin().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_asin(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_asin(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_asin(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_asin(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_asin(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_asin(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_asin(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_asin(half4 v);
+#endif
+
+/*
+ * native_asinh: Approximate inverse hyperbolic sine
+ *
+ * Returns the approximate inverse hyperbolic sine, in radians.
+ *
+ * See also asinh().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_asinh(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_asinh(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_asinh(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_asinh(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_asinh(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_asinh(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_asinh(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_asinh(half4 v);
+#endif
+
+/*
+ * native_asinpi: Approximate inverse sine divided by pi
+ *
+ * Returns the approximate inverse sine in radians, divided by pi.
+ *
+ * To get an inverse sine measured in degrees, use asinpi(a) * 180.f.
+ *
+ * This function yields undefined results from input values less than -1 or greater than 1.
+ *
+ * See also asinpi().
+ */
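+/* For example, native_asinpi(1.f) returns approximately 0.5f (asin(1) is pi / 2),
+ * which corresponds to about 90.f degrees after multiplying by 180.f.
+ */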
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_asinpi(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_asinpi(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_asinpi(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_asinpi(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_asinpi(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_asinpi(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_asinpi(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_asinpi(half4 v);
+#endif
+
+/*
+ * native_atan: Approximate inverse tangent
+ *
+ * Returns the approximate inverse tangent, in radians.
+ *
+ * See also atan().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_atan(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_atan(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_atan(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_atan(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_atan(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_atan(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_atan(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_atan(half4 v);
+#endif
+
+/*
+ * native_atan2: Approximate inverse tangent of a ratio
+ *
+ * Returns the approximate inverse tangent of (numerator / denominator), in radians.
+ *
+ * See also atan2().
+ *
+ * Parameters:
+ *   numerator: Numerator.
+ *   denominator: Denominator.  Can be 0.
+ */
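+/* For example, native_atan2(1.f, 1.f) returns approximately pi / 4, i.e. about 0.785f. */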
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_atan2(float numerator, float denominator);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_atan2(float2 numerator, float2 denominator);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_atan2(float3 numerator, float3 denominator);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_atan2(float4 numerator, float4 denominator);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_atan2(half numerator, half denominator);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_atan2(half2 numerator, half2 denominator);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_atan2(half3 numerator, half3 denominator);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_atan2(half4 numerator, half4 denominator);
+#endif
+
+/*
+ * native_atan2pi: Approximate inverse tangent of a ratio, divided by pi
+ *
+ * Returns the approximate inverse tangent of (numerator / denominator),
+ * in radians, divided by pi.
+ *
+ * To get an inverse tangent measured in degrees, use atan2pi(n, d) * 180.f.
+ *
+ * See also atan2pi().
+ *
+ * Parameters:
+ *   numerator: Numerator.
+ *   denominator: Denominator.  Can be 0.
+ */
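+/* For example, native_atan2pi(1.f, 1.f) returns approximately 0.25f,
+ * i.e. about 45.f degrees after multiplying by 180.f.
+ */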
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_atan2pi(float numerator, float denominator);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_atan2pi(float2 numerator, float2 denominator);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_atan2pi(float3 numerator, float3 denominator);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_atan2pi(float4 numerator, float4 denominator);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_atan2pi(half numerator, half denominator);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_atan2pi(half2 numerator, half2 denominator);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_atan2pi(half3 numerator, half3 denominator);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_atan2pi(half4 numerator, half4 denominator);
+#endif
+
+/*
+ * native_atanh: Approximate inverse hyperbolic tangent
+ *
+ * Returns the approximate inverse hyperbolic tangent, in radians.
+ *
+ * See also atanh().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_atanh(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_atanh(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_atanh(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_atanh(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_atanh(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_atanh(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_atanh(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_atanh(half4 v);
+#endif
+
+/*
+ * native_atanpi: Approximate inverse tangent divided by pi
+ *
+ * Returns the approximate inverse tangent in radians, divided by pi.
+ *
+ * To get an inverse tangent measured in degrees, use atanpi(a) * 180.f.
+ *
+ * See also atanpi().
+ */
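+/* For example, native_atanpi(1.f) returns approximately 0.25f (atan(1) is pi / 4). */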
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_atanpi(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_atanpi(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_atanpi(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_atanpi(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_atanpi(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_atanpi(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_atanpi(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_atanpi(half4 v);
+#endif
+
+/*
+ * native_cbrt: Approximate cube root
+ *
+ * Returns the approximate cube root.
+ *
+ * See also cbrt().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_cbrt(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_cbrt(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_cbrt(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_cbrt(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_cbrt(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_cbrt(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_cbrt(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_cbrt(half4 v);
+#endif
+
+/*
+ * native_cos: Approximate cosine
+ *
+ * Returns the approximate cosine of an angle measured in radians.
+ *
+ * See also cos().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_cos(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_cos(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_cos(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_cos(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_cos(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_cos(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_cos(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_cos(half4 v);
+#endif
+
+/*
+ * native_cosh: Approximate hyperbolic cosine
+ *
+ * Returns the approximate hyperbolic cosine.
+ *
+ * See also cosh().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_cosh(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_cosh(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_cosh(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_cosh(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_cosh(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_cosh(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_cosh(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_cosh(half4 v);
+#endif
+
+/*
+ * native_cospi: Approximate cosine of a number multiplied by pi
+ *
+ * Returns the approximate cosine of (v * pi), where (v * pi) is measured in radians.
+ *
+ * To get the cosine of a value measured in degrees, call cospi(v / 180.f).
+ *
+ * See also cospi().
+ */
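+/* For example, native_cospi(1.f) returns approximately -1.f, since cos(pi) is -1. */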
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_cospi(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_cospi(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_cospi(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_cospi(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_cospi(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_cospi(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_cospi(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_cospi(half4 v);
+#endif
+
+/*
+ * native_divide: Approximate division
+ *
+ * Computes the approximate division of two values.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_divide(float left_vector, float right_vector);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_divide(float2 left_vector, float2 right_vector);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_divide(float3 left_vector, float3 right_vector);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_divide(float4 left_vector, float4 right_vector);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_divide(half left_vector, half right_vector);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_divide(half2 left_vector, half2 right_vector);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_divide(half3 left_vector, half3 right_vector);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_divide(half4 left_vector, half4 right_vector);
+#endif
+
+/*
+ * native_exp: Approximate e raised to a number
+ *
+ * Fast approximate exp.
+ *
+ * It is valid for inputs from -86.f to 86.f.  The precision is no worse than what would be
+ * expected from using 16 bit floating point values.
+ *
+ * See also exp().
+ */
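+/* For example, native_exp(1.f) returns approximately e, i.e. about 2.718f. */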
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float __attribute__((const, overloadable))
+    native_exp(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float2 __attribute__((const, overloadable))
+    native_exp(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float3 __attribute__((const, overloadable))
+    native_exp(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float4 __attribute__((const, overloadable))
+    native_exp(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_exp(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_exp(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_exp(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_exp(half4 v);
+#endif
+
+/*
+ * native_exp10: Approximate 10 raised to a number
+ *
+ * Fast approximate exp10.
+ *
+ * It is valid for inputs from -37.f to 37.f.  The precision is no worse than what would be
+ * expected from using 16 bit floating point values.
+ *
+ * See also exp10().
+ */
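+/* For example, native_exp10(2.f) returns approximately 100.f. */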
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float __attribute__((const, overloadable))
+    native_exp10(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float2 __attribute__((const, overloadable))
+    native_exp10(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float3 __attribute__((const, overloadable))
+    native_exp10(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float4 __attribute__((const, overloadable))
+    native_exp10(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_exp10(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_exp10(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_exp10(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_exp10(half4 v);
+#endif
+
+/*
+ * native_exp2: Approximate 2 raised to a number
+ *
+ * Fast approximate exp2.
+ *
+ * It is valid for inputs from -125.f to 125.f.  The precision is no worse than what would be
+ * expected from using 16 bit floating point values.
+ *
+ * See also exp2().
+ */
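+/* For example, native_exp2(10.f) returns approximately 1024.f. */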
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float __attribute__((const, overloadable))
+    native_exp2(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float2 __attribute__((const, overloadable))
+    native_exp2(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float3 __attribute__((const, overloadable))
+    native_exp2(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float4 __attribute__((const, overloadable))
+    native_exp2(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_exp2(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_exp2(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_exp2(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_exp2(half4 v);
+#endif
+
+/*
+ * native_expm1: Approximate e raised to a number minus one
+ *
+ * Returns the approximate (e ^ v) - 1.
+ *
+ * See also expm1().
+ */
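+/* For example, native_expm1(1.f) returns approximately e - 1, i.e. about 1.718f. */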
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_expm1(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_expm1(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_expm1(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_expm1(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_expm1(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_expm1(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_expm1(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_expm1(half4 v);
+#endif
+
+/*
+ * native_hypot: Approximate hypotenuse
+ *
+ * Returns the approximate native_sqrt(a * a + b * b).
+ *
+ * See also hypot().
+ */
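+/* For example, native_hypot(3.f, 4.f) returns approximately 5.f. */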
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_hypot(float a, float b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_hypot(float2 a, float2 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_hypot(float3 a, float3 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_hypot(float4 a, float4 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_hypot(half a, half b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_hypot(half2 a, half2 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_hypot(half3 a, half3 b);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_hypot(half4 a, half4 b);
+#endif
+
+/*
+ * native_log: Approximate natural logarithm
+ *
+ * Fast approximate log.
+ *
+ * It is not accurate for values very close to zero.
+ *
+ * See also log().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float __attribute__((const, overloadable))
+    native_log(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float2 __attribute__((const, overloadable))
+    native_log(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float3 __attribute__((const, overloadable))
+    native_log(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float4 __attribute__((const, overloadable))
+    native_log(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_log(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_log(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_log(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_log(half4 v);
+#endif
+
+/*
+ * native_log10: Approximate base 10 logarithm
+ *
+ * Fast approximate log10.
+ *
+ * It is not accurate for values very close to zero.
+ *
+ * See also log10().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float __attribute__((const, overloadable))
+    native_log10(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float2 __attribute__((const, overloadable))
+    native_log10(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float3 __attribute__((const, overloadable))
+    native_log10(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float4 __attribute__((const, overloadable))
+    native_log10(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_log10(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_log10(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_log10(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_log10(half4 v);
+#endif
+
+/*
+ * native_log1p: Approximate natural logarithm of a value plus 1
+ *
+ * Returns the approximate natural logarithm of (v + 1.0f).
+ *
+ * See also log1p().
+ */
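+/* For example, native_log1p(1.718f) returns approximately 1.f, since log(2.718f) is about 1. */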
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_log1p(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_log1p(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_log1p(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_log1p(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_log1p(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_log1p(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_log1p(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_log1p(half4 v);
+#endif
+
+/*
+ * native_log2: Approximate base 2 logarithm
+ *
+ * Fast approximate log2.
+ *
+ * It is not accurate for values very close to zero.
+ *
+ * See also log2().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float __attribute__((const, overloadable))
+    native_log2(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float2 __attribute__((const, overloadable))
+    native_log2(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float3 __attribute__((const, overloadable))
+    native_log2(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float4 __attribute__((const, overloadable))
+    native_log2(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_log2(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_log2(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_log2(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_log2(half4 v);
+#endif
+
+/*
+ * native_powr: Approximate positive base raised to an exponent
+ *
+ * Fast approximate (base ^ exponent).
+ *
+ * See also powr().
+ *
+ * Parameters:
+ *   base: Must be between 0.f and 256.f.  The function is not accurate for values very close to zero.
+ *   exponent: Must be between -15.f and 15.f.
+ */
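+/* For example, native_powr(2.f, 8.f) returns approximately 256.f; both arguments
+ * fall inside the documented ranges.
+ */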
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float __attribute__((const, overloadable))
+    native_powr(float base, float exponent);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float2 __attribute__((const, overloadable))
+    native_powr(float2 base, float2 exponent);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float3 __attribute__((const, overloadable))
+    native_powr(float3 base, float3 exponent);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 18))
+extern float4 __attribute__((const, overloadable))
+    native_powr(float4 base, float4 exponent);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_powr(half base, half exponent);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_powr(half2 base, half2 exponent);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_powr(half3 base, half3 exponent);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_powr(half4 base, half4 exponent);
+#endif
+
+/*
+ * native_recip: Approximate reciprocal
+ *
+ * Returns the approximate reciprocal of a value.
+ *
+ * See also half_recip().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_recip(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_recip(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_recip(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_recip(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_recip(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_recip(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_recip(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_recip(half4 v);
+#endif
+
+/*
+ * native_rootn: Approximate nth root
+ *
+ * Compute the approximate Nth root of a value.
+ *
+ * See also rootn().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_rootn(float v, int n);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_rootn(float2 v, int2 n);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_rootn(float3 v, int3 n);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_rootn(float4 v, int4 n);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_rootn(half v, int n);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_rootn(half2 v, int2 n);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_rootn(half3 v, int3 n);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_rootn(half4 v, int4 n);
+#endif
+
+/*
+ * native_rsqrt: Approximate reciprocal of a square root
+ *
+ * Returns approximate (1 / sqrt(v)).
+ *
+ * See also rsqrt(), half_rsqrt().
+ */
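+/* For example, native_rsqrt(4.f) returns approximately 0.5f, i.e. 1 / sqrt(4). */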
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_rsqrt(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_rsqrt(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_rsqrt(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_rsqrt(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_rsqrt(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_rsqrt(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_rsqrt(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_rsqrt(half4 v);
+#endif
+
+/*
+ * native_sin: Approximate sine
+ *
+ * Returns the approximate sine of an angle measured in radians.
+ *
+ * See also sin().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_sin(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_sin(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_sin(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_sin(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_sin(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_sin(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_sin(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_sin(half4 v);
+#endif
+
+/*
+ * native_sincos: Approximate sine and cosine
+ *
+ * Returns the approximate sine and cosine of a value.
+ *
+ * See also sincos().
+ *
+ * Parameters:
+ *   v: Incoming value in radians.
+ *   cos: *cos will be set to the cosine value.
+ *
+ * Returns: Sine.
+ */
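+/* Typical usage sketch: float c; float s = native_sincos(0.f, &c);
+ * afterwards s is approximately 0.f and c is approximately 1.f.
+ */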
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((overloadable))
+    native_sincos(float v, float* cos);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((overloadable))
+    native_sincos(float2 v, float2* cos);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((overloadable))
+    native_sincos(float3 v, float3* cos);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((overloadable))
+    native_sincos(float4 v, float4* cos);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((overloadable))
+    native_sincos(half v, half* cos);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((overloadable))
+    native_sincos(half2 v, half2* cos);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((overloadable))
+    native_sincos(half3 v, half3* cos);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((overloadable))
+    native_sincos(half4 v, half4* cos);
+#endif
+
+/*
+ * native_sinh: Approximate hyperbolic sine
+ *
+ * Returns the approximate hyperbolic sine of a value specified in radians.
+ *
+ * See also sinh().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_sinh(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_sinh(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_sinh(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_sinh(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_sinh(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_sinh(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_sinh(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_sinh(half4 v);
+#endif
+
+/*
+ * native_sinpi: Approximate sine of a number multiplied by pi
+ *
+ * Returns the approximate sine of (v * pi), where (v * pi) is measured in radians.
+ *
+ * To get the sine of a value measured in degrees, call sinpi(v / 180.f).
+ *
+ * See also sinpi().
+ */
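+/* For example, native_sinpi(0.5f) returns approximately 1.f, since sin(pi / 2) is 1. */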
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_sinpi(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_sinpi(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_sinpi(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_sinpi(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_sinpi(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_sinpi(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_sinpi(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_sinpi(half4 v);
+#endif
+
+/*
+ * native_sqrt: Approximate square root
+ *
+ * Returns the approximate sqrt(v).
+ *
+ * See also sqrt(), half_sqrt().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_sqrt(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_sqrt(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_sqrt(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_sqrt(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_sqrt(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_sqrt(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_sqrt(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_sqrt(half4 v);
+#endif
+
+/*
+ * native_tan: Approximate tangent
+ *
+ * Returns the approximate tangent of an angle measured in radians.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_tan(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_tan(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_tan(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_tan(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_tan(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_tan(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_tan(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_tan(half4 v);
+#endif
+
+/*
+ * native_tanh: Approximate hyperbolic tangent
+ *
+ * Returns the approximate hyperbolic tangent of a value.
+ *
+ * See also tanh().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_tanh(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_tanh(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_tanh(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_tanh(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_tanh(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_tanh(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_tanh(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_tanh(half4 v);
+#endif
+
+/*
+ * native_tanpi: Approximate tangent of a number multiplied by pi
+ *
+ * Returns the approximate tangent of (v * pi), where (v * pi) is measured in radians.
+ *
+ * To get the tangent of a value measured in degrees, call tanpi(v / 180.f).
+ *
+ * See also tanpi().
+ */
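+/* For example, native_tanpi(0.25f) returns approximately 1.f, since tan(pi / 4) is 1. */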
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_tanpi(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_tanpi(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_tanpi(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_tanpi(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_tanpi(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_tanpi(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_tanpi(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_tanpi(half4 v);
+#endif
+
+/*
+ * nextafter: Next floating point number
+ *
+ * Returns the next representable floating point number from v towards target.
+ *
+ * In rs_fp_relaxed mode, a denormalized input value may not yield the next denormalized
+ * value, as support of denormalized values is optional in relaxed mode.
+ */
+extern float __attribute__((const, overloadable))
+    nextafter(float v, float target);
+
+extern float2 __attribute__((const, overloadable))
+    nextafter(float2 v, float2 target);
+
+extern float3 __attribute__((const, overloadable))
+    nextafter(float3 v, float3 target);
+
+extern float4 __attribute__((const, overloadable))
+    nextafter(float4 v, float4 target);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    nextafter(half v, half target);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    nextafter(half2 v, half2 target);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    nextafter(half3 v, half3 target);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    nextafter(half4 v, half4 target);
+#endif
+
+/*
+ * pow: Base raised to an exponent
+ *
+ * Returns base raised to the power exponent, i.e. base ^ exponent.
+ *
+ * pown() and powr() are similar.  pown() takes an integer exponent. powr() assumes the
+ * base to be non-negative.
+ */
+extern float __attribute__((const, overloadable))
+    pow(float base, float exponent);
+
+extern float2 __attribute__((const, overloadable))
+    pow(float2 base, float2 exponent);
+
+extern float3 __attribute__((const, overloadable))
+    pow(float3 base, float3 exponent);
+
+extern float4 __attribute__((const, overloadable))
+    pow(float4 base, float4 exponent);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    pow(half base, half exponent);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    pow(half2 base, half2 exponent);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    pow(half3 base, half3 exponent);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    pow(half4 base, half4 exponent);
+#endif
+
+/*
+ * pown: Base raised to an integer exponent
+ *
+ * Returns base raised to the power exponent, i.e. base ^ exponent.
+ *
+ * pow() and powr() are similar.  They both take a float exponent. powr() also assumes the
+ * base to be non-negative.
+ */
+extern float __attribute__((const, overloadable))
+    pown(float base, int exponent);
+
+extern float2 __attribute__((const, overloadable))
+    pown(float2 base, int2 exponent);
+
+extern float3 __attribute__((const, overloadable))
+    pown(float3 base, int3 exponent);
+
+extern float4 __attribute__((const, overloadable))
+    pown(float4 base, int4 exponent);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    pown(half base, int exponent);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    pown(half2 base, int2 exponent);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    pown(half3 base, int3 exponent);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    pown(half4 base, int4 exponent);
+#endif
+
+/*
+ * powr: Positive base raised to an exponent
+ *
+ * Returns base raised to the power exponent, i.e. base ^ exponent.  base must be >= 0.
+ *
+ * pow() and pown() are similar.  They both make no assumptions about the base.
+ * pow() takes a float exponent while pown() takes an integer.
+ *
+ * See also native_powr().
+ */
+extern float __attribute__((const, overloadable))
+    powr(float base, float exponent);
+
+extern float2 __attribute__((const, overloadable))
+    powr(float2 base, float2 exponent);
+
+extern float3 __attribute__((const, overloadable))
+    powr(float3 base, float3 exponent);
+
+extern float4 __attribute__((const, overloadable))
+    powr(float4 base, float4 exponent);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    powr(half base, half exponent);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    powr(half2 base, half2 exponent);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    powr(half3 base, half3 exponent);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    powr(half4 base, half4 exponent);
+#endif
+
+/*
+ * radians: Converts degrees into radians
+ *
+ * Converts from degrees to radians.
+ */
+extern float __attribute__((const, overloadable))
+    radians(float v);
+
+extern float2 __attribute__((const, overloadable))
+    radians(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    radians(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    radians(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    radians(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    radians(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    radians(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    radians(half4 v);
+#endif
+
+/*
+ * remainder: Remainder of a division
+ *
+ * Returns the remainder of (numerator / denominator), where the quotient is rounded towards
+ * the nearest integer.
+ *
+ * The function fmod() is similar but rounds the quotient toward zero rather than to the nearest integer.
+ * For example, fmod(-3.8f, 2.f) returns -1.8f (-3.8f - -1.f * 2.f)
+ * while remainder(-3.8f, 2.f) returns 0.2f (-3.8f - -2.f * 2.f).
+ */
+extern float __attribute__((const, overloadable))
+    remainder(float numerator, float denominator);
+
+extern float2 __attribute__((const, overloadable))
+    remainder(float2 numerator, float2 denominator);
+
+extern float3 __attribute__((const, overloadable))
+    remainder(float3 numerator, float3 denominator);
+
+extern float4 __attribute__((const, overloadable))
+    remainder(float4 numerator, float4 denominator);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    remainder(half numerator, half denominator);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    remainder(half2 numerator, half2 denominator);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    remainder(half3 numerator, half3 denominator);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    remainder(half4 numerator, half4 denominator);
+#endif
+
+/*
+ * remquo: Remainder and quotient of a division
+ *
+ * Returns the quotient and the remainder of (numerator / denominator).
+ *
+ * Only the sign and lowest three bits of the quotient are guaranteed to be accurate.
+ *
+ * This function is useful for implementing periodic functions.  The low three bits of the
+ * quotient give the quadrant, and the remainder gives the distance within the quadrant.
+ * For example, an implementation of sin(x) could call remquo(x, PI / 2.f, &quadrant)
+ * to reduce very large values of x to something within a limited range.
+ *
+ * Example: remquo(-23.5f, 8.f, &quot) sets the lowest three bits of quot to 3
+ * and the sign negative.  It returns 0.5f.
+ *
+ * Parameters:
+ *   numerator: Numerator.
+ *   denominator: Denominator.
+ *   quotient: *quotient will be set to the integer quotient.
+ *
+ * Returns: Remainder, precise only for the low three bits.
+ */
+extern float __attribute__((overloadable))
+    remquo(float numerator, float denominator, int* quotient);
+
+extern float2 __attribute__((overloadable))
+    remquo(float2 numerator, float2 denominator, int2* quotient);
+
+extern float3 __attribute__((overloadable))
+    remquo(float3 numerator, float3 denominator, int3* quotient);
+
+extern float4 __attribute__((overloadable))
+    remquo(float4 numerator, float4 denominator, int4* quotient);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((overloadable))
+    remquo(half numerator, half denominator, int* quotient);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((overloadable))
+    remquo(half2 numerator, half2 denominator, int2* quotient);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((overloadable))
+    remquo(half3 numerator, half3 denominator, int3* quotient);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((overloadable))
+    remquo(half4 numerator, half4 denominator, int4* quotient);
+#endif
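+
+/*
+ * Usage sketch for remquo() (illustrative only; the helper below is hypothetical and not
+ * part of this header):
+ *
+ *   static float quadrant_sin(float x) {
+ *       int quadrant;
+ *       // r falls in [-PI/4, PI/4]; the low bits of quadrant select the quarter turn.
+ *       float r = remquo(x, M_PI_2, &quadrant);
+ *       switch (quadrant & 3) {
+ *       case 0: return sin(r);
+ *       case 1: return cos(r);
+ *       case 2: return -sin(r);
+ *       default: return -cos(r);
+ *       }
+ *   }
+ */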
+
+/*
+ * rint: Round to even
+ *
+ * Rounds to the nearest integral value.
+ *
+ * rint() rounds half values to even.  For example, rint(0.5f) returns 0.f and
+ * rint(1.5f) returns 2.f.  Similarly, rint(-0.5f) returns -0.f and
+ * rint(-1.5f) returns -2.f.
+ *
+ * round() is similar but rounds away from zero.  trunc() truncates the decimal fraction.
+ */
+extern float __attribute__((const, overloadable))
+    rint(float v);
+
+extern float2 __attribute__((const, overloadable))
+    rint(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    rint(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    rint(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    rint(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    rint(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    rint(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    rint(half4 v);
+#endif
+
+/*
+ * rootn: Nth root
+ *
+ * Compute the Nth root of a value.
+ *
+ * See also native_rootn().
+ */
+extern float __attribute__((const, overloadable))
+    rootn(float v, int n);
+
+extern float2 __attribute__((const, overloadable))
+    rootn(float2 v, int2 n);
+
+extern float3 __attribute__((const, overloadable))
+    rootn(float3 v, int3 n);
+
+extern float4 __attribute__((const, overloadable))
+    rootn(float4 v, int4 n);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    rootn(half v, int n);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    rootn(half2 v, int2 n);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    rootn(half3 v, int3 n);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    rootn(half4 v, int4 n);
+#endif
+
+/*
+ * round: Round away from zero
+ *
+ * Round to the nearest integral value.
+ *
+ * round() rounds half values away from zero.  For example, round(0.5f) returns 1.f
+ * and round(1.5f) returns 2.f.  Similarly, round(-0.5f) returns -1.f
+ * and round(-1.5f) returns -2.f.
+ *
+ * rint() is similar but rounds half values toward even.  trunc() truncates the decimal fraction.
+ */
+extern float __attribute__((const, overloadable))
+    round(float v);
+
+extern float2 __attribute__((const, overloadable))
+    round(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    round(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    round(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    round(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    round(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    round(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    round(half4 v);
+#endif
+
+/*
+ * rsqrt: Reciprocal of a square root
+ *
+ * Returns (1 / sqrt(v)).
+ *
+ * See also half_rsqrt(), native_rsqrt().
+ */
+extern float __attribute__((const, overloadable))
+    rsqrt(float v);
+
+extern float2 __attribute__((const, overloadable))
+    rsqrt(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    rsqrt(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    rsqrt(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    rsqrt(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    rsqrt(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    rsqrt(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    rsqrt(half4 v);
+#endif
+
+/*
+ * sign: Sign of a value
+ *
+ * Returns the sign of a value.
+ *
+ * if (v < 0) return -1.f;
+ * else if (v > 0) return 1.f;
+ * else return 0.f;
+ */
+extern float __attribute__((const, overloadable))
+    sign(float v);
+
+extern float2 __attribute__((const, overloadable))
+    sign(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    sign(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    sign(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    sign(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    sign(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    sign(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    sign(half4 v);
+#endif
+
+/*
+ * sin: Sine
+ *
+ * Returns the sine of an angle measured in radians.
+ *
+ * See also native_sin().
+ */
+extern float __attribute__((const, overloadable))
+    sin(float v);
+
+extern float2 __attribute__((const, overloadable))
+    sin(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    sin(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    sin(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    sin(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    sin(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    sin(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    sin(half4 v);
+#endif
+
+/*
+ * sincos: Sine and cosine
+ *
+ * Returns the sine and cosine of a value.
+ *
+ * See also native_sincos().
+ *
+ * Parameters:
+ *   v: Incoming value in radians.
+ *   cos: *cos will be set to the cosine value.
+ *
+ * Returns: Sine of v.
+ */
+extern float __attribute__((overloadable))
+    sincos(float v, float* cos);
+
+extern float2 __attribute__((overloadable))
+    sincos(float2 v, float2* cos);
+
+extern float3 __attribute__((overloadable))
+    sincos(float3 v, float3* cos);
+
+extern float4 __attribute__((overloadable))
+    sincos(float4 v, float4* cos);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((overloadable))
+    sincos(half v, half* cos);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((overloadable))
+    sincos(half2 v, half2* cos);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((overloadable))
+    sincos(half3 v, half3* cos);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((overloadable))
+    sincos(half4 v, half4* cos);
+#endif
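+
+/*
+ * Usage sketch for sincos() (illustrative only; angle is a hypothetical value):
+ *
+ *   float2 p = {1.f, 0.f};
+ *   float c;
+ *   float s = sincos(angle, &c);   // one call yields both sin(angle) and cos(angle)
+ *   float2 rotated = (float2){p.x * c - p.y * s, p.x * s + p.y * c};
+ */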
+
+/*
+ * sinh: Hyperbolic sine
+ *
+ * Returns the hyperbolic sine of v, where v is measured in radians.
+ *
+ * See also native_sinh().
+ */
+extern float __attribute__((const, overloadable))
+    sinh(float v);
+
+extern float2 __attribute__((const, overloadable))
+    sinh(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    sinh(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    sinh(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    sinh(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    sinh(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    sinh(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    sinh(half4 v);
+#endif
+
+/*
+ * sinpi: Sine of a number multiplied by pi
+ *
+ * Returns the sine of (v * pi), where (v * pi) is measured in radians.
+ *
+ * To get the sine of a value measured in degrees, call sinpi(v / 180.f).
+ *
+ * See also native_sinpi().
+ */
+extern float __attribute__((const, overloadable))
+    sinpi(float v);
+
+extern float2 __attribute__((const, overloadable))
+    sinpi(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    sinpi(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    sinpi(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    sinpi(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    sinpi(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    sinpi(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    sinpi(half4 v);
+#endif
+
+/*
+ * sqrt: Square root
+ *
+ * Returns the square root of a value.
+ *
+ * See also half_sqrt(), native_sqrt().
+ */
+extern float __attribute__((const, overloadable))
+    sqrt(float v);
+
+extern float2 __attribute__((const, overloadable))
+    sqrt(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    sqrt(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    sqrt(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    sqrt(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    sqrt(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    sqrt(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    sqrt(half4 v);
+#endif
+
+/*
+ * step: 0 if less than a value, 1 otherwise
+ *
+ * Returns 0.f if v < edge, 1.f otherwise.
+ *
+ * This can be useful to create conditional computations without using loops and branching
+ * instructions.  For example, instead of computing (a[i] < b[i]) ? 0.f : atan2(a[i], b[i])
+ * for the corresponding elements of a vector, you could instead use step(b, a) * atan2(a, b).
+ */
+extern float __attribute__((const, overloadable))
+    step(float edge, float v);
+
+extern float2 __attribute__((const, overloadable))
+    step(float2 edge, float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    step(float3 edge, float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    step(float4 edge, float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    step(half edge, half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    step(half2 edge, half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    step(half3 edge, half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    step(half4 edge, half4 v);
+#endif
+
+extern float2 __attribute__((const, overloadable))
+    step(float2 edge, float v);
+
+extern float3 __attribute__((const, overloadable))
+    step(float3 edge, float v);
+
+extern float4 __attribute__((const, overloadable))
+    step(float4 edge, float v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    step(half2 edge, half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    step(half3 edge, half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    step(half4 edge, half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    step(float edge, float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    step(float edge, float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    step(float edge, float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    step(half edge, half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    step(half edge, half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    step(half edge, half4 v);
+#endif
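+
+/*
+ * Usage sketch for step() (illustrative only; in is a hypothetical float4):
+ *
+ *   // Zero out lanes below a threshold without branching.  This uses the scalar-edge
+ *   // overload, available from RS_VERSION 21.
+ *   float4 thresholded = in * step(0.25f, in);   // lanes of in below 0.25f become 0.f
+ */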
+
+/*
+ * tan: Tangent
+ *
+ * Returns the tangent of an angle measured in radians.
+ *
+ * See also native_tan().
+ */
+extern float __attribute__((const, overloadable))
+    tan(float v);
+
+extern float2 __attribute__((const, overloadable))
+    tan(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    tan(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    tan(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    tan(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    tan(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    tan(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    tan(half4 v);
+#endif
+
+/*
+ * tanh: Hyperbolic tangent
+ *
+ * Returns the hyperbolic tangent of a value.
+ *
+ * See also native_tanh().
+ */
+extern float __attribute__((const, overloadable))
+    tanh(float v);
+
+extern float2 __attribute__((const, overloadable))
+    tanh(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    tanh(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    tanh(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    tanh(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    tanh(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    tanh(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    tanh(half4 v);
+#endif
+
+/*
+ * tanpi: Tangent of a number multiplied by pi
+ *
+ * Returns the tangent of (v * pi), where (v * pi) is measured in radians.
+ *
+ * To get the tangent of a value measured in degrees, call tanpi(v / 180.f).
+ *
+ * See also native_tanpi().
+ */
+extern float __attribute__((const, overloadable))
+    tanpi(float v);
+
+extern float2 __attribute__((const, overloadable))
+    tanpi(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    tanpi(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    tanpi(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    tanpi(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    tanpi(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    tanpi(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    tanpi(half4 v);
+#endif
+
+/*
+ * tgamma: Gamma function
+ *
+ * Returns the gamma function of a value.
+ *
+ * See also lgamma().
+ */
+extern float __attribute__((const, overloadable))
+    tgamma(float v);
+
+extern float2 __attribute__((const, overloadable))
+    tgamma(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    tgamma(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    tgamma(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    tgamma(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    tgamma(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    tgamma(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    tgamma(half4 v);
+#endif
+
+/*
+ * trunc: Truncates a floating point value
+ *
+ * Rounds to integral using truncation.
+ *
+ * For example, trunc(1.7f) returns 1.f and trunc(-1.7f) returns -1.f.
+ *
+ * See rint() and round() for other rounding options.
+ */
+extern float __attribute__((const, overloadable))
+    trunc(float v);
+
+extern float2 __attribute__((const, overloadable))
+    trunc(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    trunc(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    trunc(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    trunc(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    trunc(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    trunc(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    trunc(half4 v);
+#endif
+
+/*
+ * rsClamp: Restrain a value to a range
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Clamp a value between low and high.
+ *
+ * Parameters:
+ *   amount: Value to clamp.
+ *   low: Lower bound.
+ *   high: Upper bound.
+ */
+extern char __attribute__((const, overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated("Use clamp() instead.")
+#endif
+))
+    rsClamp(char amount, char low, char high);
+
+extern uchar __attribute__((const, overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated("Use clamp() instead.")
+#endif
+))
+    rsClamp(uchar amount, uchar low, uchar high);
+
+extern short __attribute__((const, overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated("Use clamp() instead.")
+#endif
+))
+    rsClamp(short amount, short low, short high);
+
+extern ushort __attribute__((const, overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated("Use clamp() instead.")
+#endif
+))
+    rsClamp(ushort amount, ushort low, ushort high);
+
+extern int __attribute__((const, overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated("Use clamp() instead.")
+#endif
+))
+    rsClamp(int amount, int low, int high);
+
+extern uint __attribute__((const, overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated("Use clamp() instead.")
+#endif
+))
+    rsClamp(uint amount, uint low, uint high);
+
+/*
+ * rsFrac: Returns the fractional part of a float
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Returns the fractional part of a float
+ */
+extern float __attribute__((const, overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated("Use fract() instead.")
+#endif
+))
+    rsFrac(float v);
+
+/*
+ * rsRand: Pseudo-random number
+ *
+ * Returns a random value between 0 (or min_value) and max_value.
+ */
+extern int __attribute__((overloadable))
+    rsRand(int max_value);
+
+extern int __attribute__((overloadable))
+    rsRand(int min_value, int max_value);
+
+extern float __attribute__((overloadable))
+    rsRand(float max_value);
+
+extern float __attribute__((overloadable))
+    rsRand(float min_value, float max_value);
+
+#endif // RENDERSCRIPT_RS_MATH_RSH
diff --git a/25.0.2/include/rs_matrix.rsh b/25.0.2/include/rs_matrix.rsh
new file mode 100644
index 0000000..9cdc27f
--- /dev/null
+++ b/25.0.2/include/rs_matrix.rsh
@@ -0,0 +1,612 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Don't edit this file!  It is auto-generated by frameworks/rs/api/generate.sh.
+
+/*
+ * rs_matrix.rsh: Matrix Functions
+ *
+ * These functions let you manipulate square matrices of rank 2x2, 3x3, and 4x4.
+ * They are particularly useful for graphical transformations and are compatible
+ * with OpenGL.
+ *
+ * We use a zero-based index for rows and columns.  E.g. the last element of a
+ * rs_matrix4x4 is found at (3, 3).
+ *
+ * RenderScript uses column-major matrices and column-based vectors.  Transforming
+ * a vector is done by postmultiplying the vector, e.g. (matrix * vector),
+ * as provided by rsMatrixMultiply().
+ *
+ * To create a transformation matrix that performs two transformations at once,
+ * multiply the two source matrices, with the first transformation as the right
+ * argument.  E.g. to create a transformation matrix that applies the
+ * transformation s1 followed by s2, call rsMatrixLoadMultiply(&combined, &s2, &s1).
+ * This derives from s2 * (s1 * v), which is (s2 * s1) * v.
+ *
+ * We have two styles of functions to create transformation matrices:
+ * rsMatrixLoadTransformation and rsMatrixTransformation.  The former
+ * style simply stores the transformation matrix in the first argument.  The latter
+ * modifies a pre-existing transformation matrix so that the new transformation
+ * happens first.  E.g. if you call rsMatrixTranslate() on a matrix that already
+ * does a scaling, the resulting matrix when applied to a vector will first do the
+ * translation then the scaling.
+ */
+
+#ifndef RENDERSCRIPT_RS_MATRIX_RSH
+#define RENDERSCRIPT_RS_MATRIX_RSH
+
+#include "rs_vector_math.rsh"
+
+/*
+ * rsExtractFrustumPlanes: Compute frustum planes
+ *
+ * Computes the 6 frustum planes from the view projection matrix.
+ *
+ * Parameters:
+ *   viewProj: Matrix to extract planes from.
+ *   left: Left plane.
+ *   right: Right plane.
+ *   top: Top plane.
+ *   bottom: Bottom plane.
+ *   near: Near plane.
+ *   far: Far plane.
+ */
+#if !defined(RS_VERSION) || (RS_VERSION <= 23)
+static inline void __attribute__((overloadable))
+    rsExtractFrustumPlanes(const rs_matrix4x4* viewProj, float4* left, float4* right, float4* top,
+                           float4* bottom, float4* near, float4* far) {
+    // x y z w = a b c d in the plane equation
+    left->x = viewProj->m[3] + viewProj->m[0];
+    left->y = viewProj->m[7] + viewProj->m[4];
+    left->z = viewProj->m[11] + viewProj->m[8];
+    left->w = viewProj->m[15] + viewProj->m[12];
+
+    right->x = viewProj->m[3] - viewProj->m[0];
+    right->y = viewProj->m[7] - viewProj->m[4];
+    right->z = viewProj->m[11] - viewProj->m[8];
+    right->w = viewProj->m[15] - viewProj->m[12];
+
+    top->x = viewProj->m[3] - viewProj->m[1];
+    top->y = viewProj->m[7] - viewProj->m[5];
+    top->z = viewProj->m[11] - viewProj->m[9];
+    top->w = viewProj->m[15] - viewProj->m[13];
+
+    bottom->x = viewProj->m[3] + viewProj->m[1];
+    bottom->y = viewProj->m[7] + viewProj->m[5];
+    bottom->z = viewProj->m[11] + viewProj->m[9];
+    bottom->w = viewProj->m[15] + viewProj->m[13];
+
+    near->x = viewProj->m[3] + viewProj->m[2];
+    near->y = viewProj->m[7] + viewProj->m[6];
+    near->z = viewProj->m[11] + viewProj->m[10];
+    near->w = viewProj->m[15] + viewProj->m[14];
+
+    far->x = viewProj->m[3] - viewProj->m[2];
+    far->y = viewProj->m[7] - viewProj->m[6];
+    far->z = viewProj->m[11] - viewProj->m[10];
+    far->w = viewProj->m[15] - viewProj->m[14];
+
+    float len = length(left->xyz);
+    *left /= len;
+    len = length(right->xyz);
+    *right /= len;
+    len = length(top->xyz);
+    *top /= len;
+    len = length(bottom->xyz);
+    *bottom /= len;
+    len = length(near->xyz);
+    *near /= len;
+    len = length(far->xyz);
+    *far /= len;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern void __attribute__((overloadable))
+    rsExtractFrustumPlanes(const rs_matrix4x4* viewProj, float4* left, float4* right, float4* top,
+                           float4* bottom, float4* near, float4* far);
+#endif
+
+/*
+ * rsIsSphereInFrustum: Checks if a sphere is within the frustum planes
+ *
+ * Returns true if the sphere is within the 6 frustum planes.
+ *
+ * Parameters:
+ *   sphere: float4 representing the sphere.
+ *   left: Left plane.
+ *   right: Right plane.
+ *   top: Top plane.
+ *   bottom: Bottom plane.
+ *   near: Near plane.
+ *   far: Far plane.
+ */
+#if !defined(RS_VERSION) || (RS_VERSION <= 23)
+static inline bool __attribute__((always_inline, overloadable))
+    rsIsSphereInFrustum(float4* sphere, float4* left, float4* right, float4* top, float4* bottom,
+                        float4* near, float4* far) {
+    float distToCenter = dot(left->xyz, sphere->xyz) + left->w;
+    if (distToCenter < -sphere->w) {
+        return false;
+    }
+    distToCenter = dot(right->xyz, sphere->xyz) + right->w;
+    if (distToCenter < -sphere->w) {
+        return false;
+    }
+    distToCenter = dot(top->xyz, sphere->xyz) + top->w;
+    if (distToCenter < -sphere->w) {
+        return false;
+    }
+    distToCenter = dot(bottom->xyz, sphere->xyz) + bottom->w;
+    if (distToCenter < -sphere->w) {
+        return false;
+    }
+    distToCenter = dot(near->xyz, sphere->xyz) + near->w;
+    if (distToCenter < -sphere->w) {
+        return false;
+    }
+    distToCenter = dot(far->xyz, sphere->xyz) + far->w;
+    if (distToCenter < -sphere->w) {
+        return false;
+    }
+    return true;
+}
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern bool __attribute__((overloadable))
+    rsIsSphereInFrustum(float4* sphere, float4* left, float4* right, float4* top, float4* bottom,
+                        float4* near, float4* far);
+#endif
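+
+/*
+ * Usage sketch combining rsExtractFrustumPlanes() and rsIsSphereInFrustum()
+ * (illustrative only; viewProj is a hypothetical view-projection matrix):
+ *
+ *   float4 left, right, top, bottom, near, far;
+ *   rsExtractFrustumPlanes(&viewProj, &left, &right, &top, &bottom, &near, &far);
+ *
+ *   float4 sphere = {0.f, 0.f, -10.f, 2.f};   // center in xyz, radius in w
+ *   if (rsIsSphereInFrustum(&sphere, &left, &right, &top, &bottom, &near, &far)) {
+ *       // The sphere is at least partially inside the frustum.
+ *   }
+ */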
+
+/*
+ * rsMatrixGet: Get one element
+ *
+ * Returns one element of a matrix.
+ *
+ * Warning: The order of the column and row parameters may be unexpected.
+ *
+ * Parameters:
+ *   m: Matrix to extract the element from.
+ *   col: Zero-based column of the element to be extracted.
+ *   row: Zero-based row of the element to be extracted.
+ */
+extern float __attribute__((overloadable))
+    rsMatrixGet(const rs_matrix4x4* m, uint32_t col, uint32_t row);
+
+extern float __attribute__((overloadable))
+    rsMatrixGet(const rs_matrix3x3* m, uint32_t col, uint32_t row);
+
+extern float __attribute__((overloadable))
+    rsMatrixGet(const rs_matrix2x2* m, uint32_t col, uint32_t row);
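+
+/*
+ * Usage sketch for rsMatrixGet() (illustrative only; m is a hypothetical rs_matrix4x4):
+ *
+ *   // Note the argument order: column first, then row.
+ *   float v = rsMatrixGet(&m, 2, 3);   // element at column 2, row 3
+ */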
+
+/*
+ * rsMatrixInverse: Inverts a matrix in place
+ *
+ * Returns true if the matrix was successfully inverted.
+ *
+ * Parameters:
+ *   m: Matrix to invert.
+ */
+extern bool __attribute__((overloadable))
+    rsMatrixInverse(rs_matrix4x4* m);
+
+/*
+ * rsMatrixInverseTranspose: Inverts and transposes a matrix in place
+ *
+ * The matrix is first inverted then transposed. Returns true if the matrix was
+ * successfully inverted.
+ *
+ * Parameters:
+ *   m: Matrix to modify.
+ */
+extern bool __attribute__((overloadable))
+    rsMatrixInverseTranspose(rs_matrix4x4* m);
+
+/*
+ * rsMatrixLoad: Load or copy a matrix
+ *
+ * Set the elements of a matrix from an array of floats or from another matrix.
+ *
+ * If loading from an array, the floats should be in row-major order, i.e. the element at
+ * row 0, column 0 should be first, followed by the element at
+ * row 0, column 1, etc.
+ *
+ * If loading from a matrix and the source is smaller than the destination, the rest
+ * of the destination is filled with elements of the identity matrix.  E.g.
+ * loading a rs_matrix2x2 into a rs_matrix4x4 will give:
+ *
+ * m00 m01 0.0 0.0
+ * m10 m11 0.0 0.0
+ * 0.0 0.0 1.0 0.0
+ * 0.0 0.0 0.0 1.0
+ *
+ *
+ * Parameters:
+ *   destination: Matrix to set.
+ *   array: Array of values to set the matrix to. These arrays should be 4, 9, or 16 floats long, depending on the matrix size.
+ *   source: Source matrix.
+ */
+extern void __attribute__((overloadable))
+    rsMatrixLoad(rs_matrix4x4* destination, const float* array);
+
+extern void __attribute__((overloadable))
+    rsMatrixLoad(rs_matrix3x3* destination, const float* array);
+
+extern void __attribute__((overloadable))
+    rsMatrixLoad(rs_matrix2x2* destination, const float* array);
+
+extern void __attribute__((overloadable))
+    rsMatrixLoad(rs_matrix4x4* destination, const rs_matrix4x4* source);
+
+extern void __attribute__((overloadable))
+    rsMatrixLoad(rs_matrix3x3* destination, const rs_matrix3x3* source);
+
+extern void __attribute__((overloadable))
+    rsMatrixLoad(rs_matrix2x2* destination, const rs_matrix2x2* source);
+
+extern void __attribute__((overloadable))
+    rsMatrixLoad(rs_matrix4x4* destination, const rs_matrix3x3* source);
+
+extern void __attribute__((overloadable))
+    rsMatrixLoad(rs_matrix4x4* destination, const rs_matrix2x2* source);
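+
+/*
+ * Usage sketch for rsMatrixLoad() (illustrative only):
+ *
+ *   // Values are given in row-major order: {row0col0, row0col1, row1col0, row1col1}.
+ *   const float values[4] = {1.f, 2.f, 3.f, 4.f};
+ *   rs_matrix2x2 m;
+ *   rsMatrixLoad(&m, values);
+ */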
+
+/*
+ * rsMatrixLoadFrustum: Load a frustum projection matrix
+ *
+ * Constructs a frustum projection matrix, transforming the box identified by
+ * the six clipping planes left, right, bottom, top, near, far.
+ *
+ * To apply this projection to a vector, multiply the vector by the created
+ * matrix using rsMatrixMultiply().
+ *
+ * Parameters:
+ *   m: Matrix to set.
+ */
+extern void __attribute__((overloadable))
+    rsMatrixLoadFrustum(rs_matrix4x4* m, float left, float right, float bottom, float top,
+                        float near, float far);
+
+/*
+ * rsMatrixLoadIdentity: Load identity matrix
+ *
+ * Set the elements of a matrix to the identity matrix.
+ *
+ * Parameters:
+ *   m: Matrix to set.
+ */
+extern void __attribute__((overloadable))
+    rsMatrixLoadIdentity(rs_matrix4x4* m);
+
+extern void __attribute__((overloadable))
+    rsMatrixLoadIdentity(rs_matrix3x3* m);
+
+extern void __attribute__((overloadable))
+    rsMatrixLoadIdentity(rs_matrix2x2* m);
+
+/*
+ * rsMatrixLoadMultiply: Multiply two matrices
+ *
+ * Sets m to the matrix product of lhs * rhs.
+ *
+ * To combine two 4x4 transformation matrices, multiply the second transformation matrix
+ * by the first transformation matrix.  E.g. to create a transformation matrix that applies
+ * the transformation s1 followed by s2, call rsMatrixLoadMultiply(&combined, &s2, &s1).
+ *
+ * Warning: Prior to version 21, storing the result back into the right matrix is not supported
+ * and will result in undefined behavior.  Use rsMatrixMultiply() instead.  E.g. instead of
+ * rsMatrixLoadMultiply(&m2r, &m2r, &m2l), use rsMatrixMultiply(&m2r, &m2l).
+ * rsMatrixLoadMultiply(&m2l, &m2r, &m2l) works as expected.
+ *
+ * Parameters:
+ *   m: Matrix to set.
+ *   lhs: Left matrix of the product.
+ *   rhs: Right matrix of the product.
+ */
+extern void __attribute__((overloadable))
+    rsMatrixLoadMultiply(rs_matrix4x4* m, const rs_matrix4x4* lhs, const rs_matrix4x4* rhs);
+
+extern void __attribute__((overloadable))
+    rsMatrixLoadMultiply(rs_matrix3x3* m, const rs_matrix3x3* lhs, const rs_matrix3x3* rhs);
+
+extern void __attribute__((overloadable))
+    rsMatrixLoadMultiply(rs_matrix2x2* m, const rs_matrix2x2* lhs, const rs_matrix2x2* rhs);
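+
+/*
+ * Usage sketch for rsMatrixLoadMultiply() (illustrative only; matrix names are hypothetical):
+ *
+ *   rs_matrix4x4 scale, translate, combined;
+ *   rsMatrixLoadScale(&scale, 2.f, 2.f, 2.f);
+ *   rsMatrixLoadTranslate(&translate, 10.f, 0.f, 0.f);
+ *   // combined applies the translation first, then the scaling: combined = scale * translate.
+ *   rsMatrixLoadMultiply(&combined, &scale, &translate);
+ */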
+
+/*
+ * rsMatrixLoadOrtho: Load an orthographic projection matrix
+ *
+ * Constructs an orthographic projection matrix, transforming the box identified by the
+ * six clipping planes left, right, bottom, top, near, far into a unit cube
+ * with a corner at (-1, -1, -1) and the opposite at (1, 1, 1).
+ *
+ * To apply this projection to a vector, multiply the vector by the created matrix
+ * using rsMatrixMultiply().
+ *
+ * See https://en.wikipedia.org/wiki/Orthographic_projection .
+ *
+ * Parameters:
+ *   m: Matrix to set.
+ */
+extern void __attribute__((overloadable))
+    rsMatrixLoadOrtho(rs_matrix4x4* m, float left, float right, float bottom, float top, float near,
+                      float far);
+
+/*
+ * rsMatrixLoadPerspective: Load a perspective projection matrix
+ *
+ * Constructs a perspective projection matrix, assuming a symmetrical field of view.
+ *
+ * To apply this projection to a vector, multiply the vector by the created matrix
+ * using rsMatrixMultiply().
+ *
+ * Parameters:
+ *   m: Matrix to set.
+ *   fovy: Field of view, in degrees along the Y axis.
+ *   aspect: Ratio of x / y.
+ *   near: Near clipping plane.
+ *   far: Far clipping plane.
+ */
+extern void __attribute__((overloadable))
+    rsMatrixLoadPerspective(rs_matrix4x4* m, float fovy, float aspect, float near, float far);
+
+/*
+ * rsMatrixLoadRotate: Load a rotation matrix
+ *
+ * This function creates a rotation matrix.  The axis of rotation is the (x, y, z) vector.
+ *
+ * To rotate a vector, multiply the vector by the created matrix using rsMatrixMultiply().
+ *
+ * See http://en.wikipedia.org/wiki/Rotation_matrix .
+ *
+ * Parameters:
+ *   m: Matrix to set.
+ *   rot: How much rotation to do, in degrees.
+ *   x: X component of the vector that is the axis of rotation.
+ *   y: Y component of the vector that is the axis of rotation.
+ *   z: Z component of the vector that is the axis of rotation.
+ */
+extern void __attribute__((overloadable))
+    rsMatrixLoadRotate(rs_matrix4x4* m, float rot, float x, float y, float z);
+
+/*
+ * rsMatrixLoadScale: Load a scaling matrix
+ *
+ * This function creates a scaling matrix, where each component of a vector is multiplied
+ * by a number.  This number can be negative.
+ *
+ * To scale a vector, multiply the vector by the created matrix using rsMatrixMultiply().
+ *
+ * Parameters:
+ *   m: Matrix to set.
+ *   x: Multiple to scale the x components by.
+ *   y: Multiple to scale the y components by.
+ *   z: Multiple to scale the z components by.
+ */
+extern void __attribute__((overloadable))
+    rsMatrixLoadScale(rs_matrix4x4* m, float x, float y, float z);
+
+/*
+ * rsMatrixLoadTranslate: Load a translation matrix
+ *
+ * This function creates a translation matrix, where a number is added to each element of
+ * a vector.
+ *
+ * To translate a vector, multiply the vector by the created matrix using
+ * rsMatrixMultiply().
+ *
+ * Parameters:
+ *   m: Matrix to set.
+ *   x: Number to add to each x component.
+ *   y: Number to add to each y component.
+ *   z: Number to add to each z component.
+ */
+extern void __attribute__((overloadable))
+    rsMatrixLoadTranslate(rs_matrix4x4* m, float x, float y, float z);
+
+/*
+ * rsMatrixMultiply: Multiply a matrix by a vector or another matrix
+ *
+ * For the matrix by matrix variant, sets m to the matrix product m * rhs.
+ *
+ * When combining two 4x4 transformation matrices using this function, the resulting
+ * matrix will correspond to performing the rhs transformation first followed by
+ * the original m transformation.
+ *
+ * For the matrix by vector variant, returns the post-multiplication of the vector
+ * by the matrix, i.e. m * in.
+ *
+ * When a float3 is multiplied by an rs_matrix4x4, the vector is expanded with (1).
+ *
+ * When a float2 is multiplied by an rs_matrix4x4, the vector is expanded with (0, 1).
+ *
+ * When a float2 is multiplied by an rs_matrix3x3, the vector is expanded with (0).
+ *
+ * Starting with API 14, this function takes a const matrix as the first argument.
+ *
+ * Parameters:
+ *   m: Left matrix of the product and the matrix to be set.
+ *   rhs: Right matrix of the product.
+ */
+extern void __attribute__((overloadable))
+    rsMatrixMultiply(rs_matrix4x4* m, const rs_matrix4x4* rhs);
+
+extern void __attribute__((overloadable))
+    rsMatrixMultiply(rs_matrix3x3* m, const rs_matrix3x3* rhs);
+
+extern void __attribute__((overloadable))
+    rsMatrixMultiply(rs_matrix2x2* m, const rs_matrix2x2* rhs);
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 13)
+extern float4 __attribute__((overloadable))
+    rsMatrixMultiply(rs_matrix4x4* m, float4 in);
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 13)
+extern float4 __attribute__((overloadable))
+    rsMatrixMultiply(rs_matrix4x4* m, float3 in);
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 13)
+extern float4 __attribute__((overloadable))
+    rsMatrixMultiply(rs_matrix4x4* m, float2 in);
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 13)
+extern float3 __attribute__((overloadable))
+    rsMatrixMultiply(rs_matrix3x3* m, float3 in);
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 13)
+extern float3 __attribute__((overloadable))
+    rsMatrixMultiply(rs_matrix3x3* m, float2 in);
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 13)
+extern float2 __attribute__((overloadable))
+    rsMatrixMultiply(rs_matrix2x2* m, float2 in);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+extern float4 __attribute__((overloadable))
+    rsMatrixMultiply(const rs_matrix4x4* m, float4 in);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+extern float4 __attribute__((overloadable))
+    rsMatrixMultiply(const rs_matrix4x4* m, float3 in);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+extern float4 __attribute__((overloadable))
+    rsMatrixMultiply(const rs_matrix4x4* m, float2 in);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+extern float3 __attribute__((overloadable))
+    rsMatrixMultiply(const rs_matrix3x3* m, float3 in);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+extern float3 __attribute__((overloadable))
+    rsMatrixMultiply(const rs_matrix3x3* m, float2 in);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+extern float2 __attribute__((overloadable))
+    rsMatrixMultiply(const rs_matrix2x2* m, float2 in);
+#endif
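+
+/*
+ * Usage sketch for rsMatrixMultiply() (illustrative only; names are hypothetical):
+ *
+ *   rs_matrix4x4 model;
+ *   rsMatrixLoadRotate(&model, 45.f, 0.f, 0.f, 1.f);   // 45 degrees around the Z axis
+ *
+ *   float3 p = {1.f, 0.f, 0.f};
+ *   // p is expanded to (p.x, p.y, p.z, 1) before the multiplication.
+ *   float4 transformed = rsMatrixMultiply(&model, p);
+ */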
+
+/*
+ * rsMatrixRotate: Apply a rotation to a transformation matrix
+ *
+ * Multiply the matrix m with a rotation matrix.
+ *
+ * This function modifies a transformation matrix to first do a rotation.  The axis of
+ * rotation is the (x, y, z) vector.
+ *
+ * To apply this combined transformation to a vector, multiply the vector by the created
+ * matrix using rsMatrixMultiply().
+ *
+ * Parameters:
+ *   m: Matrix to modify.
+ *   rot: How much rotation to do, in degrees.
+ *   x: X component of the vector that is the axis of rotation.
+ *   y: Y component of the vector that is the axis of rotation.
+ *   z: Z component of the vector that is the axis of rotation.
+ */
+extern void __attribute__((overloadable))
+    rsMatrixRotate(rs_matrix4x4* m, float rot, float x, float y, float z);
+
+/*
+ * rsMatrixScale: Apply a scaling to a transformation matrix
+ *
+ * Multiply the matrix m with a scaling matrix.
+ *
+ * This function modifies a transformation matrix to first do a scaling.   When scaling,
+ * each component of a vector is multiplied by a number.  This number can be negative.
+ *
+ * To apply this combined transformation to a vector, multiply the vector by the created
+ * matrix using rsMatrixMultiply().
+ *
+ * Parameters:
+ *   m: Matrix to modify.
+ *   x: Multiple to scale the x components by.
+ *   y: Multiple to scale the y components by.
+ *   z: Multiple to scale the z components by.
+ */
+extern void __attribute__((overloadable))
+    rsMatrixScale(rs_matrix4x4* m, float x, float y, float z);
+
+/*
+ * rsMatrixSet: Set one element
+ *
+ * Set an element of a matrix.
+ *
+ * Warning: The order of the column and row parameters may be unexpected.
+ *
+ * Parameters:
+ *   m: Matrix that will be modified.
+ *   col: Zero-based column of the element to be set.
+ *   row: Zero-based row of the element to be set.
+ *   v: Value to set.
+ */
+extern void __attribute__((overloadable))
+    rsMatrixSet(rs_matrix4x4* m, uint32_t col, uint32_t row, float v);
+
+extern void __attribute__((overloadable))
+    rsMatrixSet(rs_matrix3x3* m, uint32_t col, uint32_t row, float v);
+
+extern void __attribute__((overloadable))
+    rsMatrixSet(rs_matrix2x2* m, uint32_t col, uint32_t row, float v);
+
+/*
+ * rsMatrixTranslate: Apply a translation to a transformation matrix
+ *
+ * Multiply the matrix m with a translation matrix.
+ *
+ * This function modifies a transformation matrix to first do a translation.  When
+ * translating, a number is added to each component of a vector.
+ *
+ * To apply this combined transformation to a vector, multiply the vector by the
+ * created matrix using rsMatrixMultiply().
+ *
+ * Parameters:
+ *   m: Matrix to modify.
+ *   x: Number to add to each x component.
+ *   y: Number to add to each y component.
+ *   z: Number to add to each z component.
+ */
+extern void __attribute__((overloadable))
+    rsMatrixTranslate(rs_matrix4x4* m, float x, float y, float z);
+
+/*
+ * rsMatrixTranspose: Transpose a matrix in place
+ *
+ * Transpose the matrix m in place.
+ *
+ * Parameters:
+ *   m: Matrix to transpose.
+ */
+extern void __attribute__((overloadable))
+    rsMatrixTranspose(rs_matrix4x4* m);
+
+extern void __attribute__((overloadable))
+    rsMatrixTranspose(rs_matrix3x3* m);
+
+extern void __attribute__((overloadable))
+    rsMatrixTranspose(rs_matrix2x2* m);
+
+#endif // RENDERSCRIPT_RS_MATRIX_RSH
diff --git a/25.0.2/include/rs_object_info.rsh b/25.0.2/include/rs_object_info.rsh
new file mode 100644
index 0000000..0b18de3
--- /dev/null
+++ b/25.0.2/include/rs_object_info.rsh
@@ -0,0 +1,462 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Don't edit this file!  It is auto-generated by frameworks/rs/api/generate.sh.
+
+/*
+ * rs_object_info.rsh: Object Characteristics Functions
+ *
+ * The functions below can be used to query the characteristics of an Allocation, Element,
+ * or Sampler object.  These objects are created from Java.  You can't create them from a
+ * script.
+ *
+ * Allocations:
+ *
+ * Allocations are the primary method used to pass data to and from RenderScript kernels.
+ *
+ * They are a structured collection of cells that can be used to store bitmaps, textures,
+ * arbitrary data points, etc.
+ *
+ * This collection of cells may have many dimensions (X, Y, Z, Array0, Array1, Array2, Array3),
+ * faces (for cubemaps), and levels of detail (for mipmapping).
+ *
+ * See android.renderscript.Allocation for details on how to create Allocations.
+ *
+ * Elements:
+ *
+ * The term "element" is used a bit ambiguously in RenderScript, as both type information
+ * for the cells of an Allocation and the instantiation of that type.  For example:
+ * - rs_element is a handle to a type specification, and
+ * - In functions like rsGetElementAt(), "element" means the instantiation of the type,
+ *     i.e. a cell of an Allocation.
+ *
+ * The functions below let you query the characteristics of the type specification.
+ *
+ * An Element can specify a simple data type as found in C, e.g. an integer, float, or
+ * boolean.  It can also specify a handle to a RenderScript object.  See rs_data_type for
+ * a list of basic types.
+ *
+ * Elements can specify fixed size vector (of size 2, 3, or 4) versions of the basic types.
+ * Elements can be grouped together into complex Elements, creating the equivalent of
+ * C structure definitions.
+ *
+ * Elements can also have a kind, which is semantic information used to interpret pixel
+ * data.  See rs_data_kind.
+ *
+ * When creating Allocations of common elements, you can simply use one of the many predefined
+ * Elements like F32_2.
+ *
+ * To create complex Elements, use the Element.Builder Java class.
+ *
+ * Samplers:
+ *
+ * Sampler objects define how Allocations can be read as structures within a kernel.
+ * See android.renderscript.S.
+ */
+
+#ifndef RENDERSCRIPT_RS_OBJECT_INFO_RSH
+#define RENDERSCRIPT_RS_OBJECT_INFO_RSH
+
+/*
+ * rsAllocationGetDimFaces: Presence of more than one face
+ *
+ * If the Allocation is a cubemap, this function returns 1 if there's more than one face
+ * present.  In all other cases, it returns 0.
+ *
+ * Use rsGetDimHasFaces() to get the dimension of a currently running kernel.
+ *
+ * Returns: Returns 1 if more than one face is present, 0 otherwise.
+ */
+extern uint32_t __attribute__((overloadable))
+    rsAllocationGetDimFaces(rs_allocation a);
+
+/*
+ * rsAllocationGetDimLOD: Presence of levels of detail
+ *
+ * Query an Allocation for the presence of more than one Level Of Detail.  This is useful
+ * for mipmaps.
+ *
+ * Use rsGetDimLod() to get the dimension of a currently running kernel.
+ *
+ * Returns: Returns 1 if more than one LOD is present, 0 otherwise.
+ */
+extern uint32_t __attribute__((overloadable))
+    rsAllocationGetDimLOD(rs_allocation a);
+
+/*
+ * rsAllocationGetDimX: Size of the X dimension
+ *
+ * Returns the size of the X dimension of the Allocation.
+ *
+ * Use rsGetDimX() to get the dimension of a currently running kernel.
+ *
+ * Returns: X dimension of the Allocation.
+ */
+extern uint32_t __attribute__((overloadable))
+    rsAllocationGetDimX(rs_allocation a);
+
+/*
+ * rsAllocationGetDimY: Size of the Y dimension
+ *
+ * Returns the size of the Y dimension of the Allocation.  If the Allocation has less
+ * than two dimensions, returns 0.
+ *
+ * Use rsGetDimY() to get the dimension of a currently running kernel.
+ *
+ * Returns: Y dimension of the Allocation.
+ */
+extern uint32_t __attribute__((overloadable))
+    rsAllocationGetDimY(rs_allocation a);
+
+/*
+ * rsAllocationGetDimZ: Size of the Z dimension
+ *
+ * Returns the size of the Z dimension of the Allocation.  If the Allocation has less
+ * than three dimensions, returns 0.
+ *
+ * Use rsGetDimZ() to get the dimension of a currently running kernel.
+ *
+ * Returns: Z dimension of the Allocation.
+ */
+extern uint32_t __attribute__((overloadable))
+    rsAllocationGetDimZ(rs_allocation a);
+
+/*
+ * rsAllocationGetElement: Get the object that describes the cell of an Allocation
+ *
+ * Get the Element object describing the type, kind, and other characteristics of a cell
+ * of an Allocation.  See the rsElement* functions below.
+ *
+ * Parameters:
+ *   a: Allocation to get data from.
+ *
+ * Returns: Element describing Allocation layout.
+ */
+extern rs_element __attribute__((overloadable))
+    rsAllocationGetElement(rs_allocation a);
+
+/*
+ * rsClearObject: Release an object
+ *
+ * Tells the run time that this handle will no longer be used to access the related
+ * object.  If this was the last handle to that object, resource recovery may happen.
+ *
+ * After calling this function, *dst will be set to an empty handle.  See rsIsObject().
+ */
+extern void __attribute__((overloadable))
+    rsClearObject(rs_element* dst);
+
+extern void __attribute__((overloadable))
+    rsClearObject(rs_type* dst);
+
+extern void __attribute__((overloadable))
+    rsClearObject(rs_allocation* dst);
+
+extern void __attribute__((overloadable))
+    rsClearObject(rs_sampler* dst);
+
+extern void __attribute__((overloadable))
+    rsClearObject(rs_script* dst);
+
+/*
+ * rsIsObject: Check for an empty handle
+ *
+ * Returns true if the handle contains a non-null reference.
+ *
+ * This function does not validate that the internal pointer used in the handle
+ * points to an actual valid object; it only checks for null.
+ *
+ * This function can be used to check the Element returned by rsElementGetSubElement()
+ * or see if rsClearObject() has been called on a handle.
+ */
+extern bool __attribute__((overloadable))
+    rsIsObject(rs_element v);
+
+extern bool __attribute__((overloadable))
+    rsIsObject(rs_type v);
+
+extern bool __attribute__((overloadable))
+    rsIsObject(rs_allocation v);
+
+extern bool __attribute__((overloadable))
+    rsIsObject(rs_sampler v);
+
+extern bool __attribute__((overloadable))
+    rsIsObject(rs_script v);
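+
+/*
+ * Illustrative example (editor's sketch): releasing a script global that was
+ * set from Java.  The global name gLut and the invokable tearDown() are
+ * hypothetical.
+ *
+ *   rs_allocation gLut;   // assigned from Java through the reflected setter
+ *
+ *   void tearDown() {
+ *       if (rsIsObject(gLut)) {
+ *           rsClearObject(&gLut);   // gLut is now an empty handle
+ *       }
+ *   }
+ */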
+
+/*
+ * rsElementGetBytesSize: Size of an Element
+ *
+ * Returns the size in bytes that an instantiation of this Element will occupy.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+extern uint32_t __attribute__((overloadable))
+    rsElementGetBytesSize(rs_element e);
+#endif
+
+/*
+ * rsElementGetDataKind: Kind of an Element
+ *
+ * Returns the Element's data kind.  This is used to interpret pixel data.
+ *
+ * See rs_data_kind.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+extern rs_data_kind __attribute__((overloadable))
+    rsElementGetDataKind(rs_element e);
+#endif
+
+/*
+ * rsElementGetDataType: Data type of an Element
+ *
+ * Returns the Element's base data type.  This can be a type similar to C/C++ (e.g.
+ * RS_TYPE_UNSIGNED_8), a handle (e.g. RS_TYPE_ALLOCATION and RS_TYPE_ELEMENT), or a
+ * more complex numerical type (e.g. RS_TYPE_UNSIGNED_5_6_5 and RS_TYPE_MATRIX_4X4).
+ * See rs_data_type.
+ *
+ * If the Element describes a vector, this function returns the data type of one of its items.
+ * Use rsElementGetVectorSize to get the size of the vector.
+ *
+ * If the Element describes a structure, RS_TYPE_NONE is returned.  Use the rsElementGetSub*
+ * functions to explore this complex Element.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+extern rs_data_type __attribute__((overloadable))
+    rsElementGetDataType(rs_element e);
+#endif
+
+/*
+ * rsElementGetSubElement: Sub-element of a complex Element
+ *
+ * For Elements that represent a structure, this function returns the sub-element at the
+ * specified index.
+ *
+ * If the Element is not a structure or the index is greater or equal to the number of
+ * sub-elements, an invalid handle is returned.
+ *
+ * Parameters:
+ *   e: Element to query.
+ *   index: Index of the sub-element to return.
+ *
+ * Returns: Sub-element at the given index.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+extern rs_element __attribute__((overloadable))
+    rsElementGetSubElement(rs_element e, uint32_t index);
+#endif
+
+/*
+ * rsElementGetSubElementArraySize: Array size of a sub-element of a complex Element
+ *
+ * For complex Elements, sub-elements can be statically sized arrays.  This function
+ * returns the array size of the sub-element at the index.  This sub-element repetition
+ * is different than fixed size vectors.
+ *
+ * Parameters:
+ *   e: Element to query.
+ *   index: Index of the sub-element.
+ *
+ * Returns: Array size of the sub-element.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+extern uint32_t __attribute__((overloadable))
+    rsElementGetSubElementArraySize(rs_element e, uint32_t index);
+#endif
+
+/*
+ * rsElementGetSubElementCount: Number of sub-elements
+ *
+ * Elements can be simple, such as an int or a float, or a structure with multiple
+ * sub-elements.  This function returns zero for simple Elements and the number of
+ * sub-elements for complex Elements.
+ *
+ * Parameters:
+ *   e: Element to get data from.
+ *
+ * Returns: Number of sub-elements.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+extern uint32_t __attribute__((overloadable))
+    rsElementGetSubElementCount(rs_element e);
+#endif
+
+/*
+ * rsElementGetSubElementName: Name of a sub-element
+ *
+ * For complex Elements, this function returns the name of the sub-element at the
+ * specified index.
+ *
+ * Parameters:
+ *   e: Element to get data from.
+ *   index: Index of the sub-element.
+ *   name: Address of the array to store the name into.
+ *   nameLength: Length of the provided name array.
+ *
+ * Returns: Number of characters copied, excluding the null terminator.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+extern uint32_t __attribute__((overloadable))
+    rsElementGetSubElementName(rs_element e, uint32_t index, char* name, uint32_t nameLength);
+#endif
+
+/*
+ * rsElementGetSubElementNameLength: Length of the name of a sub-element
+ *
+ * For complex Elements, this function returns the length of the name of the sub-element
+ * at the specified index.
+ *
+ * Parameters:
+ *   e: Element to get data from.
+ *   index: Index of the sub-element.
+ *
+ * Returns: Length of the sub-element name including the null terminator.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+extern uint32_t __attribute__((overloadable))
+    rsElementGetSubElementNameLength(rs_element e, uint32_t index);
+#endif
+
+/*
+ * rsElementGetSubElementOffsetBytes: Offset of the instantiated sub-element
+ *
+ * This function returns the relative position of the instantiation of the specified
+ * sub-element within the instantiation of the Element.
+ *
+ * For example, if the Element describes a 32 bit float followed by a 32 bit integer,
+ * the offset returned for the first will be 0 and for the second, 4.
+ *
+ * Parameters:
+ *   e: Element to get data from.
+ *   index: Index of the sub-element.
+ *
+ * Returns: Offset in bytes.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+extern uint32_t __attribute__((overloadable))
+    rsElementGetSubElementOffsetBytes(rs_element e, uint32_t index);
+#endif
+
+/*
+ * rsElementGetVectorSize: Vector size of the Element
+ *
+ * Returns the Element's vector size.  If the Element does not represent a vector,
+ * 1 is returned.
+ *
+ * Parameters:
+ *   e: Element to get data from.
+ *
+ * Returns: Length of the element vector.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+extern uint32_t __attribute__((overloadable))
+    rsElementGetVectorSize(rs_element e);
+#endif
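+
+/*
+ * Illustrative example (editor's sketch): walking the layout of a complex
+ * Element with the query functions above.  rsDebug() comes from rs_debug.rsh;
+ * the sub-element handle is released with rsClearObject() once inspected.
+ *
+ *   static void dumpElement(rs_element e) {
+ *       uint32_t count = rsElementGetSubElementCount(e);
+ *       if (count == 0) {
+ *           rsDebug("simple element, bytes", (int)rsElementGetBytesSize(e));
+ *           return;
+ *       }
+ *       for (uint32_t i = 0; i < count; i++) {
+ *           rs_element sub = rsElementGetSubElement(e, i);
+ *           char name[64];
+ *           rsElementGetSubElementName(e, i, name, sizeof(name));
+ *           rsDebug(name, (int)rsElementGetSubElementOffsetBytes(e, i));
+ *           rsClearObject(&sub);
+ *       }
+ *   }
+ */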
+
+/*
+ * rsGetAllocation: Return the Allocation for a given pointer
+ *
+ * DEPRECATED.  Do not use.
+ *
+ * Returns the Allocation for a given pointer.  The pointer should point within a valid
+ * allocation.  The results are undefined if the pointer is not from a valid Allocation.
+ */
+extern rs_allocation __attribute__((overloadable
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
+, deprecated("This function is deprecated and will be removed from the SDK in a future release.")
+#endif
+))
+    rsGetAllocation(const void* p);
+
+/*
+ * rsSamplerGetAnisotropy: Anisotropy of the Sampler
+ *
+ * Get the Sampler's anisotropy.
+ *
+ * See android.renderscript.Sampler.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+extern float __attribute__((overloadable))
+    rsSamplerGetAnisotropy(rs_sampler s);
+#endif
+
+/*
+ * rsSamplerGetMagnification: Sampler magnification value
+ *
+ * Get the Sampler's magnification value.
+ *
+ * See android.renderscript.Sampler.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+extern rs_sampler_value __attribute__((overloadable))
+    rsSamplerGetMagnification(rs_sampler s);
+#endif
+
+/*
+ * rsSamplerGetMinification: Sampler minification value
+ *
+ * Get the Sampler's minification value.
+ *
+ * See android.renderscript.Sampler.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+extern rs_sampler_value __attribute__((overloadable))
+    rsSamplerGetMinification(rs_sampler s);
+#endif
+
+/*
+ * rsSamplerGetWrapS: Sampler wrap S value
+ *
+ * Get the Sampler's wrap S value.
+ *
+ * See android.renderscript.Sampler.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+extern rs_sampler_value __attribute__((overloadable))
+    rsSamplerGetWrapS(rs_sampler s);
+#endif
+
+/*
+ * rsSamplerGetWrapT: Sampler wrap T value
+ *
+ * Get the sampler's wrap T value.
+ *
+ * See android.renderscript.Sampler.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+extern rs_sampler_value __attribute__((overloadable))
+    rsSamplerGetWrapT(rs_sampler s);
+#endif
+
+/*
+ * rsSetObject: For internal use.
+ *
+ */
+extern void __attribute__((overloadable))
+    rsSetObject(rs_element* dst, rs_element src);
+
+extern void __attribute__((overloadable))
+    rsSetObject(rs_type* dst, rs_type src);
+
+extern void __attribute__((overloadable))
+    rsSetObject(rs_allocation* dst, rs_allocation src);
+
+extern void __attribute__((overloadable))
+    rsSetObject(rs_sampler* dst, rs_sampler src);
+
+extern void __attribute__((overloadable))
+    rsSetObject(rs_script* dst, rs_script src);
+
+#endif // RENDERSCRIPT_RS_OBJECT_INFO_RSH
diff --git a/25.0.2/include/rs_object_types.rsh b/25.0.2/include/rs_object_types.rsh
new file mode 100644
index 0000000..e6511a5
--- /dev/null
+++ b/25.0.2/include/rs_object_types.rsh
@@ -0,0 +1,232 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Don't edit this file!  It is auto-generated by frameworks/rs/api/generate.sh.
+
+/*
+ * rs_object_types.rsh: Object Types
+ *
+ * The types below are used to manipulate RenderScript objects like allocations, samplers,
+ * elements, and scripts.  Most of these objects are created using the Java RenderScript APIs.
+ */
+
+#ifndef RENDERSCRIPT_RS_OBJECT_TYPES_RSH
+#define RENDERSCRIPT_RS_OBJECT_TYPES_RSH
+
+#define NULL ((void *)0)
+
+// Opaque handle to a RenderScript object. Do not use this directly.
+#ifndef __LP64__
+#define _RS_OBJECT_DECL \
+{\
+  const int* const p;\
+} __attribute__((packed, aligned(4)))
+#else
+#define _RS_OBJECT_DECL \
+{\
+  const long* const p;\
+  const long* const r;\
+  const long* const v1;\
+  const long* const v2;\
+}
+#endif
+
+/*
+ * rs_element: Handle to an element
+ *
+ * An opaque handle to a RenderScript element.
+ *
+ * See android.renderscript.Element.
+ */
+typedef struct rs_element _RS_OBJECT_DECL rs_element;
+
+/*
+ * rs_type: Handle to a Type
+ *
+ * An opaque handle to a RenderScript type.
+ *
+ * See android.renderscript.Type.
+ */
+typedef struct rs_type _RS_OBJECT_DECL rs_type;
+
+/*
+ * rs_allocation: Handle to an allocation
+ *
+ * An opaque handle to a RenderScript allocation.
+ *
+ * See android.renderscript.Allocation.
+ */
+typedef struct rs_allocation _RS_OBJECT_DECL rs_allocation;
+
+/*
+ * rs_sampler: Handle to a Sampler
+ *
+ * An opaque handle to a RenderScript sampler object.
+ *
+ * See android.renderscript.Sampler.
+ */
+typedef struct rs_sampler _RS_OBJECT_DECL rs_sampler;
+
+/*
+ * rs_script: Handle to a Script
+ *
+ * An opaque handle to a RenderScript script object.
+ *
+ * See android.renderscript.ScriptC.
+ */
+typedef struct rs_script _RS_OBJECT_DECL rs_script;
+
+/*
+ * rs_allocation_cubemap_face: Enum for selecting cube map faces
+ *
+ * An enum used to specify one of the six faces of a cubemap.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+typedef enum {
+    RS_ALLOCATION_CUBEMAP_FACE_POSITIVE_X = 0,
+    RS_ALLOCATION_CUBEMAP_FACE_NEGATIVE_X = 1,
+    RS_ALLOCATION_CUBEMAP_FACE_POSITIVE_Y = 2,
+    RS_ALLOCATION_CUBEMAP_FACE_NEGATIVE_Y = 3,
+    RS_ALLOCATION_CUBEMAP_FACE_POSITIVE_Z = 4,
+    RS_ALLOCATION_CUBEMAP_FACE_NEGATIVE_Z = 5
+} rs_allocation_cubemap_face;
+#endif
+
+/*
+ * rs_allocation_usage_type: Bitfield to specify how an allocation is used
+ *
+ * These values are ORed together to specify which usages or memory spaces are
+ * relevant to an allocation or an operation on an allocation.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+typedef enum {
+    RS_ALLOCATION_USAGE_SCRIPT = 0x0001, // Allocation is bound to and accessed by scripts.
+    RS_ALLOCATION_USAGE_GRAPHICS_TEXTURE = 0x0002, // Allocation is used as a texture source.
+    RS_ALLOCATION_USAGE_GRAPHICS_VERTEX = 0x0004, // Deprecated.
+    RS_ALLOCATION_USAGE_GRAPHICS_CONSTANTS = 0x0008, // Deprecated.
+    RS_ALLOCATION_USAGE_GRAPHICS_RENDER_TARGET = 0x0010, // Deprecated.
+    RS_ALLOCATION_USAGE_IO_INPUT = 0x0020, // Allocation is used as a Surface consumer.
+    RS_ALLOCATION_USAGE_IO_OUTPUT = 0x0040, // Allocation is used as a Surface producer.
+    RS_ALLOCATION_USAGE_SHARED = 0x0080 // Allocation's backing store is shared with another object (usually a Bitmap).  Copying to or from the original source Bitmap will cause a synchronization rather than a full copy.
+} rs_allocation_usage_type;
+#endif
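+
+/*
+ * Illustrative example (editor's sketch): usage bits are combined with a
+ * bitwise OR, e.g. for an Allocation that scripts access and that also feeds a
+ * Surface:
+ *
+ *   uint32_t usage = RS_ALLOCATION_USAGE_SCRIPT | RS_ALLOCATION_USAGE_IO_OUTPUT;
+ */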
+
+/*
+ * rs_data_type: Element basic data type
+ *
+ * rs_data_type is used to encode the type information of a basic element.
+ *
+ * RS_TYPE_UNSIGNED_5_6_5, RS_TYPE_UNSIGNED_5_5_5_1, and RS_TYPE_UNSIGNED_4_4_4_4 are packed
+ * graphical data formats; they represent vectors whose members have the indicated bit sizes
+ * and are treated as a single unit for packing and alignment purposes.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+typedef enum {
+    RS_TYPE_NONE = 0, // Element is a complex type, i.e. a struct.
+    RS_TYPE_FLOAT_16 = 1, // A 16 bit floating point value.
+    RS_TYPE_FLOAT_32 = 2, // A 32 bit floating point value.
+    RS_TYPE_FLOAT_64 = 3, // A 64 bit floating point value.
+    RS_TYPE_SIGNED_8 = 4, // An 8 bit signed integer.
+    RS_TYPE_SIGNED_16 = 5, // A 16 bit signed integer.
+    RS_TYPE_SIGNED_32 = 6, // A 32 bit signed integer.
+    RS_TYPE_SIGNED_64 = 7, // A 64 bit signed integer.
+    RS_TYPE_UNSIGNED_8 = 8, // An 8 bit unsigned integer.
+    RS_TYPE_UNSIGNED_16 = 9, // A 16 bit unsigned integer.
+    RS_TYPE_UNSIGNED_32 = 10, // A 32 bit unsigned integer.
+    RS_TYPE_UNSIGNED_64 = 11, // A 64 bit unsigned integer.
+    RS_TYPE_BOOLEAN = 12, // 0 or 1 (false or true) stored in an 8 bit container.
+    RS_TYPE_UNSIGNED_5_6_5 = 13, // A 16 bit unsigned integer packing graphical data in 5, 6, and 5 bit sections.
+    RS_TYPE_UNSIGNED_5_5_5_1 = 14, // A 16 bit unsigned integer packing graphical data in 5, 5, 5, and 1 bit sections.
+    RS_TYPE_UNSIGNED_4_4_4_4 = 15, // A 16 bit unsigned integer packing graphical data in 4, 4, 4, and 4 bit sections.
+    RS_TYPE_MATRIX_4X4 = 16, // A 4x4 matrix of 32 bit floats, aligned on a 32 bit boundary.
+    RS_TYPE_MATRIX_3X3 = 17, // A 3x3 matrix of 32 bit floats, aligned on a 32 bit boundary.
+    RS_TYPE_MATRIX_2X2 = 18, // A 2x2 matrix of 32 bit floats, aligned on a 32 bit boundary.
+    RS_TYPE_ELEMENT = 1000, // A handle to an Element.
+    RS_TYPE_TYPE = 1001, // A handle to a Type.
+    RS_TYPE_ALLOCATION = 1002, // A handle to an Allocation.
+    RS_TYPE_SAMPLER = 1003, // A handle to a Sampler.
+    RS_TYPE_SCRIPT = 1004, // A handle to a Script.
+    RS_TYPE_MESH = 1005, // Deprecated.
+    RS_TYPE_PROGRAM_FRAGMENT = 1006, // Deprecated.
+    RS_TYPE_PROGRAM_VERTEX = 1007, // Deprecated.
+    RS_TYPE_PROGRAM_RASTER = 1008, // Deprecated.
+    RS_TYPE_PROGRAM_STORE = 1009, // Deprecated.
+    RS_TYPE_FONT = 1010, // Deprecated.
+    RS_TYPE_INVALID = 10000
+} rs_data_type;
+#endif
+
+/*
+ * rs_data_kind: Element data kind
+ *
+ * This enumeration is primarily useful for graphical data.  It provides additional information to
+ * help interpret the rs_data_type.
+ *
+ * RS_KIND_USER indicates no special interpretation is expected.
+ *
+ * The RS_KIND_PIXEL_* values are used in conjunction with the standard data types for representing
+ * texture formats.
+ *
+ * See the Element.createPixel() method.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+typedef enum {
+    RS_KIND_USER         = 0, // No special interpretation.
+    RS_KIND_PIXEL_L      = 7, // Luminance.
+    RS_KIND_PIXEL_A      = 8, // Alpha.
+    RS_KIND_PIXEL_LA     = 9, // Luminance and Alpha.
+    RS_KIND_PIXEL_RGB    = 10, // Red, Green, Blue.
+    RS_KIND_PIXEL_RGBA   = 11, // Red, Green, Blue, and Alpha.
+    RS_KIND_PIXEL_DEPTH  = 12, // Depth for a depth texture.
+    RS_KIND_PIXEL_YUV    = 13, // Luminance and chrominance.
+    RS_KIND_INVALID      = 100
+} rs_data_kind;
+#endif
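+
+/*
+ * Illustrative example (editor's sketch): checking whether an Allocation holds
+ * RGBA pixels stored as four unsigned 8 bit channels, using the rsElement*
+ * query functions declared in rs_object_info.rsh.
+ *
+ *   static bool isRgba8888(rs_allocation a) {
+ *       rs_element e = rsAllocationGetElement(a);
+ *       return (rsElementGetDataKind(e) == RS_KIND_PIXEL_RGBA) &&
+ *              (rsElementGetDataType(e) == RS_TYPE_UNSIGNED_8) &&
+ *              (rsElementGetVectorSize(e) == 4);
+ *   }
+ */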
+
+/*
+ * rs_yuv_format: YUV format
+ *
+ *  Android YUV formats that can be associated with a RenderScript Type.
+ *
+ *  See android.graphics.ImageFormat for a description of each format.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+typedef enum {
+    RS_YUV_NONE = 0,
+    RS_YUV_YV12 = 0x32315659,
+    RS_YUV_NV21 = 0x11,
+    RS_YUV_420_888 = 0x23
+} rs_yuv_format;
+#endif
+
+/*
+ * rs_sampler_value: Sampler value
+ *
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 16))
+typedef enum {
+    RS_SAMPLER_NEAREST = 0,
+    RS_SAMPLER_LINEAR = 1,
+    RS_SAMPLER_LINEAR_MIP_LINEAR = 2,
+    RS_SAMPLER_WRAP = 3,
+    RS_SAMPLER_CLAMP = 4,
+    RS_SAMPLER_LINEAR_MIP_NEAREST = 5,
+    RS_SAMPLER_MIRRORED_REPEAT = 6,
+    RS_SAMPLER_INVALID = 100
+} rs_sampler_value;
+#endif
+
+#endif // RENDERSCRIPT_RS_OBJECT_TYPES_RSH
diff --git a/25.0.2/include/rs_quaternion.rsh b/25.0.2/include/rs_quaternion.rsh
new file mode 100644
index 0000000..55d33cf
--- /dev/null
+++ b/25.0.2/include/rs_quaternion.rsh
@@ -0,0 +1,374 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Don't edit this file!  It is auto-generated by frameworks/rs/api/generate.sh.
+
+/*
+ * rs_quaternion.rsh: Quaternion Functions
+ *
+ * The following functions manipulate quaternions.
+ */
+
+#ifndef RENDERSCRIPT_RS_QUATERNION_RSH
+#define RENDERSCRIPT_RS_QUATERNION_RSH
+
+/*
+ * rsQuaternionAdd: Add two quaternions
+ *
+ * Adds two quaternions, i.e. *q += *rhs;
+ *
+ * Parameters:
+ *   q: Destination quaternion to add to.
+ *   rhs: Quaternion to add.
+ */
+#if !defined(RS_VERSION) || (RS_VERSION <= 23)
+static inline void __attribute__((overloadable))
+    rsQuaternionAdd(rs_quaternion* q, const rs_quaternion* rhs) {
+    q->w += rhs->w;
+    q->x += rhs->x;
+    q->y += rhs->y;
+    q->z += rhs->z;
+}
+#endif
+
+/*
+ * rsQuaternionConjugate: Conjugate a quaternion
+ *
+ * Conjugates the quaternion.
+ *
+ * Parameters:
+ *   q: Quaternion to modify.
+ */
+#if !defined(RS_VERSION) || (RS_VERSION <= 23)
+static inline void __attribute__((overloadable))
+    rsQuaternionConjugate(rs_quaternion* q) {
+    q->x = -q->x;
+    q->y = -q->y;
+    q->z = -q->z;
+}
+#endif
+
+/*
+ * rsQuaternionDot: Dot product of two quaternions
+ *
+ * Returns the dot product of two quaternions.
+ *
+ * Parameters:
+ *   q0: First quaternion.
+ *   q1: Second quaternion.
+ */
+#if !defined(RS_VERSION) || (RS_VERSION <= 23)
+static inline float __attribute__((overloadable))
+    rsQuaternionDot(const rs_quaternion* q0, const rs_quaternion* q1) {
+    return q0->w*q1->w + q0->x*q1->x + q0->y*q1->y + q0->z*q1->z;
+}
+#endif
+
+/*
+ * rsQuaternionGetMatrixUnit: Get a rotation matrix from a quaternion
+ *
+ * Computes a rotation matrix from the normalized quaternion.
+ *
+ * Parameters:
+ *   m: Resulting matrix.
+ *   q: Normalized quaternion.
+ */
+#if !defined(RS_VERSION) || (RS_VERSION <= 23)
+static inline void __attribute__((overloadable))
+    rsQuaternionGetMatrixUnit(rs_matrix4x4* m, const rs_quaternion* q) {
+    float xx = q->x * q->x;
+    float xy = q->x * q->y;
+    float xz = q->x * q->z;
+    float xw = q->x * q->w;
+    float yy = q->y * q->y;
+    float yz = q->y * q->z;
+    float yw = q->y * q->w;
+    float zz = q->z * q->z;
+    float zw = q->z * q->w;
+
+    m->m[0]  = 1.0f - 2.0f * ( yy + zz );
+    m->m[4]  =        2.0f * ( xy - zw );
+    m->m[8]  =        2.0f * ( xz + yw );
+    m->m[1]  =        2.0f * ( xy + zw );
+    m->m[5]  = 1.0f - 2.0f * ( xx + zz );
+    m->m[9]  =        2.0f * ( yz - xw );
+    m->m[2]  =        2.0f * ( xz - yw );
+    m->m[6]  =        2.0f * ( yz + xw );
+    m->m[10] = 1.0f - 2.0f * ( xx + yy );
+    m->m[3]  = m->m[7] = m->m[11] = m->m[12] = m->m[13] = m->m[14] = 0.0f;
+    m->m[15] = 1.0f;
+}
+#endif
+
+/*
+ * rsQuaternionLoadRotateUnit: Quaternion that represents a rotation about an arbitrary unit vector
+ *
+ * Loads a quaternion that represents a rotation about an arbitrary unit vector.
+ *
+ * Parameters:
+ *   q: Destination quaternion.
+ *   rot: Angle to rotate by, in degrees (the implementation below scales by M_PI / 180).
+ *   x: X component of the vector.
+ *   y: Y component of the vector.
+ *   z: Z component of the vector.
+ */
+#if !defined(RS_VERSION) || (RS_VERSION <= 23)
+static inline void __attribute__((overloadable))
+    rsQuaternionLoadRotateUnit(rs_quaternion* q, float rot, float x, float y, float z) {
+    rot *= (float)(M_PI / 180.0f) * 0.5f;
+    float c = cos(rot);
+    float s = sin(rot);
+
+    q->w = c;
+    q->x = x * s;
+    q->y = y * s;
+    q->z = z * s;
+}
+#endif
+
+/*
+ * rsQuaternionSet: Create a quaternion
+ *
+ * Creates a quaternion from its four components or from another quaternion.
+ *
+ * Parameters:
+ *   q: Destination quaternion.
+ *   w: W component.
+ *   x: X component.
+ *   y: Y component.
+ *   z: Z component.
+ *   rhs: Source quaternion.
+ */
+#if !defined(RS_VERSION) || (RS_VERSION <= 23)
+static inline void __attribute__((overloadable))
+    rsQuaternionSet(rs_quaternion* q, float w, float x, float y, float z) {
+    q->w = w;
+    q->x = x;
+    q->y = y;
+    q->z = z;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 23)
+static inline void __attribute__((overloadable))
+    rsQuaternionSet(rs_quaternion* q, const rs_quaternion* rhs) {
+    q->w = rhs->w;
+    q->x = rhs->x;
+    q->y = rhs->y;
+    q->z = rhs->z;
+}
+#endif
+
+/*
+ * rsQuaternionLoadRotate: Create a rotation quaternion
+ *
+ * Loads a quaternion that represents a rotation about an arbitrary vector (it does
+ * not need to be a unit vector).
+ *
+ * Parameters:
+ *   q: Destination quaternion.
+ *   rot: Angle to rotate by.
+ *   x: X component of a vector.
+ *   y: Y component of a vector.
+ *   z: Z component of a vector.
+ */
+#if !defined(RS_VERSION) || (RS_VERSION <= 23)
+static inline void __attribute__((overloadable))
+    rsQuaternionLoadRotate(rs_quaternion* q, float rot, float x, float y, float z) {
+    const float len = x*x + y*y + z*z;
+    if (len != 1) {
+        const float recipLen = 1.f / sqrt(len);
+        x *= recipLen;
+        y *= recipLen;
+        z *= recipLen;
+    }
+    rsQuaternionLoadRotateUnit(q, rot, x, y, z);
+}
+#endif
+
+/*
+ * rsQuaternionNormalize: Normalize a quaternion
+ *
+ * Normalizes the quaternion.
+ *
+ * Parameters:
+ *   q: Quaternion to normalize.
+ */
+#if !defined(RS_VERSION) || (RS_VERSION <= 23)
+static inline void __attribute__((overloadable))
+    rsQuaternionNormalize(rs_quaternion* q) {
+    const float len = rsQuaternionDot(q, q);
+    if (len != 1) {
+        const float recipLen = 1.f / sqrt(len);
+        q->w *= recipLen;
+        q->x *= recipLen;
+        q->y *= recipLen;
+        q->z *= recipLen;
+    }
+}
+#endif
+
+/*
+ * rsQuaternionMultiply: Multiply a quaternion by a scalar or another quaternion
+ *
+ * Multiplies a quaternion by a scalar or by another quaternion, e.g.
+ * *q = *q * scalar; or *q = *q * *rhs;.
+ *
+ * Parameters:
+ *   q: Destination quaternion.
+ *   scalar: Scalar to multiply the quaternion by.
+ *   rhs: Quaternion to multiply the destination quaternion by.
+ */
+#if !defined(RS_VERSION) || (RS_VERSION <= 23)
+static inline void __attribute__((overloadable))
+    rsQuaternionMultiply(rs_quaternion* q, float scalar) {
+    q->w *= scalar;
+    q->x *= scalar;
+    q->y *= scalar;
+    q->z *= scalar;
+}
+#endif
+
+#if !defined(RS_VERSION) || (RS_VERSION <= 23)
+static inline void __attribute__((overloadable))
+    rsQuaternionMultiply(rs_quaternion* q, const rs_quaternion* rhs) {
+    rs_quaternion qtmp;
+    rsQuaternionSet(&qtmp, q);
+
+    q->w = qtmp.w*rhs->w - qtmp.x*rhs->x - qtmp.y*rhs->y - qtmp.z*rhs->z;
+    q->x = qtmp.w*rhs->x + qtmp.x*rhs->w + qtmp.y*rhs->z - qtmp.z*rhs->y;
+    q->y = qtmp.w*rhs->y + qtmp.y*rhs->w + qtmp.z*rhs->x - qtmp.x*rhs->z;
+    q->z = qtmp.w*rhs->z + qtmp.z*rhs->w + qtmp.x*rhs->y - qtmp.y*rhs->x;
+    rsQuaternionNormalize(q);
+}
+#endif
+
+/*
+ * rsQuaternionSlerp: Spherical linear interpolation between two quaternions
+ *
+ * Performs spherical linear interpolation between two quaternions.
+ *
+ * Parameters:
+ *   q: Result quaternion from the interpolation.
+ *   q0: First input quaternion.
+ *   q1: Second input quaternion.
+ *   t: Interpolation factor, clamped to the range [0.0, 1.0].
+ */
+#if !defined(RS_VERSION) || (RS_VERSION <= 23)
+static inline void __attribute__((overloadable))
+    rsQuaternionSlerp(rs_quaternion* q, const rs_quaternion* q0, const rs_quaternion* q1, float t) {
+    if (t <= 0.0f) {
+        rsQuaternionSet(q, q0);
+        return;
+    }
+    if (t >= 1.0f) {
+        rsQuaternionSet(q, q1);
+        return;
+    }
+
+    rs_quaternion tempq0, tempq1;
+    rsQuaternionSet(&tempq0, q0);
+    rsQuaternionSet(&tempq1, q1);
+
+    float angle = rsQuaternionDot(q0, q1);
+    if (angle < 0) {
+        rsQuaternionMultiply(&tempq0, -1.0f);
+        angle *= -1.0f;
+    }
+
+    float scale, invScale;
+    if (angle + 1.0f > 0.05f) {
+        if (1.0f - angle >= 0.05f) {
+            float theta = acos(angle);
+            float invSinTheta = 1.0f / sin(theta);
+            scale = sin(theta * (1.0f - t)) * invSinTheta;
+            invScale = sin(theta * t) * invSinTheta;
+        } else {
+            scale = 1.0f - t;
+            invScale = t;
+        }
+    } else {
+        rsQuaternionSet(&tempq1, tempq0.z, -tempq0.y, tempq0.x, -tempq0.w);
+        scale = sin(M_PI * (0.5f - t));
+        invScale = sin(M_PI * t);
+    }
+
+    rsQuaternionSet(q, tempq0.w*scale + tempq1.w*invScale, tempq0.x*scale + tempq1.x*invScale,
+                        tempq0.y*scale + tempq1.y*invScale, tempq0.z*scale + tempq1.z*invScale);
+}
+#endif
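+
+/*
+ * Illustrative example (editor's sketch): blending two rotations and turning
+ * the result into a matrix.  Note that the inline implementations above scale
+ * the angle by M_PI / 180, i.e. they expect degrees.
+ *
+ *   static void blendedRotation(rs_matrix4x4* out, float t) {
+ *       rs_quaternion qa, qb, q;
+ *       rsQuaternionLoadRotate(&qa, 30.0f, 0.0f, 0.0f, 1.0f);  // 30 deg about +Z
+ *       rsQuaternionLoadRotate(&qb, 90.0f, 0.0f, 1.0f, 0.0f);  // 90 deg about +Y
+ *       rsQuaternionSlerp(&q, &qa, &qb, t);
+ *       rsQuaternionGetMatrixUnit(out, &q);
+ *   }
+ */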
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern void __attribute__((overloadable))
+    rsQuaternionAdd(rs_quaternion* q, const rs_quaternion* rhs);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern void __attribute__((overloadable))
+    rsQuaternionConjugate(rs_quaternion* q);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern float __attribute__((overloadable))
+    rsQuaternionDot(const rs_quaternion* q0, const rs_quaternion* q1);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern void __attribute__((overloadable))
+    rsQuaternionGetMatrixUnit(rs_matrix4x4* m, const rs_quaternion* q);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern void __attribute__((overloadable))
+    rsQuaternionLoadRotateUnit(rs_quaternion* q, float rot, float x, float y, float z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern void __attribute__((overloadable))
+    rsQuaternionSet(rs_quaternion* q, float w, float x, float y, float z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern void __attribute__((overloadable))
+    rsQuaternionSet(rs_quaternion* q, const rs_quaternion* rhs);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern void __attribute__((overloadable))
+    rsQuaternionLoadRotate(rs_quaternion* q, float rot, float x, float y, float z);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern void __attribute__((overloadable))
+    rsQuaternionNormalize(rs_quaternion* q);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern void __attribute__((overloadable))
+    rsQuaternionMultiply(rs_quaternion* q, float scalar);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern void __attribute__((overloadable))
+    rsQuaternionMultiply(rs_quaternion* q, const rs_quaternion* rhs);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern void __attribute__((overloadable))
+    rsQuaternionSlerp(rs_quaternion* q, const rs_quaternion* q0, const rs_quaternion* q1, float t);
+#endif
+
+#endif // RENDERSCRIPT_RS_QUATERNION_RSH
diff --git a/25.0.2/include/rs_time.rsh b/25.0.2/include/rs_time.rsh
new file mode 100644
index 0000000..6c0eeb0
--- /dev/null
+++ b/25.0.2/include/rs_time.rsh
@@ -0,0 +1,126 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Don't edit this file!  It is auto-generated by frameworks/rs/api/generate.sh.
+
+/*
+ * rs_time.rsh: Time Functions and Types
+ *
+ * The functions below can be used to tell the current clock time and the current
+ * system uptime.  It is not recommended to call these functions inside a kernel.
+ */
+
+#ifndef RENDERSCRIPT_RS_TIME_RSH
+#define RENDERSCRIPT_RS_TIME_RSH
+
+/*
+ * rs_time_t: Seconds since January 1, 1970
+ *
+ * Calendar time interpreted as seconds elapsed since the Epoch (00:00:00 on
+ * January 1, 1970, Coordinated Universal Time (UTC)).
+ */
+#ifndef __LP64__
+typedef int rs_time_t;
+#endif
+
+#ifdef __LP64__
+typedef long rs_time_t;
+#endif
+
+/*
+ * rs_tm: Date and time structure
+ *
+ * Data structure for broken-down time components.
+ */
+typedef struct {
+    int tm_sec; // Seconds after the minute. This ranges from 0 to 59, but possibly up to 60 for leap seconds.
+    int tm_min; // Minutes after the hour. This ranges from 0 to 59.
+    int tm_hour; // Hours past midnight. This ranges from 0 to 23.
+    int tm_mday; // Day of the month. This ranges from 1 to 31.
+    int tm_mon; // Months since January. This ranges from 0 to 11.
+    int tm_year; // Years since 1900.
+    int tm_wday; // Days since Sunday. This ranges from 0 to 6.
+    int tm_yday; // Days since January 1. This ranges from 0 to 365.
+    int tm_isdst; // Flag to indicate whether daylight saving time is in effect. The value is positive if it is in effect, zero if it is not, and negative if the information is not available.
+} rs_tm;
+
+/*
+ * rsGetDt: Elapsed time since last call
+ *
+ * Returns the time in seconds since this function was last called in this script.
+ *
+ * Returns: Time in seconds.
+ */
+extern float __attribute__((overloadable))
+    rsGetDt(void);
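+
+/*
+ * Illustrative example (editor's sketch): advancing an animation by the time
+ * elapsed since the previous invocation.  gAngle and gSpeed are hypothetical
+ * script globals.
+ *
+ *   float gAngle;
+ *   float gSpeed;
+ *
+ *   void step() {
+ *       gAngle += gSpeed * rsGetDt();
+ *   }
+ */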
+
+/*
+ * rsLocaltime: Convert to local time
+ *
+ * Converts the time specified by timer into a rs_tm structure that provides year, month,
+ * hour, etc.  This value is stored at *local.
+ *
+ * This function returns the same pointer that is passed as the first argument.  If the
+ * local parameter is NULL, this function does nothing and returns NULL.
+ *
+ * Parameters:
+ *   local: Pointer to time structure where the local time will be stored.
+ *   timer: Input time as a number of seconds since January 1, 1970.
+ *
+ * Returns: Pointer to the output local time, i.e. the same value as the parameter local.
+ */
+extern rs_tm* __attribute__((overloadable))
+    rsLocaltime(rs_tm* local, const rs_time_t* timer);
+
+/*
+ * rsTime: Seconds since January 1, 1970
+ *
+ * Returns the number of seconds since the Epoch (00:00:00 UTC, January 1, 1970).
+ *
+ * If timer is non-NULL, the result is also stored in the memory pointed to by
+ * this variable.
+ *
+ * Parameters:
+ *   timer: Location to also store the returned calendar time.
+ *
+ * Returns: Seconds since the Epoch, -1 if there's an error.
+ */
+extern rs_time_t __attribute__((overloadable))
+    rsTime(rs_time_t* timer);
+
+/*
+ * rsUptimeMillis: System uptime in milliseconds
+ *
+ * Returns the current system clock (uptime) in milliseconds.
+ *
+ * Returns: Uptime in milliseconds.
+ */
+extern int64_t __attribute__((overloadable))
+    rsUptimeMillis(void);
+
+/*
+ * rsUptimeNanos: System uptime in nanoseconds
+ *
+ * Returns the current system clock (uptime) in nanoseconds.
+ *
+ * The granularity of the values returned by this call may be much larger than a nanosecond.
+ *
+ * Returns: Uptime in nanoseconds.
+ */
+extern int64_t __attribute__((overloadable))
+    rsUptimeNanos(void);
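+
+/*
+ * Illustrative example (editor's sketch): timing a block of work and logging
+ * the current calendar year.  rsDebug() is declared in rs_debug.rsh.
+ *
+ *   void logWork() {
+ *       int64_t start = rsUptimeMillis();
+ *       // ... do the work to be measured ...
+ *       rsDebug("elapsed ms", (int)(rsUptimeMillis() - start));
+ *
+ *       rs_time_t now = rsTime(NULL);
+ *       rs_tm tm;
+ *       rsLocaltime(&tm, &now);
+ *       rsDebug("year", tm.tm_year + 1900);
+ *   }
+ */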
+
+#endif // RENDERSCRIPT_RS_TIME_RSH
diff --git a/25.0.2/include/rs_value_types.rsh b/25.0.2/include/rs_value_types.rsh
new file mode 100644
index 0000000..180b297
--- /dev/null
+++ b/25.0.2/include/rs_value_types.rsh
@@ -0,0 +1,543 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Don't edit this file!  It is auto-generated by frameworks/rs/api/generate.sh.
+
+/*
+ * rs_value_types.rsh: Numerical Types
+ *
+ * Scalars:
+ *
+ * RenderScript supports the following scalar numerical types:
+ *
+ *                    8 bits           16 bits            32 bits          64 bits
+ * Integer:           char, int8_t     short, int16_t     int32_t          long, long long, int64_t
+ * Unsigned integer:  uchar, uint8_t   ushort, uint16_t   uint, uint32_t   ulong, uint64_t
+ * Floating point:                     half               float            double
+ *
+ *
+ * Vectors:
+ *
+ * RenderScript supports fixed size vectors of length 2, 3, and 4.
+ * Vectors are declared using the common type name followed by a 2, 3, or 4.
+ * E.g. float4, int3, double2, ulong4.
+ *
+ * To create vector literals, use the vector type followed by the values enclosed
+ * between curly braces, e.g. (float3){1.0f, 2.0f, 3.0f}.
+ *
+ * Entries of a vector can be accessed using different naming styles.
+ *
+ * Single entries can be accessed by following the variable name with a dot and:
+ * - The letters x, y, z, and w,
+ * - The letters r, g, b, and a,
+ * - The letter s or S, followed by a zero based index.
+ *
+ * For example, with int4 myVar; the following are equivalent:
+ *   myVar.x == myVar.r == myVar.s0 == myVar.S0
+ *   myVar.y == myVar.g == myVar.s1 == myVar.S1
+ *   myVar.z == myVar.b == myVar.s2 == myVar.S2
+ *   myVar.w == myVar.a == myVar.s3 == myVar.S3
+ *
+ * Multiple entries of a vector can be accessed at once by using an identifier that is
+ * the concatenation of multiple letters or indices.  The resulting vector has a size
+ * equal to the number of entries named.
+ *
+ * With the example above, the middle two entries can be accessed using
+ * myVar.yz, myVar.gb, myVar.s12, and myVar.S12.
+ *
+ * The entries don't have to be contiguous or in increasing order.  Entries can even be
+ * repeated, as long as we're not trying to assign to them.  You also can't mix the naming
+ * styles.
+ *
+ * Here are examples of what can or can't be done:
+ * float4 v4;
+ * float3 v3;
+ * float2 v2;
+ * v2 = v4.xx; // Valid
+ * v3 = v4.zxw; // Valid
+ * v3 = v4.bba; // Valid
+ * v3 = v4.s032; // Valid
+ * v3.s120 = v4.S233; // Valid
+ * v4.yz = v3.rg; // Valid
+ * v4.yzx = v3.rg; // Invalid: mismatched sizes
+ * v4.yzz = v3; // Invalid: z appears twice in an assignment
+ * v3 = v3.xas0; // Invalid: can't mix xyzw with rgba nor s0...
+ * v3 = v4.s034; // Invalid: the digit can only be 0, 1, 2, or 3
+ *
+ *
+ * Matrices and Quaternions:
+ *
+ * RenderScript supports fixed size square matrices of floats of size 2x2, 3x3, and 4x4.
+ * The types are named rs_matrix2x2, rs_matrix3x3, and rs_matrix4x4.  See
+ * Matrix Functions for the list of operations.
+ *
+ * Quaternions are also supported via rs_quaternion.  See Quaternion Functions for the list
+ * of operations.
+ */
+
+#ifndef RENDERSCRIPT_RS_VALUE_TYPES_RSH
+#define RENDERSCRIPT_RS_VALUE_TYPES_RSH
+
+/*
+ * half: 16 bit floating point value
+ *
+ * A 16 bit floating point value.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+typedef __fp16 half;
+#endif
+
+/*
+ * half2: Two 16 bit floats
+ *
+ * Vector version of the half float type. Provides two half fields packed
+ * into a single 32 bit field with 32 bit alignment.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+typedef half __attribute__((ext_vector_type(2))) half2;
+#endif
+
+/*
+ * half3: Three 16 bit floats
+ *
+ * Vector version of the half float type. Provides three half fields packed
+ * into a single 64 bit field with 64 bit alignment.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+typedef half __attribute__((ext_vector_type(3))) half3;
+#endif
+
+/*
+ * half4: Four 16 bit floats
+ *
+ * Vector version of the half float type. Provides four half fields packed
+ * into a single 64 bit field with 64 bit alignment.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 23))
+typedef half __attribute__((ext_vector_type(4))) half4;
+#endif
+
+/*
+ * int8_t: 8 bit signed integer
+ *
+ * 8 bit signed integer type.
+ */
+typedef char int8_t;
+
+/*
+ * int16_t: 16 bit signed integer
+ *
+ * A 16 bit signed integer type.
+ */
+typedef short int16_t;
+
+/*
+ * int32_t: 32 bit signed integer
+ *
+ * A 32 bit signed integer type.
+ */
+typedef int int32_t;
+
+/*
+ * int64_t: 64 bit signed integer
+ *
+ * A 64 bit signed integer type.
+ */
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+typedef long long int64_t;
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+typedef long int64_t;
+#endif
+
+/*
+ * uint8_t: 8 bit unsigned integer
+ *
+ * 8 bit unsigned integer type.
+ */
+typedef unsigned char uint8_t;
+
+/*
+ * uint16_t: 16 bit unsigned integer
+ *
+ * A 16 bit unsigned integer type.
+ */
+typedef unsigned short uint16_t;
+
+/*
+ * uint32_t: 32 bit unsigned integer
+ *
+ * A 32 bit unsigned integer type.
+ */
+typedef unsigned int uint32_t;
+
+/*
+ * uint64_t: 64 bit unsigned integer
+ *
+ * A 64 bit unsigned integer type.
+ */
+#if !defined(RS_VERSION) || (RS_VERSION <= 20)
+typedef unsigned long long uint64_t;
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+typedef unsigned long uint64_t;
+#endif
+
+/*
+ * uchar: 8 bit unsigned integer
+ *
+ * 8 bit unsigned integer type.
+ */
+typedef uint8_t uchar;
+
+/*
+ * ushort: 16 bit unsigned integer
+ *
+ * A 16 bit unsigned integer type.
+ */
+typedef uint16_t ushort;
+
+/*
+ * uint: 32 bit unsigned integer
+ *
+ * A 32 bit unsigned integer type.
+ */
+typedef uint32_t uint;
+
+/*
+ * ulong: 64 bit unsigned integer
+ *
+ * A 64 bit unsigned integer type.
+ */
+typedef uint64_t ulong;
+
+/*
+ * size_t: Unsigned size type
+ *
+ * Unsigned size type.  The number of bits depends on the compilation flags.
+ */
+#ifdef __LP64__
+typedef uint64_t size_t;
+#endif
+
+#ifndef __LP64__
+typedef uint32_t size_t;
+#endif
+
+/*
+ * ssize_t: Signed size type
+ *
+ * Signed size type.  The number of bits depends on the compilation flags.
+ */
+#ifdef __LP64__
+typedef int64_t ssize_t;
+#endif
+
+#ifndef __LP64__
+typedef int32_t ssize_t;
+#endif
+
+/*
+ * float2: Two 32 bit floats
+ *
+ * A vector of two floats.  These two floats are packed into a single 64 bit field
+ * with a 64 bit alignment.
+ */
+typedef float __attribute__((ext_vector_type(2))) float2;
+
+/*
+ * float3: Three 32 bit floats
+ *
+ * A vector of three floats.  These three floats are packed into a single 128 bit field
+ * with a 128 bit alignment.
+ */
+typedef float __attribute__((ext_vector_type(3))) float3;
+
+/*
+ * float4: Four 32 bit floats
+ *
+ * A vector of four floats type.  These four floats are packed into a single 128 bit field
+ * with a 128 bit alignment.
+ */
+typedef float __attribute__((ext_vector_type(4))) float4;
+
+/*
+ * double2: Two 64 bit floats
+ *
+ * A vector of two doubles.  These two doubles are packed into a single 128 bit field
+ * with a 128 bit alignment.
+ */
+typedef double __attribute__((ext_vector_type(2))) double2;
+
+/*
+ * double3: Three 64 bit floats
+ *
+ * A vector of three doubles.  These three doubles are packed into a single 256 bit field
+ * with a 256 bit alignment.
+ */
+typedef double __attribute__((ext_vector_type(3))) double3;
+
+/*
+ * double4: Four 64 bit floats
+ *
+ * A vector of four doubles.  These four doubles are packed into a single 256 bit field
+ * with a 256 bit alignment.
+ */
+typedef double __attribute__((ext_vector_type(4))) double4;
+
+/*
+ * uchar2: Two 8 bit unsigned integers
+ *
+ * A vector of two uchars.  These two uchars are packed into a single 16 bit field
+ * with a 16 bit alignment.
+ */
+typedef uchar __attribute__((ext_vector_type(2))) uchar2;
+
+/*
+ * uchar3: Three 8 bit unsigned integers
+ *
+ * A vector of three uchars.  These three uchars are packed into a single 32 bit field
+ * with a 32 bit alignment.
+ */
+typedef uchar __attribute__((ext_vector_type(3))) uchar3;
+
+/*
+ * uchar4: Four 8 bit unsigned integers
+ *
+ * A vector of four uchars.  These four uchars are packed into a single 32 bit field
+ * with a 32 bit alignment.
+ */
+typedef uchar __attribute__((ext_vector_type(4))) uchar4;
+
+/*
+ * ushort2: Two 16 bit unsigned integers
+ *
+ * A vector of two ushorts.  These two ushorts are packed into a single 32 bit field
+ * with a 32 bit alignment.
+ */
+typedef ushort __attribute__((ext_vector_type(2))) ushort2;
+
+/*
+ * ushort3: Three 16 bit unsigned integers
+ *
+ * A vector of three ushorts.  These three ushorts are packed into a single 64 bit field
+ * with a 64 bit alignment.
+ */
+typedef ushort __attribute__((ext_vector_type(3))) ushort3;
+
+/*
+ * ushort4: Four 16 bit unsigned integers
+ *
+ * A vector of four ushorts.  These four ushorts are packed into a single 64 bit field
+ * with a 64 bit alignment.
+ */
+typedef ushort __attribute__((ext_vector_type(4))) ushort4;
+
+/*
+ * uint2: Two 32 bit unsigned integers
+ *
+ * A vector of two uints.  These two uints are packed into a single 64 bit field
+ * with a 64 bit alignment.
+ */
+typedef uint __attribute__((ext_vector_type(2))) uint2;
+
+/*
+ * uint3: Three 32 bit unsigned integers
+ *
+ * A vector of three uints.  These three uints are packed into a single 128 bit field
+ * with a 128 bit alignment.
+ */
+typedef uint __attribute__((ext_vector_type(3))) uint3;
+
+/*
+ * uint4: Four 32 bit unsigned integers
+ *
+ * A vector of four uints.  These four uints are packed into a single 128 bit field
+ * with a 128 bit alignment.
+ */
+typedef uint __attribute__((ext_vector_type(4))) uint4;
+
+/*
+ * ulong2: Two 64 bit unsigned integers
+ *
+ * A vector of two ulongs.  These two ulongs are packed into a single 128 bit field
+ * with a 128 bit alignment.
+ */
+typedef ulong __attribute__((ext_vector_type(2))) ulong2;
+
+/*
+ * ulong3: Three 64 bit unsigned integers
+ *
+ * A vector of three ulongs.  These three ulongs are packed into a single 256 bit field
+ * with a 256 bit alignment.
+ */
+typedef ulong __attribute__((ext_vector_type(3))) ulong3;
+
+/*
+ * ulong4: Four 64 bit unsigned integers
+ *
+ * A vector of four ulongs.  These four ulongs are packed into a single 256 bit field
+ * with a 256 bit alignment.
+ */
+typedef ulong __attribute__((ext_vector_type(4))) ulong4;
+
+/*
+ * char2: Two 8 bit signed integers
+ *
+ * A vector of two chars.  These two chars are packed into a single 16 bit field
+ * with a 16 bit alignment.
+ */
+typedef char __attribute__((ext_vector_type(2))) char2;
+
+/*
+ * char3: Three 8 bit signed integers
+ *
+ * A vector of three chars.  These three chars are packed into a single 32 bit field
+ * with a 32 bit alignment.
+ */
+typedef char __attribute__((ext_vector_type(3))) char3;
+
+/*
+ * char4: Four 8 bit signed integers
+ *
+ * A vector of four chars.  These four chars are packed into a single 32 bit field
+ * with a 32 bit alignment.
+ */
+typedef char __attribute__((ext_vector_type(4))) char4;
+
+/*
+ * short2: Two 16 bit signed integers
+ *
+ * A vector of two shorts.  These two shorts are packed into a single 32 bit field
+ * with a 32 bit alignment.
+ */
+typedef short __attribute__((ext_vector_type(2))) short2;
+
+/*
+ * short3: Three 16 bit signed integers
+ *
+ * A vector of three shorts.  These three shorts are packed into a single 64 bit field
+ * with a 64 bit alignment.
+ */
+typedef short __attribute__((ext_vector_type(3))) short3;
+
+/*
+ * short4: Four 16 bit signed integers
+ *
+ * A vector of four shorts.  These four shorts are packed into a single 64 bit field
+ * with a 64 bit alignment.
+ */
+typedef short __attribute__((ext_vector_type(4))) short4;
+
+/*
+ * int2: Two 32 bit signed integers
+ *
+ * A vector of two ints.  These two ints are packed into a single 64 bit field
+ * with a 64 bit alignment.
+ */
+typedef int __attribute__((ext_vector_type(2))) int2;
+
+/*
+ * int3: Three 32 bit signed integers
+ *
+ * A vector of three ints.  These three ints are packed into a single 128 bit field
+ * with a 128 bit alignment.
+ */
+typedef int __attribute__((ext_vector_type(3))) int3;
+
+/*
+ * int4: Four 32 bit signed integers
+ *
+ * A vector of four ints.  These four ints are packed into a single 128 bit field
+ * with a 128 bit alignment.
+ */
+typedef int __attribute__((ext_vector_type(4))) int4;
+
+/*
+ * long2: Two 64 bit signed integers
+ *
+ * A vector of two longs.  These two longs are packed into a single 128 bit field
+ * with a 128 bit alignment.
+ */
+typedef long __attribute__((ext_vector_type(2))) long2;
+
+/*
+ * long3: Three 64 bit signed integers
+ *
+ * A vector of three longs.  These three longs are packed into a single 256 bit field
+ * with a 256 bit alignment.
+ */
+typedef long __attribute__((ext_vector_type(3))) long3;
+
+/*
+ * long4: Four 64 bit signed integers
+ *
+ * A vector of four longs.  These four longs are packed into a single 256 bit field
+ * with a 256 bit alignment.
+ */
+typedef long __attribute__((ext_vector_type(4))) long4;
+
+/*
+ * rs_matrix2x2: 2x2 matrix of 32 bit floats
+ *
+ * A square 2x2 matrix of floats.  The entries are stored in the array at the
+ * location [row*2 + col].
+ *
+ * See Matrix Functions.
+ */
+typedef struct {
+    float m[4];
+} rs_matrix2x2;
+
+/*
+ * rs_matrix3x3: 3x3 matrix of 32 bit floats
+ *
+ * A square 3x3 matrix of floats.  The entries are stored in the array at the
+ * location [row*3 + col].
+ *
+ * See Matrix Functions.
+ */
+typedef struct {
+    float m[9];
+} rs_matrix3x3;
+
+/*
+ * rs_matrix4x4: 4x4 matrix of 32 bit floats
+ *
+ * A square 4x4 matrix of floats.  The entries are stored in the array at the
+ * location [row*4 + col].
+ *
+ * See Matrix Functions.
+ */
+typedef struct {
+    float m[16];
+} rs_matrix4x4;
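+
+/*
+ * Illustrative example (editor's sketch): per the storage rule stated above,
+ * entry (row, col) of an rs_matrix4x4 lives at m[row * 4 + col].  The
+ * rsMatrixGet()/rsMatrixSet() helpers in rs_matrix.rsh are the supported
+ * accessors.
+ *
+ *   static float entryAt(const rs_matrix4x4* m, uint32_t row, uint32_t col) {
+ *       return m->m[row * 4 + col];
+ *   }
+ */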
+
+/*
+ * rs_quaternion: Quaternion
+ *
+ * A vector of four 32 bit floats that represents a quaternion.
+ *
+ * See Quaternion Functions.
+ */
+typedef float4 rs_quaternion;
+
+#endif // RENDERSCRIPT_RS_VALUE_TYPES_RSH
diff --git a/25.0.2/include/rs_vector_math.rsh b/25.0.2/include/rs_vector_math.rsh
new file mode 100644
index 0000000..2f5e8e7
--- /dev/null
+++ b/25.0.2/include/rs_vector_math.rsh
@@ -0,0 +1,453 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Don't edit this file!  It is auto-generated by frameworks/rs/api/generate.sh.
+
+/*
+ * rs_vector_math.rsh: Vector Math Functions
+ *
+ * These functions interpret the input arguments as representation of vectors in
+ * n-dimensional space.
+ *
+ * The precision of the mathematical operations on 32 bit floats is affected by the pragmas
+ * rs_fp_relaxed and rs_fp_full.  See Mathematical Constants and Functions for details.
+ *
+ * Different precision/speed tradeoffs can be achieved by using variants of the common math
+ * functions.  Functions with a name starting with
+ * - native_: May have custom hardware implementations with weaker precision.  Additionally,
+ *   subnormal values may be flushed to zero, rounding towards zero may be used, and NaN and
+ *   infinity input may not be handled correctly.
+ * - fast_: May perform internal computations using 16 bit floats.  Additionally, subnormal
+ *   values may be flushed to zero, and rounding towards zero may be used.
+ *
+ */
+
+#ifndef RENDERSCRIPT_RS_VECTOR_MATH_RSH
+#define RENDERSCRIPT_RS_VECTOR_MATH_RSH
+
+/*
+ * cross: Cross product of two vectors
+ *
+ * Computes the cross product of two vectors.
+ */
+extern float3 __attribute__((const, overloadable))
+    cross(float3 left_vector, float3 right_vector);
+
+extern float4 __attribute__((const, overloadable))
+    cross(float4 left_vector, float4 right_vector);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    cross(half3 left_vector, half3 right_vector);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    cross(half4 left_vector, half4 right_vector);
+#endif
+
+/*
+ * distance: Distance between two points
+ *
+ * Computes the distance between two points.
+ *
+ * See also fast_distance(), native_distance().
+ */
+extern float __attribute__((const, overloadable))
+    distance(float left_vector, float right_vector);
+
+extern float __attribute__((const, overloadable))
+    distance(float2 left_vector, float2 right_vector);
+
+extern float __attribute__((const, overloadable))
+    distance(float3 left_vector, float3 right_vector);
+
+extern float __attribute__((const, overloadable))
+    distance(float4 left_vector, float4 right_vector);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    distance(half left_vector, half right_vector);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    distance(half2 left_vector, half2 right_vector);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    distance(half3 left_vector, half3 right_vector);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    distance(half4 left_vector, half4 right_vector);
+#endif
+
+/*
+ * dot: Dot product of two vectors
+ *
+ * Computes the dot product of two vectors.
+ */
+extern float __attribute__((const, overloadable))
+    dot(float left_vector, float right_vector);
+
+extern float __attribute__((const, overloadable))
+    dot(float2 left_vector, float2 right_vector);
+
+extern float __attribute__((const, overloadable))
+    dot(float3 left_vector, float3 right_vector);
+
+extern float __attribute__((const, overloadable))
+    dot(float4 left_vector, float4 right_vector);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    dot(half left_vector, half right_vector);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    dot(half2 left_vector, half2 right_vector);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    dot(half3 left_vector, half3 right_vector);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    dot(half4 left_vector, half4 right_vector);
+#endif
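+
+/*
+ * Illustrative example (editor's sketch): the angle between two vectors,
+ * computed from the dot product and the vector lengths.  acos() and clamp()
+ * are declared in the math headers; the clamp guards against rounding pushing
+ * the cosine slightly outside [-1, 1].
+ *
+ *   static float angleBetween(float3 a, float3 b) {
+ *       float c = dot(a, b) / (length(a) * length(b));
+ *       return acos(clamp(c, -1.0f, 1.0f));
+ *   }
+ */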
+
+/*
+ * fast_distance: Approximate distance between two points
+ *
+ * Computes the approximate distance between two points.
+ *
+ * The precision is what would be expected from doing the computation using 16 bit floating
+ * point values.
+ *
+ * See also distance(), native_distance().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float __attribute__((const, overloadable))
+    fast_distance(float left_vector, float right_vector);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float __attribute__((const, overloadable))
+    fast_distance(float2 left_vector, float2 right_vector);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float __attribute__((const, overloadable))
+    fast_distance(float3 left_vector, float3 right_vector);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float __attribute__((const, overloadable))
+    fast_distance(float4 left_vector, float4 right_vector);
+#endif
+
+/*
+ * fast_length: Approximate length of a vector
+ *
+ * Computes the approximate length of a vector.
+ *
+ * The precision is what would be expected from doing the computation using 16 bit floating
+ * point values.
+ *
+ * See also length(), native_length().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float __attribute__((const, overloadable))
+    fast_length(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float __attribute__((const, overloadable))
+    fast_length(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float __attribute__((const, overloadable))
+    fast_length(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float __attribute__((const, overloadable))
+    fast_length(float4 v);
+#endif
+
+/*
+ * fast_normalize: Approximate normalized vector
+ *
+ * Approximately normalizes a vector.
+ *
+ * For vectors of size 1, returns -1.f for negative values, 0.f for null values, and 1.f for
+ * positive values.
+ *
+ * The precision is what would be expected from doing the computation using 16 bit floating
+ * point values.
+ *
+ * See also normalize(), native_normalize().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float __attribute__((const, overloadable))
+    fast_normalize(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float2 __attribute__((const, overloadable))
+    fast_normalize(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float3 __attribute__((const, overloadable))
+    fast_normalize(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 17))
+extern float4 __attribute__((const, overloadable))
+    fast_normalize(float4 v);
+#endif
+
+/*
+ * length: Length of a vector
+ *
+ * Computes the length of a vector.
+ *
+ * See also fast_length(), native_length().
+ */
+extern float __attribute__((const, overloadable))
+    length(float v);
+
+extern float __attribute__((const, overloadable))
+    length(float2 v);
+
+extern float __attribute__((const, overloadable))
+    length(float3 v);
+
+extern float __attribute__((const, overloadable))
+    length(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    length(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    length(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    length(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    length(half4 v);
+#endif
+
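+/*
+ * A sketch for length() (v is a hypothetical value): the float overloads are
+ * always available, while the half overloads above require RS_VERSION >= 24.
+ *
+ *   float4 v = {1.f, 1.f, 1.f, 1.f};
+ *   float len = length(v);  // 2.f
+ */
+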
+/*
+ * native_distance: Approximate distance between two points
+ *
+ * Computes the approximate distance between two points.
+ *
+ * See also distance(), fast_distance().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_distance(float left_vector, float right_vector);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_distance(float2 left_vector, float2 right_vector);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_distance(float3 left_vector, float3 right_vector);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_distance(float4 left_vector, float4 right_vector);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_distance(half left_vector, half right_vector);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_distance(half2 left_vector, half2 right_vector);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_distance(half3 left_vector, half3 right_vector);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_distance(half4 left_vector, half4 right_vector);
+#endif
+
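+/*
+ * A sketch contrasting the native_ variant (p0 and p1 are hypothetical
+ * points): native_distance() favors speed over precision, so exact results
+ * should not be relied upon; it requires RS_VERSION >= 21 (>= 24 for half).
+ *
+ *   float approx = native_distance(p0, p1);  // fastest, lowest precision
+ *   float exact  = distance(p0, p1);         // full float precision
+ */
+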
+/*
+ * native_length: Approximate length of a vector
+ *
+ * Computes the approximate length of a vector.
+ *
+ * See also length(), fast_length().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_length(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_length(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_length(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_length(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_length(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_length(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_length(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_length(half4 v);
+#endif
+
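+/*
+ * A sketch for native_length() (v is a hypothetical value): like the other
+ * native_ functions it favors speed over precision; the half overloads above
+ * additionally require RS_VERSION >= 24.
+ *
+ *   float3 v = {2.f, 3.f, 6.f};
+ *   float len = native_length(v);  // close to 7.f
+ */
+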
+/*
+ * native_normalize: Approximately normalize a vector
+ *
+ * Approximately normalizes a vector.
+ *
+ * See also normalize(), fast_normalize().
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float __attribute__((const, overloadable))
+    native_normalize(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float2 __attribute__((const, overloadable))
+    native_normalize(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float3 __attribute__((const, overloadable))
+    native_normalize(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+extern float4 __attribute__((const, overloadable))
+    native_normalize(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    native_normalize(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    native_normalize(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    native_normalize(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    native_normalize(half4 v);
+#endif
+
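+/*
+ * A brief sketch for native_normalize() (v is a hypothetical value): the
+ * result is only approximately unit length, so code that depends on
+ * length(result) being exactly 1.f should use normalize() instead.
+ *
+ *   float3 v = {0.f, 0.f, 9.f};
+ *   float3 n = native_normalize(v);  // approximately {0.f, 0.f, 1.f}
+ */
+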
+/*
+ * normalize: Normalize a vector
+ *
+ * Normalizes a vector.
+ *
+ * For vectors of size 1, returns -1.f for negative values, 0.f for null values, and 1.f for
+ * positive values.
+ *
+ * See also fast_normalize(), native_normalize().
+ */
+extern float __attribute__((const, overloadable))
+    normalize(float v);
+
+extern float2 __attribute__((const, overloadable))
+    normalize(float2 v);
+
+extern float3 __attribute__((const, overloadable))
+    normalize(float3 v);
+
+extern float4 __attribute__((const, overloadable))
+    normalize(float4 v);
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+    normalize(half v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+    normalize(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+    normalize(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+    normalize(half4 v);
+#endif
+
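+/*
+ * A closing sketch for normalize() (v is a hypothetical value): for a
+ * non-zero vector the result is the input scaled to unit length, equivalent
+ * to v / length(v), computed at full float precision.
+ *
+ *   float2 v = {0.f, -8.f};
+ *   float2 unit = normalize(v);  // {0.f, -1.f}
+ */
+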
+#endif // RENDERSCRIPT_RS_VECTOR_MATH_RSH
diff --git a/25.0.2/lib64/libLLVM.so b/25.0.2/lib64/libLLVM.so
new file mode 100755
index 0000000..26785a1
--- /dev/null
+++ b/25.0.2/lib64/libLLVM.so
Binary files differ
diff --git a/25.0.2/lib64/libc++.so b/25.0.2/lib64/libc++.so
new file mode 100755
index 0000000..fa84063
--- /dev/null
+++ b/25.0.2/lib64/libc++.so
Binary files differ
diff --git a/25.0.2/lib64/libclang.so b/25.0.2/lib64/libclang.so
new file mode 100755
index 0000000..3a59df1
--- /dev/null
+++ b/25.0.2/lib64/libclang.so
Binary files differ